xref: /freebsd-src/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (revision a324c34037ef2e1101962fca4ad0c021253288e1)
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/STLExtras.h"
69 #include "llvm/ADT/SmallPtrSet.h"
70 #include "llvm/ADT/SmallSet.h"
71 #include "llvm/ADT/SmallVector.h"
72 #include "llvm/ADT/Statistic.h"
73 #include "llvm/ADT/StringRef.h"
74 #include "llvm/ADT/Twine.h"
75 #include "llvm/ADT/iterator_range.h"
76 #include "llvm/Analysis/AssumptionCache.h"
77 #include "llvm/Analysis/BasicAliasAnalysis.h"
78 #include "llvm/Analysis/BlockFrequencyInfo.h"
79 #include "llvm/Analysis/CFG.h"
80 #include "llvm/Analysis/CodeMetrics.h"
81 #include "llvm/Analysis/DemandedBits.h"
82 #include "llvm/Analysis/GlobalsModRef.h"
83 #include "llvm/Analysis/LoopAccessAnalysis.h"
84 #include "llvm/Analysis/LoopAnalysisManager.h"
85 #include "llvm/Analysis/LoopInfo.h"
86 #include "llvm/Analysis/LoopIterator.h"
87 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
88 #include "llvm/Analysis/ProfileSummaryInfo.h"
89 #include "llvm/Analysis/ScalarEvolution.h"
90 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
91 #include "llvm/Analysis/TargetLibraryInfo.h"
92 #include "llvm/Analysis/TargetTransformInfo.h"
93 #include "llvm/Analysis/ValueTracking.h"
94 #include "llvm/Analysis/VectorUtils.h"
95 #include "llvm/IR/Attributes.h"
96 #include "llvm/IR/BasicBlock.h"
97 #include "llvm/IR/CFG.h"
98 #include "llvm/IR/Constant.h"
99 #include "llvm/IR/Constants.h"
100 #include "llvm/IR/DataLayout.h"
101 #include "llvm/IR/DebugInfoMetadata.h"
102 #include "llvm/IR/DebugLoc.h"
103 #include "llvm/IR/DerivedTypes.h"
104 #include "llvm/IR/DiagnosticInfo.h"
105 #include "llvm/IR/Dominators.h"
106 #include "llvm/IR/Function.h"
107 #include "llvm/IR/IRBuilder.h"
108 #include "llvm/IR/InstrTypes.h"
109 #include "llvm/IR/Instruction.h"
110 #include "llvm/IR/Instructions.h"
111 #include "llvm/IR/IntrinsicInst.h"
112 #include "llvm/IR/Intrinsics.h"
113 #include "llvm/IR/Metadata.h"
114 #include "llvm/IR/Module.h"
115 #include "llvm/IR/Operator.h"
116 #include "llvm/IR/PatternMatch.h"
117 #include "llvm/IR/Type.h"
118 #include "llvm/IR/Use.h"
119 #include "llvm/IR/User.h"
120 #include "llvm/IR/Value.h"
121 #include "llvm/IR/ValueHandle.h"
122 #include "llvm/IR/Verifier.h"
123 #include "llvm/InitializePasses.h"
124 #include "llvm/Pass.h"
125 #include "llvm/Support/Casting.h"
126 #include "llvm/Support/CommandLine.h"
127 #include "llvm/Support/Compiler.h"
128 #include "llvm/Support/Debug.h"
129 #include "llvm/Support/ErrorHandling.h"
130 #include "llvm/Support/InstructionCost.h"
131 #include "llvm/Support/MathExtras.h"
132 #include "llvm/Support/raw_ostream.h"
133 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
134 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
135 #include "llvm/Transforms/Utils/LoopSimplify.h"
136 #include "llvm/Transforms/Utils/LoopUtils.h"
137 #include "llvm/Transforms/Utils/LoopVersioning.h"
138 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cmath>
144 #include <cstdint>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <map>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 #ifndef NDEBUG
160 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161 #endif
162 
163 /// @{
164 /// Metadata attribute names
165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166 const char LLVMLoopVectorizeFollowupVectorized[] =
167     "llvm.loop.vectorize.followup_vectorized";
168 const char LLVMLoopVectorizeFollowupEpilogue[] =
169     "llvm.loop.vectorize.followup_epilogue";
170 /// @}
171 
172 STATISTIC(LoopsVectorized, "Number of loops vectorized");
173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175 
176 static cl::opt<bool> EnableEpilogueVectorization(
177     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178     cl::desc("Enable vectorization of epilogue loops."));
179 
180 static cl::opt<unsigned> EpilogueVectorizationForceVF(
181     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182     cl::desc("When epilogue vectorization is enabled, and a value greater than "
183              "1 is specified, forces the given VF for all applicable epilogue "
184              "loops."));
185 
186 static cl::opt<unsigned> EpilogueVectorizationMinVF(
187     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188     cl::desc("Only loops with vectorization factor equal to or larger than "
189              "the specified value are considered for epilogue vectorization."));
190 
191 /// Loops with a known constant trip count below this number are vectorized only
192 /// if no scalar iteration overheads are incurred.
193 static cl::opt<unsigned> TinyTripCountVectorThreshold(
194     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195     cl::desc("Loops with a constant trip count that is smaller than this "
196              "value are vectorized only if no scalar iteration overheads "
197              "are incurred."));
198 
199 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
200     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201     cl::desc("The maximum allowed number of runtime memory checks"));
202 
203 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
204 // that predication is preferred, and this lists all options. I.e., the
205 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
206 // and predicate the instructions accordingly. If tail-folding fails, there are
207 // different fallback strategies depending on these values:
208 namespace PreferPredicateTy {
209   enum Option {
210     ScalarEpilogue = 0,
211     PredicateElseScalarEpilogue,
212     PredicateOrDontVectorize
213   };
214 } // namespace PreferPredicateTy
215 
216 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
217     "prefer-predicate-over-epilogue",
218     cl::init(PreferPredicateTy::ScalarEpilogue),
219     cl::Hidden,
220     cl::desc("Tail-folding and predication preferences over creating a scalar "
221              "epilogue loop."),
222     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
223                          "scalar-epilogue",
224                          "Don't tail-predicate loops, create scalar epilogue"),
225               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
226                          "predicate-else-scalar-epilogue",
227                          "prefer tail-folding, create scalar epilogue if tail "
228                          "folding fails."),
229               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
230                          "predicate-dont-vectorize",
231                          "prefers tail-folding, don't attempt vectorization if "
232                          "tail-folding fails.")));
233 
234 static cl::opt<bool> MaximizeBandwidth(
235     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
236     cl::desc("Maximize bandwidth when selecting vectorization factor which "
237              "will be determined by the smallest type in loop."));
238 
239 static cl::opt<bool> EnableInterleavedMemAccesses(
240     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
241     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
242 
243 /// An interleave-group may need masking if it resides in a block that needs
244 /// predication, or in order to mask away gaps.
245 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
246     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
247     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
248 
249 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
250     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
251     cl::desc("We don't interleave loops with a estimated constant trip count "
252              "below this number"));
253 
254 static cl::opt<unsigned> ForceTargetNumScalarRegs(
255     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
256     cl::desc("A flag that overrides the target's number of scalar registers."));
257 
258 static cl::opt<unsigned> ForceTargetNumVectorRegs(
259     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
260     cl::desc("A flag that overrides the target's number of vector registers."));
261 
262 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
263     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
264     cl::desc("A flag that overrides the target's max interleave factor for "
265              "scalar loops."));
266 
267 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
268     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
269     cl::desc("A flag that overrides the target's max interleave factor for "
270              "vectorized loops."));
271 
272 static cl::opt<unsigned> ForceTargetInstructionCost(
273     "force-target-instruction-cost", cl::init(0), cl::Hidden,
274     cl::desc("A flag that overrides the target's expected cost for "
275              "an instruction to a single constant value. Mostly "
276              "useful for getting consistent testing."));
277 
278 static cl::opt<bool> ForceTargetSupportsScalableVectors(
279     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
280     cl::desc(
281         "Pretend that scalable vectors are supported, even if the target does "
282         "not support them. This flag should only be used for testing."));
283 
284 static cl::opt<unsigned> SmallLoopCost(
285     "small-loop-cost", cl::init(20), cl::Hidden,
286     cl::desc(
287         "The cost of a loop that is considered 'small' by the interleaver."));
288 
289 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
290     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
291     cl::desc("Enable the use of the block frequency analysis to access PGO "
292              "heuristics minimizing code growth in cold regions and being more "
293              "aggressive in hot regions."));
294 
295 // Runtime interleave loops for load/store throughput.
296 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
297     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
298     cl::desc(
299         "Enable runtime interleaving until load/store ports are saturated"));
300 
301 /// Interleave small loops with scalar reductions.
302 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
303     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
304     cl::desc("Enable interleaving for loops with small iteration counts that "
305              "contain scalar reductions to expose ILP."));
306 
307 /// The number of stores in a loop that are allowed to need predication.
308 static cl::opt<unsigned> NumberOfStoresToPredicate(
309     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
310     cl::desc("Max number of stores to be predicated behind an if."));
311 
312 static cl::opt<bool> EnableIndVarRegisterHeur(
313     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
314     cl::desc("Count the induction variable only once when interleaving"));
315 
316 static cl::opt<bool> EnableCondStoresVectorization(
317     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
318     cl::desc("Enable if predication of stores during vectorization."));
319 
320 static cl::opt<unsigned> MaxNestedScalarReductionIC(
321     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
322     cl::desc("The maximum interleave count to use when interleaving a scalar "
323              "reduction in a nested loop."));
324 
325 static cl::opt<bool>
326     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
327                            cl::Hidden,
328                            cl::desc("Prefer in-loop vector reductions, "
329                                     "overriding the targets preference."));
330 
331 static cl::opt<bool> ForceOrderedReductions(
332     "force-ordered-reductions", cl::init(false), cl::Hidden,
333     cl::desc("Enable the vectorisation of loops with in-order (strict) "
334              "FP reductions"));
335 
336 static cl::opt<bool> PreferPredicatedReductionSelect(
337     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
338     cl::desc(
339         "Prefer predicating a reduction operation over an after loop select."));
340 
341 cl::opt<bool> EnableVPlanNativePath(
342     "enable-vplan-native-path", cl::init(false), cl::Hidden,
343     cl::desc("Enable VPlan-native vectorization path with "
344              "support for outer loop vectorization."));
345 
346 // This flag enables the stress testing of the VPlan H-CFG construction in the
347 // VPlan-native vectorization path. It must be used in conjuction with
348 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
349 // verification of the H-CFGs built.
350 static cl::opt<bool> VPlanBuildStressTest(
351     "vplan-build-stress-test", cl::init(false), cl::Hidden,
352     cl::desc(
353         "Build VPlan for every supported loop nest in the function and bail "
354         "out right after the build (stress test the VPlan H-CFG construction "
355         "in the VPlan-native vectorization path)."));
356 
357 cl::opt<bool> llvm::EnableLoopInterleaving(
358     "interleave-loops", cl::init(true), cl::Hidden,
359     cl::desc("Enable loop interleaving in Loop vectorization passes"));
360 cl::opt<bool> llvm::EnableLoopVectorization(
361     "vectorize-loops", cl::init(true), cl::Hidden,
362     cl::desc("Run the Loop vectorization passes"));
363 
364 static cl::opt<bool> PrintVPlansInDotFormat(
365     "vplan-print-in-dot-format", cl::Hidden,
366     cl::desc("Use dot format instead of plain text when dumping VPlans"));
367 
368 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
369     "force-widen-divrem-via-safe-divisor", cl::Hidden,
370     cl::desc(
371         "Override cost based safe divisor widening for div/rem instructions"));
372 
373 /// A helper function that returns true if the given type is irregular. The
374 /// type is irregular if its allocated size doesn't equal the store size of an
375 /// element of the corresponding vector type.
376 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
377   // Determine if an array of N elements of type Ty is "bitcast compatible"
378   // with a <N x Ty> vector.
379   // This is only true if there is no padding between the array elements.
380   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
381 }
382 
383 /// A helper function that returns the reciprocal of the block probability of
384 /// predicated blocks. If we return X, we are assuming the predicated block
385 /// will execute once for every X iterations of the loop header.
386 ///
387 /// TODO: We should use actual block probability here, if available. Currently,
388 ///       we always assume predicated blocks have a 50% chance of executing.
389 static unsigned getReciprocalPredBlockProb() { return 2; }
390 
391 /// A helper function that returns an integer or floating-point constant with
392 /// value C.
393 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
394   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
395                            : ConstantFP::get(Ty, C);
396 }
397 
398 /// Returns "best known" trip count for the specified loop \p L as defined by
399 /// the following procedure:
400 ///   1) Returns exact trip count if it is known.
401 ///   2) Returns expected trip count according to profile data if any.
402 ///   3) Returns upper bound estimate if it is known.
403 ///   4) Returns std::nullopt if all of the above failed.
404 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
405                                                    Loop *L) {
406   // Check if exact trip count is known.
407   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
408     return ExpectedTC;
409 
410   // Check if there is an expected trip count available from profile data.
411   if (LoopVectorizeWithBlockFrequency)
412     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
413       return *EstimatedTC;
414 
415   // Check if upper bound estimate is known.
416   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
417     return ExpectedTC;
418 
419   return std::nullopt;
420 }
421 
422 namespace {
423 // Forward declare GeneratedRTChecks.
424 class GeneratedRTChecks;
425 } // namespace
426 
427 namespace llvm {
428 
429 AnalysisKey ShouldRunExtraVectorPasses::Key;
430 
431 /// InnerLoopVectorizer vectorizes loops which contain only one basic
432 /// block to a specified vectorization factor (VF).
433 /// This class performs the widening of scalars into vectors, or multiple
434 /// scalars. This class also implements the following features:
435 /// * It inserts an epilogue loop for handling loops that don't have iteration
436 ///   counts that are known to be a multiple of the vectorization factor.
437 /// * It handles the code generation for reduction variables.
438 /// * Scalarization (implementation using scalars) of un-vectorizable
439 ///   instructions.
440 /// InnerLoopVectorizer does not perform any vectorization-legality
441 /// checks, and relies on the caller to check for the different legality
442 /// aspects. The InnerLoopVectorizer relies on the
443 /// LoopVectorizationLegality class to provide information about the induction
444 /// and reduction variables that were found to a given vectorization factor.
445 class InnerLoopVectorizer {
446 public:
447   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
448                       LoopInfo *LI, DominatorTree *DT,
449                       const TargetLibraryInfo *TLI,
450                       const TargetTransformInfo *TTI, AssumptionCache *AC,
451                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
452                       ElementCount MinProfitableTripCount,
453                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
454                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
455                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
456       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
457         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
458         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
459         PSI(PSI), RTChecks(RTChecks) {
460     // Query this against the original loop and save it here because the profile
461     // of the original loop header may change as the transformation happens.
462     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
463         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
464 
465     if (MinProfitableTripCount.isZero())
466       this->MinProfitableTripCount = VecWidth;
467     else
468       this->MinProfitableTripCount = MinProfitableTripCount;
469   }
470 
471   virtual ~InnerLoopVectorizer() = default;
472 
473   /// Create a new empty loop that will contain vectorized instructions later
474   /// on, while the old loop will be used as the scalar remainder. Control flow
475   /// is generated around the vectorized (and scalar epilogue) loops consisting
476   /// of various checks and bypasses. Return the pre-header block of the new
477   /// loop and the start value for the canonical induction, if it is != 0. The
478   /// latter is the case when vectorizing the epilogue loop. In the case of
479   /// epilogue vectorization, this function is overriden to handle the more
480   /// complex control flow around the loops.
481   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
482 
483   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
484   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
485 
486   // Return true if any runtime check is added.
487   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
488 
489   /// A type for vectorized values in the new loop. Each value from the
490   /// original loop, when vectorized, is represented by UF vector values in the
491   /// new unrolled loop, where UF is the unroll factor.
492   using VectorParts = SmallVector<Value *, 2>;
493 
494   /// A helper function to scalarize a single Instruction in the innermost loop.
495   /// Generates a sequence of scalar instances for each lane between \p MinLane
496   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
497   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
498   /// Instr's operands.
499   void scalarizeInstruction(const Instruction *Instr,
500                             VPReplicateRecipe *RepRecipe,
501                             const VPIteration &Instance, bool IfPredicateInstr,
502                             VPTransformState &State);
503 
504   /// Construct the vector value of a scalarized value \p V one lane at a time.
505   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
506                                  VPTransformState &State);
507 
508   /// Try to vectorize interleaved access group \p Group with the base address
509   /// given in \p Addr, optionally masking the vector operations if \p
510   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
511   /// values in the vectorized loop.
512   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
513                                 ArrayRef<VPValue *> VPDefs,
514                                 VPTransformState &State, VPValue *Addr,
515                                 ArrayRef<VPValue *> StoredValues,
516                                 VPValue *BlockInMask = nullptr);
517 
518   /// Fix the non-induction PHIs in \p Plan.
519   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
520 
521   /// Returns true if the reordering of FP operations is not allowed, but we are
522   /// able to vectorize with strict in-order reductions for the given RdxDesc.
523   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
524 
525   /// Create a broadcast instruction. This method generates a broadcast
526   /// instruction (shuffle) for loop invariant values and for the induction
527   /// value. If this is the induction variable then we extend it to N, N+1, ...
528   /// this is needed because each iteration in the loop corresponds to a SIMD
529   /// element.
530   virtual Value *getBroadcastInstrs(Value *V);
531 
532   // Returns the resume value (bc.merge.rdx) for a reduction as
533   // generated by fixReduction.
534   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
535 
536   /// Create a new phi node for the induction variable \p OrigPhi to resume
537   /// iteration count in the scalar epilogue, from where the vectorized loop
538   /// left off. In cases where the loop skeleton is more complicated (eg.
539   /// epilogue vectorization) and the resume values can come from an additional
540   /// bypass block, the \p AdditionalBypass pair provides information about the
541   /// bypass block and the end value on the edge from bypass to this loop.
542   PHINode *createInductionResumeValue(
543       PHINode *OrigPhi, const InductionDescriptor &ID,
544       ArrayRef<BasicBlock *> BypassBlocks,
545       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
546 
547 protected:
548   friend class LoopVectorizationPlanner;
549 
550   /// A small list of PHINodes.
551   using PhiVector = SmallVector<PHINode *, 4>;
552 
553   /// A type for scalarized values in the new loop. Each value from the
554   /// original loop, when scalarized, is represented by UF x VF scalar values
555   /// in the new unrolled loop, where UF is the unroll factor and VF is the
556   /// vectorization factor.
557   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
558 
559   /// Set up the values of the IVs correctly when exiting the vector loop.
560   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
561                     Value *VectorTripCount, Value *EndValue,
562                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
563                     VPlan &Plan);
564 
565   /// Handle all cross-iteration phis in the header.
566   void fixCrossIterationPHIs(VPTransformState &State);
567 
568   /// Create the exit value of first order recurrences in the middle block and
569   /// update their users.
570   void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
571                                VPTransformState &State);
572 
573   /// Create code for the loop exit value of the reduction.
574   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
575 
576   /// Clear NSW/NUW flags from reduction instructions if necessary.
577   void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
578                                VPTransformState &State);
579 
580   /// Iteratively sink the scalarized operands of a predicated instruction into
581   /// the block that was created for it.
582   void sinkScalarOperands(Instruction *PredInst);
583 
584   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
585   /// represented as.
586   void truncateToMinimalBitwidths(VPTransformState &State);
587 
588   /// Returns (and creates if needed) the original loop trip count.
589   Value *getOrCreateTripCount(BasicBlock *InsertBlock);
590 
591   /// Returns (and creates if needed) the trip count of the widened loop.
592   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
593 
594   /// Returns a bitcasted value to the requested vector type.
595   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
596   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
597                                 const DataLayout &DL);
598 
599   /// Emit a bypass check to see if the vector trip count is zero, including if
600   /// it overflows.
601   void emitIterationCountCheck(BasicBlock *Bypass);
602 
603   /// Emit a bypass check to see if all of the SCEV assumptions we've
604   /// had to make are correct. Returns the block containing the checks or
605   /// nullptr if no checks have been added.
606   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
607 
608   /// Emit bypass checks to check any memory assumptions we may have made.
609   /// Returns the block containing the checks or nullptr if no checks have been
610   /// added.
611   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
612 
613   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
614   /// vector loop preheader, middle block and scalar preheader.
615   void createVectorLoopSkeleton(StringRef Prefix);
616 
617   /// Create new phi nodes for the induction variables to resume iteration count
618   /// in the scalar epilogue, from where the vectorized loop left off.
619   /// In cases where the loop skeleton is more complicated (eg. epilogue
620   /// vectorization) and the resume values can come from an additional bypass
621   /// block, the \p AdditionalBypass pair provides information about the bypass
622   /// block and the end value on the edge from bypass to this loop.
623   void createInductionResumeValues(
624       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
625 
626   /// Complete the loop skeleton by adding debug MDs, creating appropriate
627   /// conditional branches in the middle block, preparing the builder and
628   /// running the verifier. Return the preheader of the completed vector loop.
629   BasicBlock *completeLoopSkeleton();
630 
631   /// Collect poison-generating recipes that may generate a poison value that is
632   /// used after vectorization, even when their operands are not poison. Those
633   /// recipes meet the following conditions:
634   ///  * Contribute to the address computation of a recipe generating a widen
635   ///    memory load/store (VPWidenMemoryInstructionRecipe or
636   ///    VPInterleaveRecipe).
637   ///  * Such a widen memory load/store has at least one underlying Instruction
638   ///    that is in a basic block that needs predication and after vectorization
639   ///    the generated instruction won't be predicated.
640   void collectPoisonGeneratingRecipes(VPTransformState &State);
641 
642   /// Allow subclasses to override and print debug traces before/after vplan
643   /// execution, when trace information is requested.
644   virtual void printDebugTracesAtStart(){};
645   virtual void printDebugTracesAtEnd(){};
646 
647   /// The original loop.
648   Loop *OrigLoop;
649 
650   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
651   /// dynamic knowledge to simplify SCEV expressions and converts them to a
652   /// more usable form.
653   PredicatedScalarEvolution &PSE;
654 
655   /// Loop Info.
656   LoopInfo *LI;
657 
658   /// Dominator Tree.
659   DominatorTree *DT;
660 
661   /// Target Library Info.
662   const TargetLibraryInfo *TLI;
663 
664   /// Target Transform Info.
665   const TargetTransformInfo *TTI;
666 
667   /// Assumption Cache.
668   AssumptionCache *AC;
669 
670   /// Interface to emit optimization remarks.
671   OptimizationRemarkEmitter *ORE;
672 
673   /// The vectorization SIMD factor to use. Each vector will have this many
674   /// vector elements.
675   ElementCount VF;
676 
677   ElementCount MinProfitableTripCount;
678 
679   /// The vectorization unroll factor to use. Each scalar is vectorized to this
680   /// many different vector instructions.
681   unsigned UF;
682 
683   /// The builder that we use
684   IRBuilder<> Builder;
685 
686   // --- Vectorization state ---
687 
688   /// The vector-loop preheader.
689   BasicBlock *LoopVectorPreHeader;
690 
691   /// The scalar-loop preheader.
692   BasicBlock *LoopScalarPreHeader;
693 
694   /// Middle Block between the vector and the scalar.
695   BasicBlock *LoopMiddleBlock;
696 
697   /// The unique ExitBlock of the scalar loop if one exists.  Note that
698   /// there can be multiple exiting edges reaching this block.
699   BasicBlock *LoopExitBlock;
700 
701   /// The scalar loop body.
702   BasicBlock *LoopScalarBody;
703 
704   /// A list of all bypass blocks. The first block is the entry of the loop.
705   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
706 
707   /// Store instructions that were predicated.
708   SmallVector<Instruction *, 4> PredicatedInstructions;
709 
710   /// Trip count of the original loop.
711   Value *TripCount = nullptr;
712 
713   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
714   Value *VectorTripCount = nullptr;
715 
716   /// The legality analysis.
717   LoopVectorizationLegality *Legal;
718 
719   /// The profitablity analysis.
720   LoopVectorizationCostModel *Cost;
721 
722   // Record whether runtime checks are added.
723   bool AddedSafetyChecks = false;
724 
725   // Holds the end values for each induction variable. We save the end values
726   // so we can later fix-up the external users of the induction variables.
727   DenseMap<PHINode *, Value *> IVEndValues;
728 
729   /// BFI and PSI are used to check for profile guided size optimizations.
730   BlockFrequencyInfo *BFI;
731   ProfileSummaryInfo *PSI;
732 
733   // Whether this loop should be optimized for size based on profile guided size
734   // optimizatios.
735   bool OptForSizeBasedOnProfile;
736 
737   /// Structure to hold information about generated runtime checks, responsible
738   /// for cleaning the checks, if vectorization turns out unprofitable.
739   GeneratedRTChecks &RTChecks;
740 
741   // Holds the resume values for reductions in the loops, used to set the
742   // correct start value of reduction PHIs when vectorizing the epilogue.
743   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
744       ReductionResumeValues;
745 };
746 
747 class InnerLoopUnroller : public InnerLoopVectorizer {
748 public:
749   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
750                     LoopInfo *LI, DominatorTree *DT,
751                     const TargetLibraryInfo *TLI,
752                     const TargetTransformInfo *TTI, AssumptionCache *AC,
753                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
754                     LoopVectorizationLegality *LVL,
755                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
756                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
757       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
758                             ElementCount::getFixed(1),
759                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
760                             BFI, PSI, Check) {}
761 
762 private:
763   Value *getBroadcastInstrs(Value *V) override;
764 };
765 
766 /// Encapsulate information regarding vectorization of a loop and its epilogue.
767 /// This information is meant to be updated and used across two stages of
768 /// epilogue vectorization.
769 struct EpilogueLoopVectorizationInfo {
770   ElementCount MainLoopVF = ElementCount::getFixed(0);
771   unsigned MainLoopUF = 0;
772   ElementCount EpilogueVF = ElementCount::getFixed(0);
773   unsigned EpilogueUF = 0;
774   BasicBlock *MainLoopIterationCountCheck = nullptr;
775   BasicBlock *EpilogueIterationCountCheck = nullptr;
776   BasicBlock *SCEVSafetyCheck = nullptr;
777   BasicBlock *MemSafetyCheck = nullptr;
778   Value *TripCount = nullptr;
779   Value *VectorTripCount = nullptr;
780 
781   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
782                                 ElementCount EVF, unsigned EUF)
783       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
784     assert(EUF == 1 &&
785            "A high UF for the epilogue loop is likely not beneficial.");
786   }
787 };
788 
789 /// An extension of the inner loop vectorizer that creates a skeleton for a
790 /// vectorized loop that has its epilogue (residual) also vectorized.
791 /// The idea is to run the vplan on a given loop twice, firstly to setup the
792 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
793 /// from the first step and vectorize the epilogue.  This is achieved by
794 /// deriving two concrete strategy classes from this base class and invoking
795 /// them in succession from the loop vectorizer planner.
796 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
797 public:
798   InnerLoopAndEpilogueVectorizer(
799       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
800       DominatorTree *DT, const TargetLibraryInfo *TLI,
801       const TargetTransformInfo *TTI, AssumptionCache *AC,
802       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
803       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
804       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
805       GeneratedRTChecks &Checks)
806       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
807                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
808                             CM, BFI, PSI, Checks),
809         EPI(EPI) {}
810 
811   // Override this function to handle the more complex control flow around the
812   // three loops.
813   std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
814     return createEpilogueVectorizedLoopSkeleton();
815   }
816 
817   /// The interface for creating a vectorized skeleton using one of two
818   /// different strategies, each corresponding to one execution of the vplan
819   /// as described above.
820   virtual std::pair<BasicBlock *, Value *>
821   createEpilogueVectorizedLoopSkeleton() = 0;
822 
823   /// Holds and updates state information required to vectorize the main loop
824   /// and its epilogue in two separate passes. This setup helps us avoid
825   /// regenerating and recomputing runtime safety checks. It also helps us to
826   /// shorten the iteration-count-check path length for the cases where the
827   /// iteration count of the loop is so small that the main vector loop is
828   /// completely skipped.
829   EpilogueLoopVectorizationInfo &EPI;
830 };
831 
832 /// A specialized derived class of inner loop vectorizer that performs
833 /// vectorization of *main* loops in the process of vectorizing loops and their
834 /// epilogues.
835 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
836 public:
837   EpilogueVectorizerMainLoop(
838       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
839       DominatorTree *DT, const TargetLibraryInfo *TLI,
840       const TargetTransformInfo *TTI, AssumptionCache *AC,
841       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
842       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
843       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
844       GeneratedRTChecks &Check)
845       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
846                                        EPI, LVL, CM, BFI, PSI, Check) {}
847   /// Implements the interface for creating a vectorized skeleton using the
848   /// *main loop* strategy (ie the first pass of vplan execution).
849   std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
850 
851 protected:
852   /// Emits an iteration count bypass check once for the main loop (when \p
853   /// ForEpilogue is false) and once for the epilogue loop (when \p
854   /// ForEpilogue is true).
855   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
856   void printDebugTracesAtStart() override;
857   void printDebugTracesAtEnd() override;
858 };
859 
860 // A specialized derived class of inner loop vectorizer that performs
861 // vectorization of *epilogue* loops in the process of vectorizing loops and
862 // their epilogues.
863 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
864 public:
865   EpilogueVectorizerEpilogueLoop(
866       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
867       DominatorTree *DT, const TargetLibraryInfo *TLI,
868       const TargetTransformInfo *TTI, AssumptionCache *AC,
869       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
870       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
871       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
872       GeneratedRTChecks &Checks)
873       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
874                                        EPI, LVL, CM, BFI, PSI, Checks) {
875     TripCount = EPI.TripCount;
876   }
877   /// Implements the interface for creating a vectorized skeleton using the
878   /// *epilogue loop* strategy (ie the second pass of vplan execution).
879   std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
880 
881 protected:
882   /// Emits an iteration count bypass check after the main vector loop has
883   /// finished to see if there are any iterations left to execute by either
884   /// the vector epilogue or the scalar epilogue.
885   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
886                                                       BasicBlock *Bypass,
887                                                       BasicBlock *Insert);
888   void printDebugTracesAtStart() override;
889   void printDebugTracesAtEnd() override;
890 };
891 } // end namespace llvm
892 
893 /// Look for a meaningful debug location on the instruction or it's
894 /// operands.
895 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
896   if (!I)
897     return I;
898 
899   DebugLoc Empty;
900   if (I->getDebugLoc() != Empty)
901     return I;
902 
903   for (Use &Op : I->operands()) {
904     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
905       if (OpInst->getDebugLoc() != Empty)
906         return OpInst;
907   }
908 
909   return I;
910 }
911 
912 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
913 /// is passed, the message relates to that particular instruction.
914 #ifndef NDEBUG
915 static void debugVectorizationMessage(const StringRef Prefix,
916                                       const StringRef DebugMsg,
917                                       Instruction *I) {
918   dbgs() << "LV: " << Prefix << DebugMsg;
919   if (I != nullptr)
920     dbgs() << " " << *I;
921   else
922     dbgs() << '.';
923   dbgs() << '\n';
924 }
925 #endif
926 
927 /// Create an analysis remark that explains why vectorization failed
928 ///
929 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
930 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
931 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
932 /// the location of the remark.  \return the remark object that can be
933 /// streamed to.
934 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
935     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
936   Value *CodeRegion = TheLoop->getHeader();
937   DebugLoc DL = TheLoop->getStartLoc();
938 
939   if (I) {
940     CodeRegion = I->getParent();
941     // If there is no debug location attached to the instruction, revert back to
942     // using the loop's.
943     if (I->getDebugLoc())
944       DL = I->getDebugLoc();
945   }
946 
947   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
948 }
949 
950 namespace llvm {
951 
952 /// Return a value for Step multiplied by VF.
953 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
954                        int64_t Step) {
955   assert(Ty->isIntegerTy() && "Expected an integer step");
956   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
957   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
958 }
959 
960 /// Return the runtime value for VF.
961 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
962   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
963   return VF.isScalable() ? B.CreateVScale(EC) : EC;
964 }
965 
966 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
967   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
968   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
969 
970   ScalarEvolution &SE = *PSE.getSE();
971 
972   // The exit count might have the type of i64 while the phi is i32. This can
973   // happen if we have an induction variable that is sign extended before the
974   // compare. The only way that we get a backedge taken count is that the
975   // induction variable was signed and as such will not overflow. In such a case
976   // truncation is legal.
977   if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
978       IdxTy->getPrimitiveSizeInBits())
979     BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
980   BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
981 
982   // Get the total trip count from the count by adding 1.
983   return SE.getAddExpr(BackedgeTakenCount,
984                        SE.getOne(BackedgeTakenCount->getType()));
985 }
986 
987 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
988                                   ElementCount VF) {
989   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
990   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
991   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
992   return B.CreateUIToFP(RuntimeVF, FTy);
993 }
994 
995 void reportVectorizationFailure(const StringRef DebugMsg,
996                                 const StringRef OREMsg, const StringRef ORETag,
997                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
998                                 Instruction *I) {
999   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1000   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1001   ORE->emit(
1002       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1003       << "loop not vectorized: " << OREMsg);
1004 }
1005 
1006 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1007                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1008                              Instruction *I) {
1009   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1010   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1011   ORE->emit(
1012       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1013       << Msg);
1014 }
1015 
1016 } // end namespace llvm
1017 
1018 #ifndef NDEBUG
1019 /// \return string containing a file name and a line # for the given loop.
1020 static std::string getDebugLocString(const Loop *L) {
1021   std::string Result;
1022   if (L) {
1023     raw_string_ostream OS(Result);
1024     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1025       LoopDbgLoc.print(OS);
1026     else
1027       // Just print the module name.
1028       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1029     OS.flush();
1030   }
1031   return Result;
1032 }
1033 #endif
1034 
1035 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1036     VPTransformState &State) {
1037 
1038   // Collect recipes in the backward slice of `Root` that may generate a poison
1039   // value that is used after vectorization.
1040   SmallPtrSet<VPRecipeBase *, 16> Visited;
1041   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1042     SmallVector<VPRecipeBase *, 16> Worklist;
1043     Worklist.push_back(Root);
1044 
1045     // Traverse the backward slice of Root through its use-def chain.
1046     while (!Worklist.empty()) {
1047       VPRecipeBase *CurRec = Worklist.back();
1048       Worklist.pop_back();
1049 
1050       if (!Visited.insert(CurRec).second)
1051         continue;
1052 
1053       // Prune search if we find another recipe generating a widen memory
1054       // instruction. Widen memory instructions involved in address computation
1055       // will lead to gather/scatter instructions, which don't need to be
1056       // handled.
1057       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1058           isa<VPInterleaveRecipe>(CurRec) ||
1059           isa<VPScalarIVStepsRecipe>(CurRec) ||
1060           isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1061           isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1062         continue;
1063 
1064       // This recipe contributes to the address computation of a widen
1065       // load/store. Collect recipe if its underlying instruction has
1066       // poison-generating flags.
1067       Instruction *Instr = CurRec->getUnderlyingInstr();
1068       if (Instr && Instr->hasPoisonGeneratingFlags())
1069         State.MayGeneratePoisonRecipes.insert(CurRec);
1070 
1071       // Add new definitions to the worklist.
1072       for (VPValue *operand : CurRec->operands())
1073         if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1074           Worklist.push_back(OpDef);
1075     }
1076   });
1077 
1078   // Traverse all the recipes in the VPlan and collect the poison-generating
1079   // recipes in the backward slice starting at the address of a VPWidenRecipe or
1080   // VPInterleaveRecipe.
1081   auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1082   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1083     for (VPRecipeBase &Recipe : *VPBB) {
1084       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1085         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1086         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1087         if (AddrDef && WidenRec->isConsecutive() &&
1088             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1089           collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1090       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1091         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1092         if (AddrDef) {
1093           // Check if any member of the interleave group needs predication.
1094           const InterleaveGroup<Instruction> *InterGroup =
1095               InterleaveRec->getInterleaveGroup();
1096           bool NeedPredication = false;
1097           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1098                I < NumMembers; ++I) {
1099             Instruction *Member = InterGroup->getMember(I);
1100             if (Member)
1101               NeedPredication |=
1102                   Legal->blockNeedsPredication(Member->getParent());
1103           }
1104 
1105           if (NeedPredication)
1106             collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1107         }
1108       }
1109     }
1110   }
1111 }
1112 
1113 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1114     const RecurrenceDescriptor &RdxDesc) {
1115   auto It = ReductionResumeValues.find(&RdxDesc);
1116   assert(It != ReductionResumeValues.end() &&
1117          "Expected to find a resume value for the reduction.");
1118   return It->second;
1119 }
1120 
1121 namespace llvm {
1122 
1123 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1124 // lowered.
1125 enum ScalarEpilogueLowering {
1126 
1127   // The default: allowing scalar epilogues.
1128   CM_ScalarEpilogueAllowed,
1129 
1130   // Vectorization with OptForSize: don't allow epilogues.
1131   CM_ScalarEpilogueNotAllowedOptSize,
1132 
1133   // A special case of vectorisation with OptForSize: loops with a very small
1134   // trip count are considered for vectorization under OptForSize, thereby
1135   // making sure the cost of their loop body is dominant, free of runtime
1136   // guards and scalar iteration overheads.
1137   CM_ScalarEpilogueNotAllowedLowTripLoop,
1138 
1139   // Loop hint predicate indicating an epilogue is undesired.
1140   CM_ScalarEpilogueNotNeededUsePredicate,
1141 
1142   // Directive indicating we must either tail fold or not vectorize
1143   CM_ScalarEpilogueNotAllowedUsePredicate
1144 };
1145 
1146 /// ElementCountComparator creates a total ordering for ElementCount
1147 /// for the purposes of using it in a set structure.
1148 struct ElementCountComparator {
1149   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1150     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1151            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1152   }
1153 };
1154 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1155 
1156 /// LoopVectorizationCostModel - estimates the expected speedups due to
1157 /// vectorization.
1158 /// In many cases vectorization is not profitable. This can happen because of
1159 /// a number of reasons. In this class we mainly attempt to predict the
1160 /// expected speedup/slowdowns due to the supported instruction set. We use the
1161 /// TargetTransformInfo to query the different backends for the cost of
1162 /// different operations.
1163 class LoopVectorizationCostModel {
1164 public:
1165   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1166                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1167                              LoopVectorizationLegality *Legal,
1168                              const TargetTransformInfo &TTI,
1169                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1170                              AssumptionCache *AC,
1171                              OptimizationRemarkEmitter *ORE, const Function *F,
1172                              const LoopVectorizeHints *Hints,
1173                              InterleavedAccessInfo &IAI)
1174       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1175         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1176         Hints(Hints), InterleaveInfo(IAI) {}
1177 
1178   /// \return An upper bound for the vectorization factors (both fixed and
1179   /// scalable). If the factors are 0, vectorization and interleaving should be
1180   /// avoided up front.
1181   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1182 
1183   /// \return True if runtime checks are required for vectorization, and false
1184   /// otherwise.
1185   bool runtimeChecksRequired();
1186 
1187   /// \return The most profitable vectorization factor and the cost of that VF.
1188   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1189   /// then this vectorization factor will be selected if vectorization is
1190   /// possible.
1191   VectorizationFactor
1192   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1193 
1194   VectorizationFactor
1195   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1196                                     const LoopVectorizationPlanner &LVP);
1197 
1198   /// Setup cost-based decisions for user vectorization factor.
1199   /// \return true if the UserVF is a feasible VF to be chosen.
1200   bool selectUserVectorizationFactor(ElementCount UserVF) {
1201     collectUniformsAndScalars(UserVF);
1202     collectInstsToScalarize(UserVF);
1203     return expectedCost(UserVF).first.isValid();
1204   }
1205 
1206   /// \return The size (in bits) of the smallest and widest types in the code
1207   /// that needs to be vectorized. We ignore values that remain scalar such as
1208   /// 64 bit loop indices.
1209   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1210 
1211   /// \return The desired interleave count.
1212   /// If interleave count has been specified by metadata it will be returned.
1213   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1214   /// are the selected vectorization factor and the cost of the selected VF.
1215   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1216 
1217   /// Memory access instruction may be vectorized in more than one way.
1218   /// Form of instruction after vectorization depends on cost.
1219   /// This function takes cost-based decisions for Load/Store instructions
1220   /// and collects them in a map. This decisions map is used for building
1221   /// the lists of loop-uniform and loop-scalar instructions.
1222   /// The calculated cost is saved with widening decision in order to
1223   /// avoid redundant calculations.
1224   void setCostBasedWideningDecision(ElementCount VF);
1225 
1226   /// A struct that represents some properties of the register usage
1227   /// of a loop.
1228   struct RegisterUsage {
1229     /// Holds the number of loop invariant values that are used in the loop.
1230     /// The key is ClassID of target-provided register class.
1231     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1232     /// Holds the maximum number of concurrent live intervals in the loop.
1233     /// The key is ClassID of target-provided register class.
1234     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1235   };
1236 
1237   /// \return Returns information about the register usages of the loop for the
1238   /// given vectorization factors.
1239   SmallVector<RegisterUsage, 8>
1240   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1241 
1242   /// Collect values we want to ignore in the cost model.
1243   void collectValuesToIgnore();
1244 
1245   /// Collect all element types in the loop for which widening is needed.
1246   void collectElementTypesForWidening();
1247 
1248   /// Split reductions into those that happen in the loop, and those that happen
1249   /// outside. In loop reductions are collected into InLoopReductionChains.
1250   void collectInLoopReductions();
1251 
1252   /// Returns true if we should use strict in-order reductions for the given
1253   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1254   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1255   /// of FP operations.
1256   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1257     return !Hints->allowReordering() && RdxDesc.isOrdered();
1258   }
1259 
1260   /// \returns The smallest bitwidth each instruction can be represented with.
1261   /// The vector equivalents of these instructions should be truncated to this
1262   /// type.
1263   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1264     return MinBWs;
1265   }
1266 
1267   /// \returns True if it is more profitable to scalarize instruction \p I for
1268   /// vectorization factor \p VF.
1269   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1270     assert(VF.isVector() &&
1271            "Profitable to scalarize relevant only for VF > 1.");
1272 
1273     // Cost model is not run in the VPlan-native path - return conservative
1274     // result until this changes.
1275     if (EnableVPlanNativePath)
1276       return false;
1277 
1278     auto Scalars = InstsToScalarize.find(VF);
1279     assert(Scalars != InstsToScalarize.end() &&
1280            "VF not yet analyzed for scalarization profitability");
1281     return Scalars->second.find(I) != Scalars->second.end();
1282   }
1283 
1284   /// Returns true if \p I is known to be uniform after vectorization.
1285   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1286     if (VF.isScalar())
1287       return true;
1288 
1289     // Cost model is not run in the VPlan-native path - return conservative
1290     // result until this changes.
1291     if (EnableVPlanNativePath)
1292       return false;
1293 
1294     auto UniformsPerVF = Uniforms.find(VF);
1295     assert(UniformsPerVF != Uniforms.end() &&
1296            "VF not yet analyzed for uniformity");
1297     return UniformsPerVF->second.count(I);
1298   }
1299 
1300   /// Returns true if \p I is known to be scalar after vectorization.
1301   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1302     if (VF.isScalar())
1303       return true;
1304 
1305     // Cost model is not run in the VPlan-native path - return conservative
1306     // result until this changes.
1307     if (EnableVPlanNativePath)
1308       return false;
1309 
1310     auto ScalarsPerVF = Scalars.find(VF);
1311     assert(ScalarsPerVF != Scalars.end() &&
1312            "Scalar values are not calculated for VF");
1313     return ScalarsPerVF->second.count(I);
1314   }
1315 
1316   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1317   /// for vectorization factor \p VF.
1318   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1319     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1320            !isProfitableToScalarize(I, VF) &&
1321            !isScalarAfterVectorization(I, VF);
1322   }
1323 
1324   /// Decision that was taken during cost calculation for memory instruction.
1325   enum InstWidening {
1326     CM_Unknown,
1327     CM_Widen,         // For consecutive accesses with stride +1.
1328     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1329     CM_Interleave,
1330     CM_GatherScatter,
1331     CM_Scalarize
1332   };
1333 
1334   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1335   /// instruction \p I and vector width \p VF.
1336   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1337                            InstructionCost Cost) {
1338     assert(VF.isVector() && "Expected VF >=2");
1339     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1340   }
1341 
1342   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1343   /// interleaving group \p Grp and vector width \p VF.
1344   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1345                            ElementCount VF, InstWidening W,
1346                            InstructionCost Cost) {
1347     assert(VF.isVector() && "Expected VF >=2");
1348     /// Broadcast this decicion to all instructions inside the group.
1349     /// But the cost will be assigned to one instruction only.
1350     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1351       if (auto *I = Grp->getMember(i)) {
1352         if (Grp->getInsertPos() == I)
1353           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1354         else
1355           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1356       }
1357     }
1358   }
1359 
1360   /// Return the cost model decision for the given instruction \p I and vector
1361   /// width \p VF. Return CM_Unknown if this instruction did not pass
1362   /// through the cost modeling.
1363   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1364     assert(VF.isVector() && "Expected VF to be a vector VF");
1365     // Cost model is not run in the VPlan-native path - return conservative
1366     // result until this changes.
1367     if (EnableVPlanNativePath)
1368       return CM_GatherScatter;
1369 
1370     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1371     auto Itr = WideningDecisions.find(InstOnVF);
1372     if (Itr == WideningDecisions.end())
1373       return CM_Unknown;
1374     return Itr->second.first;
1375   }
1376 
1377   /// Return the vectorization cost for the given instruction \p I and vector
1378   /// width \p VF.
1379   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1380     assert(VF.isVector() && "Expected VF >=2");
1381     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1382     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1383            "The cost is not calculated");
1384     return WideningDecisions[InstOnVF].second;
1385   }
1386 
1387   /// Return True if instruction \p I is an optimizable truncate whose operand
1388   /// is an induction variable. Such a truncate will be removed by adding a new
1389   /// induction variable with the destination type.
1390   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1391     // If the instruction is not a truncate, return false.
1392     auto *Trunc = dyn_cast<TruncInst>(I);
1393     if (!Trunc)
1394       return false;
1395 
1396     // Get the source and destination types of the truncate.
1397     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1398     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1399 
1400     // If the truncate is free for the given types, return false. Replacing a
1401     // free truncate with an induction variable would add an induction variable
1402     // update instruction to each iteration of the loop. We exclude from this
1403     // check the primary induction variable since it will need an update
1404     // instruction regardless.
1405     Value *Op = Trunc->getOperand(0);
1406     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1407       return false;
1408 
1409     // If the truncated value is not an induction variable, return false.
1410     return Legal->isInductionPhi(Op);
1411   }
1412 
1413   /// Collects the instructions to scalarize for each predicated instruction in
1414   /// the loop.
1415   void collectInstsToScalarize(ElementCount VF);
1416 
1417   /// Collect Uniform and Scalar values for the given \p VF.
1418   /// The sets depend on CM decision for Load/Store instructions
1419   /// that may be vectorized as interleave, gather-scatter or scalarized.
1420   void collectUniformsAndScalars(ElementCount VF) {
1421     // Do the analysis once.
1422     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1423       return;
1424     setCostBasedWideningDecision(VF);
1425     collectLoopUniforms(VF);
1426     collectLoopScalars(VF);
1427   }
1428 
1429   /// Returns true if the target machine supports masked store operation
1430   /// for the given \p DataType and kind of access to \p Ptr.
1431   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1432     return Legal->isConsecutivePtr(DataType, Ptr) &&
1433            TTI.isLegalMaskedStore(DataType, Alignment);
1434   }
1435 
1436   /// Returns true if the target machine supports masked load operation
1437   /// for the given \p DataType and kind of access to \p Ptr.
1438   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1439     return Legal->isConsecutivePtr(DataType, Ptr) &&
1440            TTI.isLegalMaskedLoad(DataType, Alignment);
1441   }
1442 
1443   /// Returns true if the target machine can represent \p V as a masked gather
1444   /// or scatter operation.
1445   bool isLegalGatherOrScatter(Value *V,
1446                               ElementCount VF = ElementCount::getFixed(1)) {
1447     bool LI = isa<LoadInst>(V);
1448     bool SI = isa<StoreInst>(V);
1449     if (!LI && !SI)
1450       return false;
1451     auto *Ty = getLoadStoreType(V);
1452     Align Align = getLoadStoreAlignment(V);
1453     if (VF.isVector())
1454       Ty = VectorType::get(Ty, VF);
1455     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1456            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1457   }
1458 
1459   /// Returns true if the target machine supports all of the reduction
1460   /// variables found for the given VF.
1461   bool canVectorizeReductions(ElementCount VF) const {
1462     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1463       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1464       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1465     }));
1466   }
1467 
1468   /// Given costs for both strategies, return true if the scalar predication
1469   /// lowering should be used for div/rem.  This incorporates an override
1470   /// option so it is not simply a cost comparison.
1471   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1472                                      InstructionCost SafeDivisorCost) const {
1473     switch (ForceSafeDivisor) {
1474     case cl::BOU_UNSET:
1475       return ScalarCost < SafeDivisorCost;
1476     case cl::BOU_TRUE:
1477       return false;
1478     case cl::BOU_FALSE:
1479       return true;
1480     };
1481     llvm_unreachable("impossible case value");
1482   }
1483 
1484   /// Returns true if \p I is an instruction which requires predication and
1485   /// for which our chosen predication strategy is scalarization (i.e. we
1486   /// don't have an alternate strategy such as masking available).
1487   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1488   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1489 
1490   /// Returns true if \p I is an instruction that needs to be predicated
1491   /// at runtime.  The result is independent of the predication mechanism.
1492   /// Superset of instructions that return true for isScalarWithPredication.
1493   bool isPredicatedInst(Instruction *I) const;
1494 
1495   /// Return the costs for our two available strategies for lowering a
1496   /// div/rem operation which requires speculating at least one lane.
1497   /// First result is for scalarization (will be invalid for scalable
1498   /// vectors); second is for the safe-divisor strategy.
1499   std::pair<InstructionCost, InstructionCost>
1500   getDivRemSpeculationCost(Instruction *I,
1501                            ElementCount VF) const;
1502 
1503   /// Returns true if \p I is a memory instruction with consecutive memory
1504   /// access that can be widened.
1505   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1506 
1507   /// Returns true if \p I is a memory instruction in an interleaved-group
1508   /// of memory accesses that can be vectorized with wide vector loads/stores
1509   /// and shuffles.
1510   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1511 
1512   /// Check if \p Instr belongs to any interleaved access group.
1513   bool isAccessInterleaved(Instruction *Instr) {
1514     return InterleaveInfo.isInterleaved(Instr);
1515   }
1516 
1517   /// Get the interleaved access group that \p Instr belongs to.
1518   const InterleaveGroup<Instruction> *
1519   getInterleavedAccessGroup(Instruction *Instr) {
1520     return InterleaveInfo.getInterleaveGroup(Instr);
1521   }
1522 
1523   /// Returns true if we're required to use a scalar epilogue for at least
1524   /// the final iteration of the original loop.
1525   bool requiresScalarEpilogue(ElementCount VF) const {
1526     if (!isScalarEpilogueAllowed())
1527       return false;
1528     // If we might exit from anywhere but the latch, must run the exiting
1529     // iteration in scalar form.
1530     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1531       return true;
1532     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1533   }
1534 
1535   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1536   /// loop hint annotation.
1537   bool isScalarEpilogueAllowed() const {
1538     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1539   }
1540 
1541   /// Returns true if all loop blocks should be masked to fold tail loop.
1542   bool foldTailByMasking() const { return FoldTailByMasking; }
1543 
1544   /// Returns true if were tail-folding and want to use the active lane mask
1545   /// for vector loop control flow.
1546   bool useActiveLaneMaskForControlFlow() const {
1547     return FoldTailByMasking &&
1548            TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1549   }
1550 
1551   /// Returns true if the instructions in this block requires predication
1552   /// for any reason, e.g. because tail folding now requires a predicate
1553   /// or because the block in the original loop was predicated.
1554   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1555     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1556   }
1557 
1558   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1559   /// nodes to the chain of instructions representing the reductions. Uses a
1560   /// MapVector to ensure deterministic iteration order.
1561   using ReductionChainMap =
1562       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1563 
1564   /// Return the chain of instructions representing an inloop reduction.
1565   const ReductionChainMap &getInLoopReductionChains() const {
1566     return InLoopReductionChains;
1567   }
1568 
1569   /// Returns true if the Phi is part of an inloop reduction.
1570   bool isInLoopReduction(PHINode *Phi) const {
1571     return InLoopReductionChains.count(Phi);
1572   }
1573 
1574   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1575   /// with factor VF.  Return the cost of the instruction, including
1576   /// scalarization overhead if it's needed.
1577   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1578 
1579   /// Estimate cost of a call instruction CI if it were vectorized with factor
1580   /// VF. Return the cost of the instruction, including scalarization overhead
1581   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1582   /// scalarized -
1583   /// i.e. either vector version isn't available, or is too expensive.
1584   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1585                                     bool &NeedToScalarize) const;
1586 
1587   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1588   /// that of B.
1589   bool isMoreProfitable(const VectorizationFactor &A,
1590                         const VectorizationFactor &B) const;
1591 
1592   /// Invalidates decisions already taken by the cost model.
1593   void invalidateCostModelingDecisions() {
1594     WideningDecisions.clear();
1595     Uniforms.clear();
1596     Scalars.clear();
1597   }
1598 
1599   /// Convenience function that returns the value of vscale_range iff
1600   /// vscale_range.min == vscale_range.max or otherwise returns the value
1601   /// returned by the corresponding TLI method.
1602   std::optional<unsigned> getVScaleForTuning() const;
1603 
1604 private:
1605   unsigned NumPredStores = 0;
1606 
1607   /// \return An upper bound for the vectorization factors for both
1608   /// fixed and scalable vectorization, where the minimum-known number of
1609   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1610   /// disabled or unsupported, then the scalable part will be equal to
1611   /// ElementCount::getScalable(0).
1612   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1613                                            ElementCount UserVF,
1614                                            bool FoldTailByMasking);
1615 
1616   /// \return the maximized element count based on the targets vector
1617   /// registers and the loop trip-count, but limited to a maximum safe VF.
1618   /// This is a helper function of computeFeasibleMaxVF.
1619   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1620                                        unsigned SmallestType,
1621                                        unsigned WidestType,
1622                                        ElementCount MaxSafeVF,
1623                                        bool FoldTailByMasking);
1624 
1625   /// \return the maximum legal scalable VF, based on the safe max number
1626   /// of elements.
1627   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1628 
1629   /// The vectorization cost is a combination of the cost itself and a boolean
1630   /// indicating whether any of the contributing operations will actually
1631   /// operate on vector values after type legalization in the backend. If this
1632   /// latter value is false, then all operations will be scalarized (i.e. no
1633   /// vectorization has actually taken place).
1634   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1635 
1636   /// Returns the expected execution cost. The unit of the cost does
1637   /// not matter because we use the 'cost' units to compare different
1638   /// vector widths. The cost that is returned is *not* normalized by
1639   /// the factor width. If \p Invalid is not nullptr, this function
1640   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1641   /// each instruction that has an Invalid cost for the given VF.
1642   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1643   VectorizationCostTy
1644   expectedCost(ElementCount VF,
1645                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1646 
1647   /// Returns the execution time cost of an instruction for a given vector
1648   /// width. Vector width of one means scalar.
1649   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1650 
1651   /// The cost-computation logic from getInstructionCost which provides
1652   /// the vector type as an output parameter.
1653   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1654                                      Type *&VectorTy);
1655 
1656   /// Return the cost of instructions in an inloop reduction pattern, if I is
1657   /// part of that pattern.
1658   std::optional<InstructionCost>
1659   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1660                           TTI::TargetCostKind CostKind);
1661 
1662   /// Calculate vectorization cost of memory instruction \p I.
1663   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1664 
1665   /// The cost computation for scalarized memory instruction.
1666   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1667 
1668   /// The cost computation for interleaving group of memory instructions.
1669   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1670 
1671   /// The cost computation for Gather/Scatter instruction.
1672   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1673 
1674   /// The cost computation for widening instruction \p I with consecutive
1675   /// memory access.
1676   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1677 
1678   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1679   /// Load: scalar load + broadcast.
1680   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1681   /// element)
1682   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1683 
1684   /// Estimate the overhead of scalarizing an instruction. This is a
1685   /// convenience wrapper for the type-based getScalarizationOverhead API.
1686   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1687                                            TTI::TargetCostKind CostKind) const;
1688 
1689   /// Returns true if an artificially high cost for emulated masked memrefs
1690   /// should be used.
1691   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1692 
1693   /// Map of scalar integer values to the smallest bitwidth they can be legally
1694   /// represented as. The vector equivalents of these values should be truncated
1695   /// to this type.
1696   MapVector<Instruction *, uint64_t> MinBWs;
1697 
1698   /// A type representing the costs for instructions if they were to be
1699   /// scalarized rather than vectorized. The entries are Instruction-Cost
1700   /// pairs.
1701   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1702 
1703   /// A set containing all BasicBlocks that are known to present after
1704   /// vectorization as a predicated block.
1705   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1706       PredicatedBBsAfterVectorization;
1707 
1708   /// Records whether it is allowed to have the original scalar loop execute at
1709   /// least once. This may be needed as a fallback loop in case runtime
1710   /// aliasing/dependence checks fail, or to handle the tail/remainder
1711   /// iterations when the trip count is unknown or doesn't divide by the VF,
1712   /// or as a peel-loop to handle gaps in interleave-groups.
1713   /// Under optsize and when the trip count is very small we don't allow any
1714   /// iterations to execute in the scalar loop.
1715   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1716 
1717   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1718   bool FoldTailByMasking = false;
1719 
1720   /// A map holding scalar costs for different vectorization factors. The
1721   /// presence of a cost for an instruction in the mapping indicates that the
1722   /// instruction will be scalarized when vectorizing with the associated
1723   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1724   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1725 
1726   /// Holds the instructions known to be uniform after vectorization.
1727   /// The data is collected per VF.
1728   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1729 
1730   /// Holds the instructions known to be scalar after vectorization.
1731   /// The data is collected per VF.
1732   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1733 
1734   /// Holds the instructions (address computations) that are forced to be
1735   /// scalarized.
1736   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1737 
1738   /// PHINodes of the reductions that should be expanded in-loop along with
1739   /// their associated chains of reduction operations, in program order from top
1740   /// (PHI) to bottom
1741   ReductionChainMap InLoopReductionChains;
1742 
1743   /// A Map of inloop reduction operations and their immediate chain operand.
1744   /// FIXME: This can be removed once reductions can be costed correctly in
1745   /// vplan. This was added to allow quick lookup to the inloop operations,
1746   /// without having to loop through InLoopReductionChains.
1747   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1748 
1749   /// Returns the expected difference in cost from scalarizing the expression
1750   /// feeding a predicated instruction \p PredInst. The instructions to
1751   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1752   /// non-negative return value implies the expression will be scalarized.
1753   /// Currently, only single-use chains are considered for scalarization.
1754   InstructionCost computePredInstDiscount(Instruction *PredInst,
1755                                           ScalarCostsTy &ScalarCosts,
1756                                           ElementCount VF);
1757 
1758   /// Collect the instructions that are uniform after vectorization. An
1759   /// instruction is uniform if we represent it with a single scalar value in
1760   /// the vectorized loop corresponding to each vector iteration. Examples of
1761   /// uniform instructions include pointer operands of consecutive or
1762   /// interleaved memory accesses. Note that although uniformity implies an
1763   /// instruction will be scalar, the reverse is not true. In general, a
1764   /// scalarized instruction will be represented by VF scalar values in the
1765   /// vectorized loop, each corresponding to an iteration of the original
1766   /// scalar loop.
1767   void collectLoopUniforms(ElementCount VF);
1768 
1769   /// Collect the instructions that are scalar after vectorization. An
1770   /// instruction is scalar if it is known to be uniform or will be scalarized
1771   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1772   /// to the list if they are used by a load/store instruction that is marked as
1773   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1774   /// VF values in the vectorized loop, each corresponding to an iteration of
1775   /// the original scalar loop.
1776   void collectLoopScalars(ElementCount VF);
1777 
1778   /// Keeps cost model vectorization decision and cost for instructions.
1779   /// Right now it is used for memory instructions only.
1780   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1781                                 std::pair<InstWidening, InstructionCost>>;
1782 
1783   DecisionList WideningDecisions;
1784 
1785   /// Returns true if \p V is expected to be vectorized and it needs to be
1786   /// extracted.
1787   bool needsExtract(Value *V, ElementCount VF) const {
1788     Instruction *I = dyn_cast<Instruction>(V);
1789     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1790         TheLoop->isLoopInvariant(I))
1791       return false;
1792 
1793     // Assume we can vectorize V (and hence we need extraction) if the
1794     // scalars are not computed yet. This can happen, because it is called
1795     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1796     // the scalars are collected. That should be a safe assumption in most
1797     // cases, because we check if the operands have vectorizable types
1798     // beforehand in LoopVectorizationLegality.
1799     return Scalars.find(VF) == Scalars.end() ||
1800            !isScalarAfterVectorization(I, VF);
1801   };
1802 
1803   /// Returns a range containing only operands needing to be extracted.
1804   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1805                                                    ElementCount VF) const {
1806     return SmallVector<Value *, 4>(make_filter_range(
1807         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1808   }
1809 
1810   /// Determines if we have the infrastructure to vectorize loop \p L and its
1811   /// epilogue, assuming the main loop is vectorized by \p VF.
1812   bool isCandidateForEpilogueVectorization(const Loop &L,
1813                                            const ElementCount VF) const;
1814 
1815   /// Returns true if epilogue vectorization is considered profitable, and
1816   /// false otherwise.
1817   /// \p VF is the vectorization factor chosen for the original loop.
1818   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1819 
1820 public:
1821   /// The loop that we evaluate.
1822   Loop *TheLoop;
1823 
1824   /// Predicated scalar evolution analysis.
1825   PredicatedScalarEvolution &PSE;
1826 
1827   /// Loop Info analysis.
1828   LoopInfo *LI;
1829 
1830   /// Vectorization legality.
1831   LoopVectorizationLegality *Legal;
1832 
1833   /// Vector target information.
1834   const TargetTransformInfo &TTI;
1835 
1836   /// Target Library Info.
1837   const TargetLibraryInfo *TLI;
1838 
1839   /// Demanded bits analysis.
1840   DemandedBits *DB;
1841 
1842   /// Assumption cache.
1843   AssumptionCache *AC;
1844 
1845   /// Interface to emit optimization remarks.
1846   OptimizationRemarkEmitter *ORE;
1847 
1848   const Function *TheFunction;
1849 
1850   /// Loop Vectorize Hint.
1851   const LoopVectorizeHints *Hints;
1852 
1853   /// The interleave access information contains groups of interleaved accesses
1854   /// with the same stride and close to each other.
1855   InterleavedAccessInfo &InterleaveInfo;
1856 
1857   /// Values to ignore in the cost model.
1858   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1859 
1860   /// Values to ignore in the cost model when VF > 1.
1861   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1862 
1863   /// All element types found in the loop.
1864   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1865 
1866   /// Profitable vector factors.
1867   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1868 };
1869 } // end namespace llvm
1870 
1871 namespace {
1872 /// Helper struct to manage generating runtime checks for vectorization.
1873 ///
1874 /// The runtime checks are created up-front in temporary blocks to allow better
1875 /// estimating the cost and un-linked from the existing IR. After deciding to
1876 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1877 /// temporary blocks are completely removed.
1878 class GeneratedRTChecks {
1879   /// Basic block which contains the generated SCEV checks, if any.
1880   BasicBlock *SCEVCheckBlock = nullptr;
1881 
1882   /// The value representing the result of the generated SCEV checks. If it is
1883   /// nullptr, either no SCEV checks have been generated or they have been used.
1884   Value *SCEVCheckCond = nullptr;
1885 
1886   /// Basic block which contains the generated memory runtime checks, if any.
1887   BasicBlock *MemCheckBlock = nullptr;
1888 
1889   /// The value representing the result of the generated memory runtime checks.
1890   /// If it is nullptr, either no memory runtime checks have been generated or
1891   /// they have been used.
1892   Value *MemRuntimeCheckCond = nullptr;
1893 
1894   DominatorTree *DT;
1895   LoopInfo *LI;
1896   TargetTransformInfo *TTI;
1897 
1898   SCEVExpander SCEVExp;
1899   SCEVExpander MemCheckExp;
1900 
1901   bool CostTooHigh = false;
1902 
1903 public:
1904   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1905                     TargetTransformInfo *TTI, const DataLayout &DL)
1906       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1907         MemCheckExp(SE, DL, "scev.check") {}
1908 
1909   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1910   /// accurately estimate the cost of the runtime checks. The blocks are
1911   /// un-linked from the IR and is added back during vector code generation. If
1912   /// there is no vector code generation, the check blocks are removed
1913   /// completely.
1914   void Create(Loop *L, const LoopAccessInfo &LAI,
1915               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1916 
1917     // Hard cutoff to limit compile-time increase in case a very large number of
1918     // runtime checks needs to be generated.
1919     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1920     // profile info.
1921     CostTooHigh =
1922         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1923     if (CostTooHigh)
1924       return;
1925 
1926     BasicBlock *LoopHeader = L->getHeader();
1927     BasicBlock *Preheader = L->getLoopPreheader();
1928 
1929     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1930     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1931     // may be used by SCEVExpander. The blocks will be un-linked from their
1932     // predecessors and removed from LI & DT at the end of the function.
1933     if (!UnionPred.isAlwaysTrue()) {
1934       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1935                                   nullptr, "vector.scevcheck");
1936 
1937       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1938           &UnionPred, SCEVCheckBlock->getTerminator());
1939     }
1940 
1941     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1942     if (RtPtrChecking.Need) {
1943       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1944       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1945                                  "vector.memcheck");
1946 
1947       auto DiffChecks = RtPtrChecking.getDiffChecks();
1948       if (DiffChecks) {
1949         Value *RuntimeVF = nullptr;
1950         MemRuntimeCheckCond = addDiffRuntimeChecks(
1951             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1952             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1953               if (!RuntimeVF)
1954                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1955               return RuntimeVF;
1956             },
1957             IC);
1958       } else {
1959         MemRuntimeCheckCond =
1960             addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1961                              RtPtrChecking.getChecks(), MemCheckExp);
1962       }
1963       assert(MemRuntimeCheckCond &&
1964              "no RT checks generated although RtPtrChecking "
1965              "claimed checks are required");
1966     }
1967 
1968     if (!MemCheckBlock && !SCEVCheckBlock)
1969       return;
1970 
1971     // Unhook the temporary block with the checks, update various places
1972     // accordingly.
1973     if (SCEVCheckBlock)
1974       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1975     if (MemCheckBlock)
1976       MemCheckBlock->replaceAllUsesWith(Preheader);
1977 
1978     if (SCEVCheckBlock) {
1979       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1980       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1981       Preheader->getTerminator()->eraseFromParent();
1982     }
1983     if (MemCheckBlock) {
1984       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1985       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1986       Preheader->getTerminator()->eraseFromParent();
1987     }
1988 
1989     DT->changeImmediateDominator(LoopHeader, Preheader);
1990     if (MemCheckBlock) {
1991       DT->eraseNode(MemCheckBlock);
1992       LI->removeBlock(MemCheckBlock);
1993     }
1994     if (SCEVCheckBlock) {
1995       DT->eraseNode(SCEVCheckBlock);
1996       LI->removeBlock(SCEVCheckBlock);
1997     }
1998   }
1999 
2000   InstructionCost getCost() {
2001     if (SCEVCheckBlock || MemCheckBlock)
2002       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2003 
2004     if (CostTooHigh) {
2005       InstructionCost Cost;
2006       Cost.setInvalid();
2007       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
2008       return Cost;
2009     }
2010 
2011     InstructionCost RTCheckCost = 0;
2012     if (SCEVCheckBlock)
2013       for (Instruction &I : *SCEVCheckBlock) {
2014         if (SCEVCheckBlock->getTerminator() == &I)
2015           continue;
2016         InstructionCost C =
2017             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2018         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2019         RTCheckCost += C;
2020       }
2021     if (MemCheckBlock)
2022       for (Instruction &I : *MemCheckBlock) {
2023         if (MemCheckBlock->getTerminator() == &I)
2024           continue;
2025         InstructionCost C =
2026             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2027         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2028         RTCheckCost += C;
2029       }
2030 
2031     if (SCEVCheckBlock || MemCheckBlock)
2032       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2033                         << "\n");
2034 
2035     return RTCheckCost;
2036   }
2037 
2038   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2039   /// unused.
2040   ~GeneratedRTChecks() {
2041     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2042     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2043     if (!SCEVCheckCond)
2044       SCEVCleaner.markResultUsed();
2045 
2046     if (!MemRuntimeCheckCond)
2047       MemCheckCleaner.markResultUsed();
2048 
2049     if (MemRuntimeCheckCond) {
2050       auto &SE = *MemCheckExp.getSE();
2051       // Memory runtime check generation creates compares that use expanded
2052       // values. Remove them before running the SCEVExpanderCleaners.
2053       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2054         if (MemCheckExp.isInsertedInstruction(&I))
2055           continue;
2056         SE.forgetValue(&I);
2057         I.eraseFromParent();
2058       }
2059     }
2060     MemCheckCleaner.cleanup();
2061     SCEVCleaner.cleanup();
2062 
2063     if (SCEVCheckCond)
2064       SCEVCheckBlock->eraseFromParent();
2065     if (MemRuntimeCheckCond)
2066       MemCheckBlock->eraseFromParent();
2067   }
2068 
2069   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2070   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2071   /// depending on the generated condition.
2072   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2073                              BasicBlock *LoopVectorPreHeader,
2074                              BasicBlock *LoopExitBlock) {
2075     if (!SCEVCheckCond)
2076       return nullptr;
2077 
2078     Value *Cond = SCEVCheckCond;
2079     // Mark the check as used, to prevent it from being removed during cleanup.
2080     SCEVCheckCond = nullptr;
2081     if (auto *C = dyn_cast<ConstantInt>(Cond))
2082       if (C->isZero())
2083         return nullptr;
2084 
2085     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2086 
2087     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2088     // Create new preheader for vector loop.
2089     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2090       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2091 
2092     SCEVCheckBlock->getTerminator()->eraseFromParent();
2093     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2094     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2095                                                 SCEVCheckBlock);
2096 
2097     DT->addNewBlock(SCEVCheckBlock, Pred);
2098     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2099 
2100     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2101                         BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2102     return SCEVCheckBlock;
2103   }
2104 
2105   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2106   /// the branches to branch to the vector preheader or \p Bypass, depending on
2107   /// the generated condition.
2108   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2109                                    BasicBlock *LoopVectorPreHeader) {
2110     // Check if we generated code that checks in runtime if arrays overlap.
2111     if (!MemRuntimeCheckCond)
2112       return nullptr;
2113 
2114     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2115     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2116                                                 MemCheckBlock);
2117 
2118     DT->addNewBlock(MemCheckBlock, Pred);
2119     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2120     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2121 
2122     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2123       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2124 
2125     ReplaceInstWithInst(
2126         MemCheckBlock->getTerminator(),
2127         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2128     MemCheckBlock->getTerminator()->setDebugLoc(
2129         Pred->getTerminator()->getDebugLoc());
2130 
2131     // Mark the check as used, to prevent it from being removed during cleanup.
2132     MemRuntimeCheckCond = nullptr;
2133     return MemCheckBlock;
2134   }
2135 };
2136 } // namespace
2137 
2138 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2139 // vectorization. The loop needs to be annotated with #pragma omp simd
2140 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2141 // vector length information is not provided, vectorization is not considered
2142 // explicit. Interleave hints are not allowed either. These limitations will be
2143 // relaxed in the future.
2144 // Please, note that we are currently forced to abuse the pragma 'clang
2145 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2146 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2147 // provides *explicit vectorization hints* (LV can bypass legal checks and
2148 // assume that vectorization is legal). However, both hints are implemented
2149 // using the same metadata (llvm.loop.vectorize, processed by
2150 // LoopVectorizeHints). This will be fixed in the future when the native IR
2151 // representation for pragma 'omp simd' is introduced.
2152 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2153                                    OptimizationRemarkEmitter *ORE) {
2154   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2155   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2156 
2157   // Only outer loops with an explicit vectorization hint are supported.
2158   // Unannotated outer loops are ignored.
2159   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2160     return false;
2161 
2162   Function *Fn = OuterLp->getHeader()->getParent();
2163   if (!Hints.allowVectorization(Fn, OuterLp,
2164                                 true /*VectorizeOnlyWhenForced*/)) {
2165     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2166     return false;
2167   }
2168 
2169   if (Hints.getInterleave() > 1) {
2170     // TODO: Interleave support is future work.
2171     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2172                          "outer loops.\n");
2173     Hints.emitRemarkWithHints();
2174     return false;
2175   }
2176 
2177   return true;
2178 }
2179 
2180 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2181                                   OptimizationRemarkEmitter *ORE,
2182                                   SmallVectorImpl<Loop *> &V) {
2183   // Collect inner loops and outer loops without irreducible control flow. For
2184   // now, only collect outer loops that have explicit vectorization hints. If we
2185   // are stress testing the VPlan H-CFG construction, we collect the outermost
2186   // loop of every loop nest.
2187   if (L.isInnermost() || VPlanBuildStressTest ||
2188       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2189     LoopBlocksRPO RPOT(&L);
2190     RPOT.perform(LI);
2191     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2192       V.push_back(&L);
2193       // TODO: Collect inner loops inside marked outer loops in case
2194       // vectorization fails for the outer loop. Do not invoke
2195       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2196       // already known to be reducible. We can use an inherited attribute for
2197       // that.
2198       return;
2199     }
2200   }
2201   for (Loop *InnerL : L)
2202     collectSupportedLoops(*InnerL, LI, ORE, V);
2203 }
2204 
2205 namespace {
2206 
2207 /// The LoopVectorize Pass.
2208 struct LoopVectorize : public FunctionPass {
2209   /// Pass identification, replacement for typeid
2210   static char ID;
2211 
2212   LoopVectorizePass Impl;
2213 
2214   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2215                          bool VectorizeOnlyWhenForced = false)
2216       : FunctionPass(ID),
2217         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2218     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2219   }
2220 
2221   bool runOnFunction(Function &F) override {
2222     if (skipFunction(F))
2223       return false;
2224 
2225     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2226     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2227     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2228     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2229     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2230     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2231     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2232     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2233     auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
2234     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2235     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2236     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2237 
2238     return Impl
2239         .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
2240         .MadeAnyChange;
2241   }
2242 
2243   void getAnalysisUsage(AnalysisUsage &AU) const override {
2244     AU.addRequired<AssumptionCacheTracker>();
2245     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2246     AU.addRequired<DominatorTreeWrapperPass>();
2247     AU.addRequired<LoopInfoWrapperPass>();
2248     AU.addRequired<ScalarEvolutionWrapperPass>();
2249     AU.addRequired<TargetTransformInfoWrapperPass>();
2250     AU.addRequired<LoopAccessLegacyAnalysis>();
2251     AU.addRequired<DemandedBitsWrapperPass>();
2252     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2253     AU.addRequired<InjectTLIMappingsLegacy>();
2254 
2255     // We currently do not preserve loopinfo/dominator analyses with outer loop
2256     // vectorization. Until this is addressed, mark these analyses as preserved
2257     // only for non-VPlan-native path.
2258     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2259     if (!EnableVPlanNativePath) {
2260       AU.addPreserved<LoopInfoWrapperPass>();
2261       AU.addPreserved<DominatorTreeWrapperPass>();
2262     }
2263 
2264     AU.addPreserved<BasicAAWrapperPass>();
2265     AU.addPreserved<GlobalsAAWrapperPass>();
2266     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2267   }
2268 };
2269 
2270 } // end anonymous namespace
2271 
2272 //===----------------------------------------------------------------------===//
2273 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2274 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2275 //===----------------------------------------------------------------------===//
2276 
2277 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2278   // We need to place the broadcast of invariant variables outside the loop,
2279   // but only if it's proven safe to do so. Else, broadcast will be inside
2280   // vector loop body.
2281   Instruction *Instr = dyn_cast<Instruction>(V);
2282   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2283                      (!Instr ||
2284                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2285   // Place the code for broadcasting invariant variables in the new preheader.
2286   IRBuilder<>::InsertPointGuard Guard(Builder);
2287   if (SafeToHoist)
2288     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2289 
2290   // Broadcast the scalar into all locations in the vector.
2291   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2292 
2293   return Shuf;
2294 }
2295 
2296 /// This function adds
2297 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2298 /// to each vector element of Val. The sequence starts at StartIndex.
2299 /// \p Opcode is relevant for FP induction variable.
2300 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2301                             Instruction::BinaryOps BinOp, ElementCount VF,
2302                             IRBuilderBase &Builder) {
2303   assert(VF.isVector() && "only vector VFs are supported");
2304 
2305   // Create and check the types.
2306   auto *ValVTy = cast<VectorType>(Val->getType());
2307   ElementCount VLen = ValVTy->getElementCount();
2308 
2309   Type *STy = Val->getType()->getScalarType();
2310   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2311          "Induction Step must be an integer or FP");
2312   assert(Step->getType() == STy && "Step has wrong type");
2313 
2314   SmallVector<Constant *, 8> Indices;
2315 
2316   // Create a vector of consecutive numbers from zero to VF.
2317   VectorType *InitVecValVTy = ValVTy;
2318   if (STy->isFloatingPointTy()) {
2319     Type *InitVecValSTy =
2320         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2321     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2322   }
2323   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2324 
2325   // Splat the StartIdx
2326   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2327 
2328   if (STy->isIntegerTy()) {
2329     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2330     Step = Builder.CreateVectorSplat(VLen, Step);
2331     assert(Step->getType() == Val->getType() && "Invalid step vec");
2332     // FIXME: The newly created binary instructions should contain nsw/nuw
2333     // flags, which can be found from the original scalar operations.
2334     Step = Builder.CreateMul(InitVec, Step);
2335     return Builder.CreateAdd(Val, Step, "induction");
2336   }
2337 
2338   // Floating point induction.
2339   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2340          "Binary Opcode should be specified for FP induction");
2341   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2342   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2343 
2344   Step = Builder.CreateVectorSplat(VLen, Step);
2345   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2346   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2347 }
2348 
2349 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2350 /// variable on which to base the steps, \p Step is the size of the step.
2351 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2352                              const InductionDescriptor &ID, VPValue *Def,
2353                              VPTransformState &State) {
2354   IRBuilderBase &Builder = State.Builder;
2355 
2356   // Ensure step has the same type as that of scalar IV.
2357   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2358   if (ScalarIVTy != Step->getType()) {
2359     // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
2360     // avoid separate truncate here.
2361     assert(Step->getType()->isIntegerTy() &&
2362            "Truncation requires an integer step");
2363     Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
2364   }
2365 
2366   // We build scalar steps for both integer and floating-point induction
2367   // variables. Here, we determine the kind of arithmetic we will perform.
2368   Instruction::BinaryOps AddOp;
2369   Instruction::BinaryOps MulOp;
2370   if (ScalarIVTy->isIntegerTy()) {
2371     AddOp = Instruction::Add;
2372     MulOp = Instruction::Mul;
2373   } else {
2374     AddOp = ID.getInductionOpcode();
2375     MulOp = Instruction::FMul;
2376   }
2377 
2378   // Determine the number of scalars we need to generate for each unroll
2379   // iteration.
2380   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2381   // Compute the scalar steps and save the results in State.
2382   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2383                                      ScalarIVTy->getScalarSizeInBits());
2384   Type *VecIVTy = nullptr;
2385   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2386   if (!FirstLaneOnly && State.VF.isScalable()) {
2387     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2388     UnitStepVec =
2389         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2390     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2391     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2392   }
2393 
2394   unsigned StartPart = 0;
2395   unsigned EndPart = State.UF;
2396   unsigned StartLane = 0;
2397   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2398   if (State.Instance) {
2399     StartPart = State.Instance->Part;
2400     EndPart = StartPart + 1;
2401     StartLane = State.Instance->Lane.getKnownLane();
2402     EndLane = StartLane + 1;
2403   }
2404   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
2405     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2406 
2407     if (!FirstLaneOnly && State.VF.isScalable()) {
2408       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2409       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2410       if (ScalarIVTy->isFloatingPointTy())
2411         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2412       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2413       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2414       State.set(Def, Add, Part);
2415       // It's useful to record the lane values too for the known minimum number
2416       // of elements so we do those below. This improves the code quality when
2417       // trying to extract the first element, for example.
2418     }
2419 
2420     if (ScalarIVTy->isFloatingPointTy())
2421       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2422 
2423     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2424       Value *StartIdx = Builder.CreateBinOp(
2425           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2426       // The step returned by `createStepForVF` is a runtime-evaluated value
2427       // when VF is scalable. Otherwise, it should be folded into a Constant.
2428       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2429              "Expected StartIdx to be folded to a constant when VF is not "
2430              "scalable");
2431       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2432       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2433       State.set(Def, Add, VPIteration(Part, Lane));
2434     }
2435   }
2436 }
2437 
2438 // Generate code for the induction step. Note that induction steps are
2439 // required to be loop-invariant
2440 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2441                               Instruction *InsertBefore,
2442                               Loop *OrigLoop = nullptr) {
2443   const DataLayout &DL = SE.getDataLayout();
2444   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2445          "Induction step should be loop invariant");
2446   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2447     return E->getValue();
2448 
2449   SCEVExpander Exp(SE, DL, "induction");
2450   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2451 }
2452 
2453 /// Compute the transformed value of Index at offset StartValue using step
2454 /// StepValue.
2455 /// For integer induction, returns StartValue + Index * StepValue.
2456 /// For pointer induction, returns StartValue[Index * StepValue].
2457 /// FIXME: The newly created binary instructions should contain nsw/nuw
2458 /// flags, which can be found from the original scalar operations.
2459 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2460                                    Value *StartValue, Value *Step,
2461                                    const InductionDescriptor &ID) {
2462   Type *StepTy = Step->getType();
2463   Value *CastedIndex = StepTy->isIntegerTy()
2464                            ? B.CreateSExtOrTrunc(Index, StepTy)
2465                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2466   if (CastedIndex != Index) {
2467     CastedIndex->setName(CastedIndex->getName() + ".cast");
2468     Index = CastedIndex;
2469   }
2470 
2471   // Note: the IR at this point is broken. We cannot use SE to create any new
2472   // SCEV and then expand it, hoping that SCEV's simplification will give us
2473   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2474   // lead to various SCEV crashes. So all we can do is to use builder and rely
2475   // on InstCombine for future simplifications. Here we handle some trivial
2476   // cases only.
2477   auto CreateAdd = [&B](Value *X, Value *Y) {
2478     assert(X->getType() == Y->getType() && "Types don't match!");
2479     if (auto *CX = dyn_cast<ConstantInt>(X))
2480       if (CX->isZero())
2481         return Y;
2482     if (auto *CY = dyn_cast<ConstantInt>(Y))
2483       if (CY->isZero())
2484         return X;
2485     return B.CreateAdd(X, Y);
2486   };
2487 
2488   // We allow X to be a vector type, in which case Y will potentially be
2489   // splatted into a vector with the same element count.
2490   auto CreateMul = [&B](Value *X, Value *Y) {
2491     assert(X->getType()->getScalarType() == Y->getType() &&
2492            "Types don't match!");
2493     if (auto *CX = dyn_cast<ConstantInt>(X))
2494       if (CX->isOne())
2495         return Y;
2496     if (auto *CY = dyn_cast<ConstantInt>(Y))
2497       if (CY->isOne())
2498         return X;
2499     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2500     if (XVTy && !isa<VectorType>(Y->getType()))
2501       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2502     return B.CreateMul(X, Y);
2503   };
2504 
2505   switch (ID.getKind()) {
2506   case InductionDescriptor::IK_IntInduction: {
2507     assert(!isa<VectorType>(Index->getType()) &&
2508            "Vector indices not supported for integer inductions yet");
2509     assert(Index->getType() == StartValue->getType() &&
2510            "Index type does not match StartValue type");
2511     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2512       return B.CreateSub(StartValue, Index);
2513     auto *Offset = CreateMul(Index, Step);
2514     return CreateAdd(StartValue, Offset);
2515   }
2516   case InductionDescriptor::IK_PtrInduction: {
2517     assert(isa<Constant>(Step) &&
2518            "Expected constant step for pointer induction");
2519     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2520   }
2521   case InductionDescriptor::IK_FpInduction: {
2522     assert(!isa<VectorType>(Index->getType()) &&
2523            "Vector indices not supported for FP inductions yet");
2524     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2525     auto InductionBinOp = ID.getInductionBinOp();
2526     assert(InductionBinOp &&
2527            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2528             InductionBinOp->getOpcode() == Instruction::FSub) &&
2529            "Original bin op should be defined for FP induction");
2530 
2531     Value *MulExp = B.CreateFMul(Step, Index);
2532     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2533                          "induction");
2534   }
2535   case InductionDescriptor::IK_NoInduction:
2536     return nullptr;
2537   }
2538   llvm_unreachable("invalid enum");
2539 }
2540 
2541 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2542                                                     const VPIteration &Instance,
2543                                                     VPTransformState &State) {
2544   Value *ScalarInst = State.get(Def, Instance);
2545   Value *VectorValue = State.get(Def, Instance.Part);
2546   VectorValue = Builder.CreateInsertElement(
2547       VectorValue, ScalarInst,
2548       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2549   State.set(Def, VectorValue, Instance.Part);
2550 }
2551 
2552 // Return whether we allow using masked interleave-groups (for dealing with
2553 // strided loads/stores that reside in predicated blocks, or for dealing
2554 // with gaps).
2555 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2556   // If an override option has been passed in for interleaved accesses, use it.
2557   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2558     return EnableMaskedInterleavedMemAccesses;
2559 
2560   return TTI.enableMaskedInterleavedAccessVectorization();
2561 }
2562 
2563 // Try to vectorize the interleave group that \p Instr belongs to.
2564 //
2565 // E.g. Translate following interleaved load group (factor = 3):
2566 //   for (i = 0; i < N; i+=3) {
2567 //     R = Pic[i];             // Member of index 0
2568 //     G = Pic[i+1];           // Member of index 1
2569 //     B = Pic[i+2];           // Member of index 2
2570 //     ... // do something to R, G, B
2571 //   }
2572 // To:
2573 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2574 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2575 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2576 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2577 //
2578 // Or translate following interleaved store group (factor = 3):
2579 //   for (i = 0; i < N; i+=3) {
2580 //     ... do something to R, G, B
2581 //     Pic[i]   = R;           // Member of index 0
2582 //     Pic[i+1] = G;           // Member of index 1
2583 //     Pic[i+2] = B;           // Member of index 2
2584 //   }
2585 // To:
2586 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2587 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2588 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2589 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2590 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2591 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2592     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2593     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2594     VPValue *BlockInMask) {
2595   Instruction *Instr = Group->getInsertPos();
2596   const DataLayout &DL = Instr->getModule()->getDataLayout();
2597 
2598   // Prepare for the vector type of the interleaved load/store.
2599   Type *ScalarTy = getLoadStoreType(Instr);
2600   unsigned InterleaveFactor = Group->getFactor();
2601   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2602   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2603 
2604   // Prepare for the new pointers.
2605   SmallVector<Value *, 2> AddrParts;
2606   unsigned Index = Group->getIndex(Instr);
2607 
2608   // TODO: extend the masked interleaved-group support to reversed access.
2609   assert((!BlockInMask || !Group->isReverse()) &&
2610          "Reversed masked interleave-group not supported.");
2611 
2612   // If the group is reverse, adjust the index to refer to the last vector lane
2613   // instead of the first. We adjust the index from the first vector lane,
2614   // rather than directly getting the pointer for lane VF - 1, because the
2615   // pointer operand of the interleaved access is supposed to be uniform. For
2616   // uniform instructions, we're only required to generate a value for the
2617   // first vector lane in each unroll iteration.
2618   if (Group->isReverse())
2619     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2620 
2621   for (unsigned Part = 0; Part < UF; Part++) {
2622     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2623     State.setDebugLocFromInst(AddrPart);
2624 
2625     // Notice current instruction could be any index. Need to adjust the address
2626     // to the member of index 0.
2627     //
2628     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2629     //       b = A[i];       // Member of index 0
2630     // Current pointer is pointed to A[i+1], adjust it to A[i].
2631     //
2632     // E.g.  A[i+1] = a;     // Member of index 1
2633     //       A[i]   = b;     // Member of index 0
2634     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2635     // Current pointer is pointed to A[i+2], adjust it to A[i].
2636 
2637     bool InBounds = false;
2638     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2639       InBounds = gep->isInBounds();
2640     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2641     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2642 
2643     // Cast to the vector pointer type.
2644     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2645     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2646     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2647   }
2648 
2649   State.setDebugLocFromInst(Instr);
2650   Value *PoisonVec = PoisonValue::get(VecTy);
2651 
2652   Value *MaskForGaps = nullptr;
2653   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2654     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2655     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2656   }
2657 
2658   // Vectorize the interleaved load group.
2659   if (isa<LoadInst>(Instr)) {
2660     // For each unroll part, create a wide load for the group.
2661     SmallVector<Value *, 2> NewLoads;
2662     for (unsigned Part = 0; Part < UF; Part++) {
2663       Instruction *NewLoad;
2664       if (BlockInMask || MaskForGaps) {
2665         assert(useMaskedInterleavedAccesses(*TTI) &&
2666                "masked interleaved groups are not allowed.");
2667         Value *GroupMask = MaskForGaps;
2668         if (BlockInMask) {
2669           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2670           Value *ShuffledMask = Builder.CreateShuffleVector(
2671               BlockInMaskPart,
2672               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2673               "interleaved.mask");
2674           GroupMask = MaskForGaps
2675                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2676                                                 MaskForGaps)
2677                           : ShuffledMask;
2678         }
2679         NewLoad =
2680             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2681                                      GroupMask, PoisonVec, "wide.masked.vec");
2682       }
2683       else
2684         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2685                                             Group->getAlign(), "wide.vec");
2686       Group->addMetadata(NewLoad);
2687       NewLoads.push_back(NewLoad);
2688     }
2689 
2690     // For each member in the group, shuffle out the appropriate data from the
2691     // wide loads.
2692     unsigned J = 0;
2693     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2694       Instruction *Member = Group->getMember(I);
2695 
2696       // Skip the gaps in the group.
2697       if (!Member)
2698         continue;
2699 
2700       auto StrideMask =
2701           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2702       for (unsigned Part = 0; Part < UF; Part++) {
2703         Value *StridedVec = Builder.CreateShuffleVector(
2704             NewLoads[Part], StrideMask, "strided.vec");
2705 
2706         // If this member has different type, cast the result type.
2707         if (Member->getType() != ScalarTy) {
2708           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2709           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2710           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2711         }
2712 
2713         if (Group->isReverse())
2714           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2715 
2716         State.set(VPDefs[J], StridedVec, Part);
2717       }
2718       ++J;
2719     }
2720     return;
2721   }
2722 
2723   // The sub vector type for current instruction.
2724   auto *SubVT = VectorType::get(ScalarTy, VF);
2725 
2726   // Vectorize the interleaved store group.
2727   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2728   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2729          "masked interleaved groups are not allowed.");
2730   assert((!MaskForGaps || !VF.isScalable()) &&
2731          "masking gaps for scalable vectors is not yet supported.");
2732   for (unsigned Part = 0; Part < UF; Part++) {
2733     // Collect the stored vector from each member.
2734     SmallVector<Value *, 4> StoredVecs;
2735     unsigned StoredIdx = 0;
2736     for (unsigned i = 0; i < InterleaveFactor; i++) {
2737       assert((Group->getMember(i) || MaskForGaps) &&
2738              "Fail to get a member from an interleaved store group");
2739       Instruction *Member = Group->getMember(i);
2740 
2741       // Skip the gaps in the group.
2742       if (!Member) {
2743         Value *Undef = PoisonValue::get(SubVT);
2744         StoredVecs.push_back(Undef);
2745         continue;
2746       }
2747 
2748       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2749       ++StoredIdx;
2750 
2751       if (Group->isReverse())
2752         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2753 
2754       // If this member has different type, cast it to a unified type.
2755 
2756       if (StoredVec->getType() != SubVT)
2757         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2758 
2759       StoredVecs.push_back(StoredVec);
2760     }
2761 
2762     // Concatenate all vectors into a wide vector.
2763     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2764 
2765     // Interleave the elements in the wide vector.
2766     Value *IVec = Builder.CreateShuffleVector(
2767         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2768         "interleaved.vec");
2769 
2770     Instruction *NewStoreInstr;
2771     if (BlockInMask || MaskForGaps) {
2772       Value *GroupMask = MaskForGaps;
2773       if (BlockInMask) {
2774         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2775         Value *ShuffledMask = Builder.CreateShuffleVector(
2776             BlockInMaskPart,
2777             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2778             "interleaved.mask");
2779         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2780                                                       ShuffledMask, MaskForGaps)
2781                                 : ShuffledMask;
2782       }
2783       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2784                                                 Group->getAlign(), GroupMask);
2785     } else
2786       NewStoreInstr =
2787           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2788 
2789     Group->addMetadata(NewStoreInstr);
2790   }
2791 }
2792 
2793 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2794                                                VPReplicateRecipe *RepRecipe,
2795                                                const VPIteration &Instance,
2796                                                bool IfPredicateInstr,
2797                                                VPTransformState &State) {
2798   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2799 
2800   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2801   // the first lane and part.
2802   if (isa<NoAliasScopeDeclInst>(Instr))
2803     if (!Instance.isFirstIteration())
2804       return;
2805 
2806   // Does this instruction return a value ?
2807   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2808 
2809   Instruction *Cloned = Instr->clone();
2810   if (!IsVoidRetTy)
2811     Cloned->setName(Instr->getName() + ".cloned");
2812 
2813   // If the scalarized instruction contributes to the address computation of a
2814   // widen masked load/store which was in a basic block that needed predication
2815   // and is not predicated after vectorization, we can't propagate
2816   // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2817   // instruction could feed a poison value to the base address of the widen
2818   // load/store.
2819   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2820     Cloned->dropPoisonGeneratingFlags();
2821 
2822   if (Instr->getDebugLoc())
2823     State.setDebugLocFromInst(Instr);
2824 
2825   // Replace the operands of the cloned instructions with their scalar
2826   // equivalents in the new loop.
2827   for (const auto &I : enumerate(RepRecipe->operands())) {
2828     auto InputInstance = Instance;
2829     VPValue *Operand = I.value();
2830     if (vputils::isUniformAfterVectorization(Operand))
2831       InputInstance.Lane = VPLane::getFirstLane();
2832     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2833   }
2834   State.addNewMetadata(Cloned, Instr);
2835 
2836   // Place the cloned scalar in the new loop.
2837   State.Builder.Insert(Cloned);
2838 
2839   State.set(RepRecipe, Cloned, Instance);
2840 
2841   // If we just cloned a new assumption, add it the assumption cache.
2842   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2843     AC->registerAssumption(II);
2844 
2845   // End if-block.
2846   if (IfPredicateInstr)
2847     PredicatedInstructions.push_back(Cloned);
2848 }
2849 
2850 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2851   if (TripCount)
2852     return TripCount;
2853 
2854   assert(InsertBlock);
2855   IRBuilder<> Builder(InsertBlock->getTerminator());
2856   // Find the loop boundaries.
2857   Type *IdxTy = Legal->getWidestInductionType();
2858   assert(IdxTy && "No type for induction");
2859   const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
2860 
2861   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2862 
2863   // Expand the trip count and place the new instructions in the preheader.
2864   // Notice that the pre-header does not change, only the loop body.
2865   SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2866 
2867   // Count holds the overall loop count (N).
2868   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2869                                 InsertBlock->getTerminator());
2870 
2871   if (TripCount->getType()->isPointerTy())
2872     TripCount =
2873         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2874                                     InsertBlock->getTerminator());
2875 
2876   return TripCount;
2877 }
2878 
2879 Value *
2880 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2881   if (VectorTripCount)
2882     return VectorTripCount;
2883 
2884   Value *TC = getOrCreateTripCount(InsertBlock);
2885   IRBuilder<> Builder(InsertBlock->getTerminator());
2886 
2887   Type *Ty = TC->getType();
2888   // This is where we can make the step a runtime constant.
2889   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2890 
2891   // If the tail is to be folded by masking, round the number of iterations N
2892   // up to a multiple of Step instead of rounding down. This is done by first
2893   // adding Step-1 and then rounding down. Note that it's ok if this addition
2894   // overflows: the vector induction variable will eventually wrap to zero given
2895   // that it starts at zero and its Step is a power of two; the loop will then
2896   // exit, with the last early-exit vector comparison also producing all-true.
2897   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2898   // is accounted for in emitIterationCountCheck that adds an overflow check.
2899   if (Cost->foldTailByMasking()) {
2900     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2901            "VF*UF must be a power of 2 when folding tail by masking");
2902     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2903     TC = Builder.CreateAdd(
2904         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2905   }
2906 
2907   // Now we need to generate the expression for the part of the loop that the
2908   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2909   // iterations are not required for correctness, or N - Step, otherwise. Step
2910   // is equal to the vectorization factor (number of SIMD elements) times the
2911   // unroll factor (number of SIMD instructions).
2912   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2913 
2914   // There are cases where we *must* run at least one iteration in the remainder
2915   // loop.  See the cost model for when this can happen.  If the step evenly
2916   // divides the trip count, we set the remainder to be equal to the step. If
2917   // the step does not evenly divide the trip count, no adjustment is necessary
2918   // since there will already be scalar iterations. Note that the minimum
2919   // iterations check ensures that N >= Step.
2920   if (Cost->requiresScalarEpilogue(VF)) {
2921     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2922     R = Builder.CreateSelect(IsZero, Step, R);
2923   }
2924 
2925   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2926 
2927   return VectorTripCount;
2928 }
2929 
2930 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2931                                                    const DataLayout &DL) {
2932   // Verify that V is a vector type with same number of elements as DstVTy.
2933   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2934   unsigned VF = DstFVTy->getNumElements();
2935   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2936   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2937   Type *SrcElemTy = SrcVecTy->getElementType();
2938   Type *DstElemTy = DstFVTy->getElementType();
2939   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2940          "Vector elements must have same size");
2941 
2942   // Do a direct cast if element types are castable.
2943   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2944     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2945   }
2946   // V cannot be directly casted to desired vector type.
2947   // May happen when V is a floating point vector but DstVTy is a vector of
2948   // pointers or vice-versa. Handle this using a two-step bitcast using an
2949   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2950   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2951          "Only one type should be a pointer type");
2952   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2953          "Only one type should be a floating point type");
2954   Type *IntTy =
2955       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2956   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2957   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2958   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2959 }
2960 
2961 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2962   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2963   // Reuse existing vector loop preheader for TC checks.
2964   // Note that new preheader block is generated for vector loop.
2965   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2966   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2967 
2968   // Generate code to check if the loop's trip count is less than VF * UF, or
2969   // equal to it in case a scalar epilogue is required; this implies that the
2970   // vector trip count is zero. This check also covers the case where adding one
2971   // to the backedge-taken count overflowed leading to an incorrect trip count
2972   // of zero. In this case we will also jump to the scalar loop.
2973   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2974                                             : ICmpInst::ICMP_ULT;
2975 
2976   // If tail is to be folded, vector loop takes care of all iterations.
2977   Type *CountTy = Count->getType();
2978   Value *CheckMinIters = Builder.getFalse();
2979   auto CreateStep = [&]() -> Value * {
2980     // Create step with max(MinProTripCount, UF * VF).
2981     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2982       return createStepForVF(Builder, CountTy, VF, UF);
2983 
2984     Value *MinProfTC =
2985         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2986     if (!VF.isScalable())
2987       return MinProfTC;
2988     return Builder.CreateBinaryIntrinsic(
2989         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2990   };
2991 
2992   if (!Cost->foldTailByMasking())
2993     CheckMinIters =
2994         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2995   else if (VF.isScalable()) {
2996     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2997     // an overflow to zero when updating induction variables and so an
2998     // additional overflow check is required before entering the vector loop.
2999 
3000     // Get the maximum unsigned value for the type.
3001     Value *MaxUIntTripCount =
3002         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
3003     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
3004 
3005     // Don't execute the vector loop if (UMax - n) < (VF * UF).
3006     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
3007   }
3008 
3009   // Create new preheader for vector loop.
3010   LoopVectorPreHeader =
3011       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3012                  "vector.ph");
3013 
3014   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3015                                DT->getNode(Bypass)->getIDom()) &&
3016          "TC check is expected to dominate Bypass");
3017 
3018   // Update dominator for Bypass & LoopExit (if needed).
3019   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3020   if (!Cost->requiresScalarEpilogue(VF))
3021     // If there is an epilogue which must run, there's no edge from the
3022     // middle block to exit blocks  and thus no need to update the immediate
3023     // dominator of the exit blocks.
3024     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3025 
3026   ReplaceInstWithInst(
3027       TCCheckBlock->getTerminator(),
3028       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3029   LoopBypassBlocks.push_back(TCCheckBlock);
3030 }
3031 
3032 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3033   BasicBlock *const SCEVCheckBlock =
3034       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3035   if (!SCEVCheckBlock)
3036     return nullptr;
3037 
3038   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3039            (OptForSizeBasedOnProfile &&
3040             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3041          "Cannot SCEV check stride or overflow when optimizing for size");
3042 
3043 
3044   // Update dominator only if this is first RT check.
3045   if (LoopBypassBlocks.empty()) {
3046     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3047     if (!Cost->requiresScalarEpilogue(VF))
3048       // If there is an epilogue which must run, there's no edge from the
3049       // middle block to exit blocks  and thus no need to update the immediate
3050       // dominator of the exit blocks.
3051       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3052   }
3053 
3054   LoopBypassBlocks.push_back(SCEVCheckBlock);
3055   AddedSafetyChecks = true;
3056   return SCEVCheckBlock;
3057 }
3058 
3059 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3060   // VPlan-native path does not do any analysis for runtime checks currently.
3061   if (EnableVPlanNativePath)
3062     return nullptr;
3063 
3064   BasicBlock *const MemCheckBlock =
3065       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3066 
3067   // Check if we generated code that checks in runtime if arrays overlap. We put
3068   // the checks into a separate block to make the more common case of few
3069   // elements faster.
3070   if (!MemCheckBlock)
3071     return nullptr;
3072 
3073   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3074     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3075            "Cannot emit memory checks when optimizing for size, unless forced "
3076            "to vectorize.");
3077     ORE->emit([&]() {
3078       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3079                                         OrigLoop->getStartLoc(),
3080                                         OrigLoop->getHeader())
3081              << "Code-size may be reduced by not forcing "
3082                 "vectorization, or by source-code modifications "
3083                 "eliminating the need for runtime checks "
3084                 "(e.g., adding 'restrict').";
3085     });
3086   }
3087 
3088   LoopBypassBlocks.push_back(MemCheckBlock);
3089 
3090   AddedSafetyChecks = true;
3091 
3092   return MemCheckBlock;
3093 }
3094 
3095 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3096   LoopScalarBody = OrigLoop->getHeader();
3097   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3098   assert(LoopVectorPreHeader && "Invalid loop structure");
3099   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3100   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3101          "multiple exit loop without required epilogue?");
3102 
3103   LoopMiddleBlock =
3104       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3105                  LI, nullptr, Twine(Prefix) + "middle.block");
3106   LoopScalarPreHeader =
3107       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3108                  nullptr, Twine(Prefix) + "scalar.ph");
3109 
3110   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3111 
3112   // Set up the middle block terminator.  Two cases:
3113   // 1) If we know that we must execute the scalar epilogue, emit an
3114   //    unconditional branch.
3115   // 2) Otherwise, we must have a single unique exit block (due to how we
3116   //    implement the multiple exit case).  In this case, set up a conditional
3117   //    branch from the middle block to the loop scalar preheader, and the
3118   //    exit block.  completeLoopSkeleton will update the condition to use an
3119   //    iteration check, if required to decide whether to execute the remainder.
3120   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3121     BranchInst::Create(LoopScalarPreHeader) :
3122     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3123                        Builder.getTrue());
3124   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3125   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3126 
3127   // Update dominator for loop exit. During skeleton creation, only the vector
3128   // pre-header and the middle block are created. The vector loop is entirely
3129   // created during VPlan exection.
3130   if (!Cost->requiresScalarEpilogue(VF))
3131     // If there is an epilogue which must run, there's no edge from the
3132     // middle block to exit blocks  and thus no need to update the immediate
3133     // dominator of the exit blocks.
3134     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3135 }
3136 
3137 PHINode *InnerLoopVectorizer::createInductionResumeValue(
3138     PHINode *OrigPhi, const InductionDescriptor &II,
3139     ArrayRef<BasicBlock *> BypassBlocks,
3140     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3141   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3142   assert(VectorTripCount && "Expected valid arguments");
3143 
3144   Instruction *OldInduction = Legal->getPrimaryInduction();
3145   Value *&EndValue = IVEndValues[OrigPhi];
3146   Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3147   if (OrigPhi == OldInduction) {
3148     // We know what the end value is.
3149     EndValue = VectorTripCount;
3150   } else {
3151     IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3152 
3153     // Fast-math-flags propagate from the original induction instruction.
3154     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3155       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3156 
3157     Value *Step =
3158         CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3159     EndValue =
3160         emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
3161     EndValue->setName("ind.end");
3162 
3163     // Compute the end value for the additional bypass (if applicable).
3164     if (AdditionalBypass.first) {
3165       B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3166       Value *Step =
3167           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3168       EndValueFromAdditionalBypass = emitTransformedIndex(
3169           B, AdditionalBypass.second, II.getStartValue(), Step, II);
3170       EndValueFromAdditionalBypass->setName("ind.end");
3171     }
3172   }
3173 
3174   // Create phi nodes to merge from the  backedge-taken check block.
3175   PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3176                                          LoopScalarPreHeader->getTerminator());
3177   // Copy original phi DL over to the new one.
3178   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3179 
3180   // The new PHI merges the original incoming value, in case of a bypass,
3181   // or the value at the end of the vectorized loop.
3182   BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3183 
3184   // Fix the scalar body counter (PHI node).
3185   // The old induction's phi node in the scalar body needs the truncated
3186   // value.
3187   for (BasicBlock *BB : BypassBlocks)
3188     BCResumeVal->addIncoming(II.getStartValue(), BB);
3189 
3190   if (AdditionalBypass.first)
3191     BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3192                                           EndValueFromAdditionalBypass);
3193   return BCResumeVal;
3194 }
3195 
3196 void InnerLoopVectorizer::createInductionResumeValues(
3197     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3198   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3199           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3200          "Inconsistent information about additional bypass.");
3201   // We are going to resume the execution of the scalar loop.
3202   // Go over all of the induction variables that we found and fix the
3203   // PHIs that are left in the scalar version of the loop.
3204   // The starting values of PHI nodes depend on the counter of the last
3205   // iteration in the vectorized loop.
3206   // If we come from a bypass edge then we need to start from the original
3207   // start value.
3208   for (const auto &InductionEntry : Legal->getInductionVars()) {
3209     PHINode *OrigPhi = InductionEntry.first;
3210     const InductionDescriptor &II = InductionEntry.second;
3211     PHINode *BCResumeVal = createInductionResumeValue(
3212         OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
3213     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3214   }
3215 }
3216 
3217 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3218   // The trip counts should be cached by now.
3219   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3220   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3221 
3222   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3223 
3224   // Add a check in the middle block to see if we have completed
3225   // all of the iterations in the first vector loop.  Three cases:
3226   // 1) If we require a scalar epilogue, there is no conditional branch as
3227   //    we unconditionally branch to the scalar preheader.  Do nothing.
3228   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3229   //    Thus if tail is to be folded, we know we don't need to run the
3230   //    remainder and we can use the previous value for the condition (true).
3231   // 3) Otherwise, construct a runtime check.
3232   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3233     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3234                                         Count, VectorTripCount, "cmp.n",
3235                                         LoopMiddleBlock->getTerminator());
3236 
3237     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3238     // of the corresponding compare because they may have ended up with
3239     // different line numbers and we want to avoid awkward line stepping while
3240     // debugging. Eg. if the compare has got a line number inside the loop.
3241     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3242     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3243   }
3244 
3245 #ifdef EXPENSIVE_CHECKS
3246   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3247 #endif
3248 
3249   return LoopVectorPreHeader;
3250 }
3251 
3252 std::pair<BasicBlock *, Value *>
3253 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3254   /*
3255    In this function we generate a new loop. The new loop will contain
3256    the vectorized instructions while the old loop will continue to run the
3257    scalar remainder.
3258 
3259        [ ] <-- loop iteration number check.
3260     /   |
3261    /    v
3262   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3263   |  /  |
3264   | /   v
3265   ||   [ ]     <-- vector pre header.
3266   |/    |
3267   |     v
3268   |    [  ] \
3269   |    [  ]_|   <-- vector loop (created during VPlan execution).
3270   |     |
3271   |     v
3272   \   -[ ]   <--- middle-block.
3273    \/   |
3274    /\   v
3275    | ->[ ]     <--- new preheader.
3276    |    |
3277  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3278    |   [ ] \
3279    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3280     \   |
3281      \  v
3282       >[ ]     <-- exit block(s).
3283    ...
3284    */
3285 
3286   // Create an empty vector loop, and prepare basic blocks for the runtime
3287   // checks.
3288   createVectorLoopSkeleton("");
3289 
3290   // Now, compare the new count to zero. If it is zero skip the vector loop and
3291   // jump to the scalar loop. This check also covers the case where the
3292   // backedge-taken count is uint##_max: adding one to it will overflow leading
3293   // to an incorrect trip count of zero. In this (rare) case we will also jump
3294   // to the scalar loop.
3295   emitIterationCountCheck(LoopScalarPreHeader);
3296 
3297   // Generate the code to check any assumptions that we've made for SCEV
3298   // expressions.
3299   emitSCEVChecks(LoopScalarPreHeader);
3300 
3301   // Generate the code that checks in runtime if arrays overlap. We put the
3302   // checks into a separate block to make the more common case of few elements
3303   // faster.
3304   emitMemRuntimeChecks(LoopScalarPreHeader);
3305 
3306   // Emit phis for the new starting index of the scalar loop.
3307   createInductionResumeValues();
3308 
3309   return {completeLoopSkeleton(), nullptr};
3310 }
3311 
3312 // Fix up external users of the induction variable. At this point, we are
3313 // in LCSSA form, with all external PHIs that use the IV having one input value,
3314 // coming from the remainder loop. We need those PHIs to also have a correct
3315 // value for the IV when arriving directly from the middle block.
3316 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3317                                        const InductionDescriptor &II,
3318                                        Value *VectorTripCount, Value *EndValue,
3319                                        BasicBlock *MiddleBlock,
3320                                        BasicBlock *VectorHeader, VPlan &Plan) {
3321   // There are two kinds of external IV usages - those that use the value
3322   // computed in the last iteration (the PHI) and those that use the penultimate
3323   // value (the value that feeds into the phi from the loop latch).
3324   // We allow both, but they, obviously, have different values.
3325 
3326   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3327 
3328   DenseMap<Value *, Value *> MissingVals;
3329 
3330   // An external user of the last iteration's value should see the value that
3331   // the remainder loop uses to initialize its own IV.
3332   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3333   for (User *U : PostInc->users()) {
3334     Instruction *UI = cast<Instruction>(U);
3335     if (!OrigLoop->contains(UI)) {
3336       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3337       MissingVals[UI] = EndValue;
3338     }
3339   }
3340 
3341   // An external user of the penultimate value need to see EndValue - Step.
3342   // The simplest way to get this is to recompute it from the constituent SCEVs,
3343   // that is Start + (Step * (CRD - 1)).
3344   for (User *U : OrigPhi->users()) {
3345     auto *UI = cast<Instruction>(U);
3346     if (!OrigLoop->contains(UI)) {
3347       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3348 
3349       IRBuilder<> B(MiddleBlock->getTerminator());
3350 
3351       // Fast-math-flags propagate from the original induction instruction.
3352       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3353         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3354 
3355       Value *CountMinusOne = B.CreateSub(
3356           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3357       CountMinusOne->setName("cmo");
3358       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3359                                     VectorHeader->getTerminator());
3360       Value *Escape =
3361           emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
3362       Escape->setName("ind.escape");
3363       MissingVals[UI] = Escape;
3364     }
3365   }
3366 
3367   for (auto &I : MissingVals) {
3368     PHINode *PHI = cast<PHINode>(I.first);
3369     // One corner case we have to handle is two IVs "chasing" each-other,
3370     // that is %IV2 = phi [...], [ %IV1, %latch ]
3371     // In this case, if IV1 has an external use, we need to avoid adding both
3372     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3373     // don't already have an incoming value for the middle block.
3374     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3375       PHI->addIncoming(I.second, MiddleBlock);
3376       Plan.removeLiveOut(PHI);
3377     }
3378   }
3379 }
3380 
3381 namespace {
3382 
3383 struct CSEDenseMapInfo {
3384   static bool canHandle(const Instruction *I) {
3385     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3386            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3387   }
3388 
3389   static inline Instruction *getEmptyKey() {
3390     return DenseMapInfo<Instruction *>::getEmptyKey();
3391   }
3392 
3393   static inline Instruction *getTombstoneKey() {
3394     return DenseMapInfo<Instruction *>::getTombstoneKey();
3395   }
3396 
3397   static unsigned getHashValue(const Instruction *I) {
3398     assert(canHandle(I) && "Unknown instruction!");
3399     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3400                                                            I->value_op_end()));
3401   }
3402 
3403   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3404     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3405         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3406       return LHS == RHS;
3407     return LHS->isIdenticalTo(RHS);
3408   }
3409 };
3410 
3411 } // end anonymous namespace
3412 
3413 ///Perform cse of induction variable instructions.
3414 static void cse(BasicBlock *BB) {
3415   // Perform simple cse.
3416   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3417   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3418     if (!CSEDenseMapInfo::canHandle(&In))
3419       continue;
3420 
3421     // Check if we can replace this instruction with any of the
3422     // visited instructions.
3423     if (Instruction *V = CSEMap.lookup(&In)) {
3424       In.replaceAllUsesWith(V);
3425       In.eraseFromParent();
3426       continue;
3427     }
3428 
3429     CSEMap[&In] = &In;
3430   }
3431 }
3432 
3433 InstructionCost
3434 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3435                                               bool &NeedToScalarize) const {
3436   Function *F = CI->getCalledFunction();
3437   Type *ScalarRetTy = CI->getType();
3438   SmallVector<Type *, 4> Tys, ScalarTys;
3439   for (auto &ArgOp : CI->args())
3440     ScalarTys.push_back(ArgOp->getType());
3441 
3442   // Estimate cost of scalarized vector call. The source operands are assumed
3443   // to be vectors, so we need to extract individual elements from there,
3444   // execute VF scalar calls, and then gather the result into the vector return
3445   // value.
3446   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3447   InstructionCost ScalarCallCost =
3448       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
3449   if (VF.isScalar())
3450     return ScalarCallCost;
3451 
3452   // Compute corresponding vector type for return value and arguments.
3453   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3454   for (Type *ScalarTy : ScalarTys)
3455     Tys.push_back(ToVectorTy(ScalarTy, VF));
3456 
3457   // Compute costs of unpacking argument values for the scalar calls and
3458   // packing the return values to a vector.
3459   InstructionCost ScalarizationCost =
3460       getScalarizationOverhead(CI, VF, CostKind);
3461 
3462   InstructionCost Cost =
3463       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3464 
3465   // If we can't emit a vector call for this function, then the currently found
3466   // cost is the cost we need to return.
3467   NeedToScalarize = true;
3468   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3469   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3470 
3471   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3472     return Cost;
3473 
3474   // If the corresponding vector cost is cheaper, return its cost.
3475   InstructionCost VectorCallCost =
3476       TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
3477   if (VectorCallCost < Cost) {
3478     NeedToScalarize = false;
3479     Cost = VectorCallCost;
3480   }
3481   return Cost;
3482 }
3483 
3484 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3485   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3486     return Elt;
3487   return VectorType::get(Elt, VF);
3488 }
3489 
3490 InstructionCost
3491 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3492                                                    ElementCount VF) const {
3493   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3494   assert(ID && "Expected intrinsic call!");
3495   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3496   FastMathFlags FMF;
3497   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3498     FMF = FPMO->getFastMathFlags();
3499 
3500   SmallVector<const Value *> Arguments(CI->args());
3501   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3502   SmallVector<Type *> ParamTys;
3503   std::transform(FTy->param_begin(), FTy->param_end(),
3504                  std::back_inserter(ParamTys),
3505                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3506 
3507   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3508                                     dyn_cast<IntrinsicInst>(CI));
3509   return TTI.getIntrinsicInstrCost(CostAttrs,
3510                                    TargetTransformInfo::TCK_RecipThroughput);
3511 }
3512 
3513 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3514   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3515   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3516   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3517 }
3518 
3519 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3520   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3521   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3522   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3523 }
3524 
3525 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3526   // For every instruction `I` in MinBWs, truncate the operands, create a
3527   // truncated version of `I` and reextend its result. InstCombine runs
3528   // later and will remove any ext/trunc pairs.
3529   SmallPtrSet<Value *, 4> Erased;
3530   for (const auto &KV : Cost->getMinimalBitwidths()) {
3531     // If the value wasn't vectorized, we must maintain the original scalar
3532     // type. The absence of the value from State indicates that it
3533     // wasn't vectorized.
3534     // FIXME: Should not rely on getVPValue at this point.
3535     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3536     if (!State.hasAnyVectorValue(Def))
3537       continue;
3538     for (unsigned Part = 0; Part < UF; ++Part) {
3539       Value *I = State.get(Def, Part);
3540       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3541         continue;
3542       Type *OriginalTy = I->getType();
3543       Type *ScalarTruncatedTy =
3544           IntegerType::get(OriginalTy->getContext(), KV.second);
3545       auto *TruncatedTy = VectorType::get(
3546           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3547       if (TruncatedTy == OriginalTy)
3548         continue;
3549 
3550       IRBuilder<> B(cast<Instruction>(I));
3551       auto ShrinkOperand = [&](Value *V) -> Value * {
3552         if (auto *ZI = dyn_cast<ZExtInst>(V))
3553           if (ZI->getSrcTy() == TruncatedTy)
3554             return ZI->getOperand(0);
3555         return B.CreateZExtOrTrunc(V, TruncatedTy);
3556       };
3557 
3558       // The actual instruction modification depends on the instruction type,
3559       // unfortunately.
3560       Value *NewI = nullptr;
3561       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3562         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3563                              ShrinkOperand(BO->getOperand(1)));
3564 
3565         // Any wrapping introduced by shrinking this operation shouldn't be
3566         // considered undefined behavior. So, we can't unconditionally copy
3567         // arithmetic wrapping flags to NewI.
3568         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3569       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3570         NewI =
3571             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3572                          ShrinkOperand(CI->getOperand(1)));
3573       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3574         NewI = B.CreateSelect(SI->getCondition(),
3575                               ShrinkOperand(SI->getTrueValue()),
3576                               ShrinkOperand(SI->getFalseValue()));
3577       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3578         switch (CI->getOpcode()) {
3579         default:
3580           llvm_unreachable("Unhandled cast!");
3581         case Instruction::Trunc:
3582           NewI = ShrinkOperand(CI->getOperand(0));
3583           break;
3584         case Instruction::SExt:
3585           NewI = B.CreateSExtOrTrunc(
3586               CI->getOperand(0),
3587               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588           break;
3589         case Instruction::ZExt:
3590           NewI = B.CreateZExtOrTrunc(
3591               CI->getOperand(0),
3592               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3593           break;
3594         }
3595       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3596         auto Elements0 =
3597             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3598         auto *O0 = B.CreateZExtOrTrunc(
3599             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3600         auto Elements1 =
3601             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3602         auto *O1 = B.CreateZExtOrTrunc(
3603             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3604 
3605         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3606       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3607         // Don't do anything with the operands, just extend the result.
3608         continue;
3609       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3610         auto Elements =
3611             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3612         auto *O0 = B.CreateZExtOrTrunc(
3613             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3614         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3615         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3616       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3617         auto Elements =
3618             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3619         auto *O0 = B.CreateZExtOrTrunc(
3620             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3621         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3622       } else {
3623         // If we don't know what to do, be conservative and don't do anything.
3624         continue;
3625       }
3626 
3627       // Lastly, extend the result.
3628       NewI->takeName(cast<Instruction>(I));
3629       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3630       I->replaceAllUsesWith(Res);
3631       cast<Instruction>(I)->eraseFromParent();
3632       Erased.insert(I);
3633       State.reset(Def, Res, Part);
3634     }
3635   }
3636 
3637   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3638   for (const auto &KV : Cost->getMinimalBitwidths()) {
3639     // If the value wasn't vectorized, we must maintain the original scalar
3640     // type. The absence of the value from State indicates that it
3641     // wasn't vectorized.
3642     // FIXME: Should not rely on getVPValue at this point.
3643     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3644     if (!State.hasAnyVectorValue(Def))
3645       continue;
3646     for (unsigned Part = 0; Part < UF; ++Part) {
3647       Value *I = State.get(Def, Part);
3648       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3649       if (Inst && Inst->use_empty()) {
3650         Value *NewI = Inst->getOperand(0);
3651         Inst->eraseFromParent();
3652         State.reset(Def, NewI, Part);
3653       }
3654     }
3655   }
3656 }
3657 
3658 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3659                                             VPlan &Plan) {
3660   // Insert truncates and extends for any truncated instructions as hints to
3661   // InstCombine.
3662   if (VF.isVector())
3663     truncateToMinimalBitwidths(State);
3664 
3665   // Fix widened non-induction PHIs by setting up the PHI operands.
3666   if (EnableVPlanNativePath)
3667     fixNonInductionPHIs(Plan, State);
3668 
3669   // At this point every instruction in the original loop is widened to a
3670   // vector form. Now we need to fix the recurrences in the loop. These PHI
3671   // nodes are currently empty because we did not want to introduce cycles.
3672   // This is the second stage of vectorizing recurrences.
3673   fixCrossIterationPHIs(State);
3674 
3675   // Forget the original basic block.
3676   PSE.getSE()->forgetLoop(OrigLoop);
3677 
3678   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3679   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3680   if (Cost->requiresScalarEpilogue(VF)) {
3681     // No edge from the middle block to the unique exit block has been inserted
3682     // and there is nothing to fix from vector loop; phis should have incoming
3683     // from scalar loop only.
3684     Plan.clearLiveOuts();
3685   } else {
3686     // If we inserted an edge from the middle block to the unique exit block,
3687     // update uses outside the loop (phis) to account for the newly inserted
3688     // edge.
3689 
3690     // Fix-up external users of the induction variables.
3691     for (const auto &Entry : Legal->getInductionVars())
3692       fixupIVUsers(Entry.first, Entry.second,
3693                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3694                    IVEndValues[Entry.first], LoopMiddleBlock,
3695                    VectorLoop->getHeader(), Plan);
3696   }
3697 
3698   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3699   // in the exit block, so update the builder.
3700   State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3701   for (const auto &KV : Plan.getLiveOuts())
3702     KV.second->fixPhi(Plan, State);
3703 
3704   for (Instruction *PI : PredicatedInstructions)
3705     sinkScalarOperands(&*PI);
3706 
3707   // Remove redundant induction instructions.
3708   cse(VectorLoop->getHeader());
3709 
3710   // Set/update profile weights for the vector and remainder loops as original
3711   // loop iterations are now distributed among them. Note that original loop
3712   // represented by LoopScalarBody becomes remainder loop after vectorization.
3713   //
3714   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
3715   // end up getting slightly roughened result but that should be OK since
3716   // profile is not inherently precise anyway. Note also possible bypass of
3717   // vector code caused by legality checks is ignored, assigning all the weight
3718   // to the vector loop, optimistically.
3719   //
3720   // For scalable vectorization we can't know at compile time how many iterations
3721   // of the loop are handled in one vector iteration, so instead assume a pessimistic
3722   // vscale of '1'.
3723   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3724                                LI->getLoopFor(LoopScalarBody),
3725                                VF.getKnownMinValue() * UF);
3726 }
3727 
3728 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3729   // In order to support recurrences we need to be able to vectorize Phi nodes.
3730   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3731   // stage #2: We now need to fix the recurrences by adding incoming edges to
3732   // the currently empty PHI nodes. At this point every instruction in the
3733   // original loop is widened to a vector form so we can use them to construct
3734   // the incoming edges.
3735   VPBasicBlock *Header =
3736       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3737   for (VPRecipeBase &R : Header->phis()) {
3738     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3739       fixReduction(ReductionPhi, State);
3740     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3741       fixFixedOrderRecurrence(FOR, State);
3742   }
3743 }
3744 
3745 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3746     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3747   // This is the second phase of vectorizing first-order recurrences. An
3748   // overview of the transformation is described below. Suppose we have the
3749   // following loop.
3750   //
3751   //   for (int i = 0; i < n; ++i)
3752   //     b[i] = a[i] - a[i - 1];
3753   //
3754   // There is a first-order recurrence on "a". For this loop, the shorthand
3755   // scalar IR looks like:
3756   //
3757   //   scalar.ph:
3758   //     s_init = a[-1]
3759   //     br scalar.body
3760   //
3761   //   scalar.body:
3762   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3763   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3764   //     s2 = a[i]
3765   //     b[i] = s2 - s1
3766   //     br cond, scalar.body, ...
3767   //
3768   // In this example, s1 is a recurrence because it's value depends on the
3769   // previous iteration. In the first phase of vectorization, we created a
3770   // vector phi v1 for s1. We now complete the vectorization and produce the
3771   // shorthand vector IR shown below (for VF = 4, UF = 1).
3772   //
3773   //   vector.ph:
3774   //     v_init = vector(..., ..., ..., a[-1])
3775   //     br vector.body
3776   //
3777   //   vector.body
3778   //     i = phi [0, vector.ph], [i+4, vector.body]
3779   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3780   //     v2 = a[i, i+1, i+2, i+3];
3781   //     v3 = vector(v1(3), v2(0, 1, 2))
3782   //     b[i, i+1, i+2, i+3] = v2 - v3
3783   //     br cond, vector.body, middle.block
3784   //
3785   //   middle.block:
3786   //     x = v2(3)
3787   //     br scalar.ph
3788   //
3789   //   scalar.ph:
3790   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3791   //     br scalar.body
3792   //
3793   // After execution completes the vector loop, we extract the next value of
3794   // the recurrence (x) to use as the initial value in the scalar loop.
3795 
3796   // Extract the last vector element in the middle block. This will be the
3797   // initial value for the recurrence when jumping to the scalar loop.
3798   VPValue *PreviousDef = PhiR->getBackedgeValue();
3799   Value *Incoming = State.get(PreviousDef, UF - 1);
3800   auto *ExtractForScalar = Incoming;
3801   auto *IdxTy = Builder.getInt32Ty();
3802   if (VF.isVector()) {
3803     auto *One = ConstantInt::get(IdxTy, 1);
3804     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3805     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3806     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3807     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3808                                                     "vector.recur.extract");
3809   }
3810   // Extract the second last element in the middle block if the
3811   // Phi is used outside the loop. We need to extract the phi itself
3812   // and not the last element (the phi update in the current iteration). This
3813   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3814   // when the scalar loop is not run at all.
3815   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3816   if (VF.isVector()) {
3817     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3818     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3819     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3820         Incoming, Idx, "vector.recur.extract.for.phi");
3821   } else if (UF > 1)
3822     // When loop is unrolled without vectorizing, initialize
3823     // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
3824     // of `Incoming`. This is analogous to the vectorized case above: extracting
3825     // the second last element when VF > 1.
3826     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3827 
3828   // Fix the initial value of the original recurrence in the scalar loop.
3829   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3830   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3831   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3832   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3833   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3834     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3835     Start->addIncoming(Incoming, BB);
3836   }
3837 
3838   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3839   Phi->setName("scalar.recur");
3840 
3841   // Finally, fix users of the recurrence outside the loop. The users will need
3842   // either the last value of the scalar recurrence or the last value of the
3843   // vector recurrence we extracted in the middle block. Since the loop is in
3844   // LCSSA form, we just need to find all the phi nodes for the original scalar
3845   // recurrence in the exit block, and then add an edge for the middle block.
3846   // Note that LCSSA does not imply single entry when the original scalar loop
3847   // had multiple exiting edges (as we always run the last iteration in the
3848   // scalar epilogue); in that case, there is no edge from middle to exit and
3849   // and thus no phis which needed updated.
3850   if (!Cost->requiresScalarEpilogue(VF))
3851     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3852       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3853         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3854         State.Plan->removeLiveOut(&LCSSAPhi);
3855       }
3856 }
3857 
3858 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3859                                        VPTransformState &State) {
3860   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3861   // Get it's reduction variable descriptor.
3862   assert(Legal->isReductionVariable(OrigPhi) &&
3863          "Unable to find the reduction variable");
3864   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3865 
3866   RecurKind RK = RdxDesc.getRecurrenceKind();
3867   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3868   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3869   State.setDebugLocFromInst(ReductionStartValue);
3870 
3871   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3872   // This is the vector-clone of the value that leaves the loop.
3873   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3874 
3875   // Wrap flags are in general invalid after vectorization, clear them.
3876   clearReductionWrapFlags(PhiR, State);
3877 
3878   // Before each round, move the insertion point right between
3879   // the PHIs and the values we are going to write.
3880   // This allows us to write both PHINodes and the extractelement
3881   // instructions.
3882   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3883 
3884   State.setDebugLocFromInst(LoopExitInst);
3885 
3886   Type *PhiTy = OrigPhi->getType();
3887 
3888   VPBasicBlock *LatchVPBB =
3889       PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3890   BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3891   // If tail is folded by masking, the vector value to leave the loop should be
3892   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3893   // instead of the former. For an inloop reduction the reduction will already
3894   // be predicated, and does not need to be handled here.
3895   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3896     for (unsigned Part = 0; Part < UF; ++Part) {
3897       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3898       SelectInst *Sel = nullptr;
3899       for (User *U : VecLoopExitInst->users()) {
3900         if (isa<SelectInst>(U)) {
3901           assert(!Sel && "Reduction exit feeding two selects");
3902           Sel = cast<SelectInst>(U);
3903         } else
3904           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3905       }
3906       assert(Sel && "Reduction exit feeds no select");
3907       State.reset(LoopExitInstDef, Sel, Part);
3908 
3909       if (isa<FPMathOperator>(Sel))
3910         Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3911 
3912       // If the target can create a predicated operator for the reduction at no
3913       // extra cost in the loop (for example a predicated vadd), it can be
3914       // cheaper for the select to remain in the loop than be sunk out of it,
3915       // and so use the select value for the phi instead of the old
3916       // LoopExitValue.
3917       if (PreferPredicatedReductionSelect ||
3918           TTI->preferPredicatedReductionSelect(
3919               RdxDesc.getOpcode(), PhiTy,
3920               TargetTransformInfo::ReductionFlags())) {
3921         auto *VecRdxPhi =
3922             cast<PHINode>(State.get(PhiR, Part));
3923         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3924       }
3925     }
3926   }
3927 
3928   // If the vector reduction can be performed in a smaller type, we truncate
3929   // then extend the loop exit value to enable InstCombine to evaluate the
3930   // entire expression in the smaller type.
3931   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3932     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3933     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3934     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3935     VectorParts RdxParts(UF);
3936     for (unsigned Part = 0; Part < UF; ++Part) {
3937       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3938       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3939       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3940                                         : Builder.CreateZExt(Trunc, VecTy);
3941       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3942         if (U != Trunc) {
3943           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3944           RdxParts[Part] = Extnd;
3945         }
3946     }
3947     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3948     for (unsigned Part = 0; Part < UF; ++Part) {
3949       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3950       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3951     }
3952   }
3953 
3954   // Reduce all of the unrolled parts into a single vector.
3955   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3956   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3957 
3958   // The middle block terminator has already been assigned a DebugLoc here (the
3959   // OrigLoop's single latch terminator). We want the whole middle block to
3960   // appear to execute on this line because: (a) it is all compiler generated,
3961   // (b) these instructions are always executed after evaluating the latch
3962   // conditional branch, and (c) other passes may add new predecessors which
3963   // terminate on this line. This is the easiest way to ensure we don't
3964   // accidentally cause an extra step back into the loop while debugging.
3965   State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3966   if (PhiR->isOrdered())
3967     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3968   else {
3969     // Floating-point operations should have some FMF to enable the reduction.
3970     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3971     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3972     for (unsigned Part = 1; Part < UF; ++Part) {
3973       Value *RdxPart = State.get(LoopExitInstDef, Part);
3974       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3975         ReducedPartRdx = Builder.CreateBinOp(
3976             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3977       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3978         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3979                                            ReducedPartRdx, RdxPart);
3980       else
3981         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3982     }
3983   }
3984 
3985   // Create the reduction after the loop. Note that inloop reductions create the
3986   // target reduction in the loop using a Reduction recipe.
3987   if (VF.isVector() && !PhiR->isInLoop()) {
3988     ReducedPartRdx =
3989         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3990     // If the reduction can be performed in a smaller type, we need to extend
3991     // the reduction to the wider type before we branch to the original loop.
3992     if (PhiTy != RdxDesc.getRecurrenceType())
3993       ReducedPartRdx = RdxDesc.isSigned()
3994                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3995                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3996   }
3997 
3998   PHINode *ResumePhi =
3999       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4000 
4001   // Create a phi node that merges control-flow from the backedge-taken check
4002   // block and the middle block.
4003   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4004                                         LoopScalarPreHeader->getTerminator());
4005 
4006   // If we are fixing reductions in the epilogue loop then we should already
4007   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4008   // we carry over the incoming values correctly.
4009   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4010     if (Incoming == LoopMiddleBlock)
4011       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4012     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4013       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4014                               Incoming);
4015     else
4016       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4017   }
4018 
4019   // Set the resume value for this reduction
4020   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4021 
4022   // If there were stores of the reduction value to a uniform memory address
4023   // inside the loop, create the final store here.
4024   if (StoreInst *SI = RdxDesc.IntermediateStore) {
4025     StoreInst *NewSI =
4026         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4027     propagateMetadata(NewSI, SI);
4028 
4029     // If the reduction value is used in other places,
4030     // then let the code below create PHI's for that.
4031   }
4032 
4033   // Now, we need to fix the users of the reduction variable
4034   // inside and outside of the scalar remainder loop.
4035 
4036   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4037   // in the exit blocks.  See comment on analogous loop in
4038   // fixFixedOrderRecurrence for a more complete explaination of the logic.
4039   if (!Cost->requiresScalarEpilogue(VF))
4040     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4041       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4042         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4043         State.Plan->removeLiveOut(&LCSSAPhi);
4044       }
4045 
4046   // Fix the scalar loop reduction variable with the incoming reduction sum
4047   // from the vector body and from the backedge value.
4048   int IncomingEdgeBlockIdx =
4049       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4050   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4051   // Pick the other block.
4052   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4053   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4054   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4055 }
4056 
4057 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4058                                                   VPTransformState &State) {
4059   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4060   RecurKind RK = RdxDesc.getRecurrenceKind();
4061   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4062     return;
4063 
4064   SmallVector<VPValue *, 8> Worklist;
4065   SmallPtrSet<VPValue *, 8> Visited;
4066   Worklist.push_back(PhiR);
4067   Visited.insert(PhiR);
4068 
4069   while (!Worklist.empty()) {
4070     VPValue *Cur = Worklist.pop_back_val();
4071     for (unsigned Part = 0; Part < UF; ++Part) {
4072       Value *V = State.get(Cur, Part);
4073       if (!isa<OverflowingBinaryOperator>(V))
4074         break;
4075       cast<Instruction>(V)->dropPoisonGeneratingFlags();
4076       }
4077 
4078       for (VPUser *U : Cur->users()) {
4079         auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4080         if (!UserRecipe)
4081           continue;
4082         for (VPValue *V : UserRecipe->definedValues())
4083           if (Visited.insert(V).second)
4084             Worklist.push_back(V);
4085       }
4086   }
4087 }
4088 
4089 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4090   // The basic block and loop containing the predicated instruction.
4091   auto *PredBB = PredInst->getParent();
4092   auto *VectorLoop = LI->getLoopFor(PredBB);
4093 
4094   // Initialize a worklist with the operands of the predicated instruction.
4095   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4096 
4097   // Holds instructions that we need to analyze again. An instruction may be
4098   // reanalyzed if we don't yet know if we can sink it or not.
4099   SmallVector<Instruction *, 8> InstsToReanalyze;
4100 
4101   // Returns true if a given use occurs in the predicated block. Phi nodes use
4102   // their operands in their corresponding predecessor blocks.
4103   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4104     auto *I = cast<Instruction>(U.getUser());
4105     BasicBlock *BB = I->getParent();
4106     if (auto *Phi = dyn_cast<PHINode>(I))
4107       BB = Phi->getIncomingBlock(
4108           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4109     return BB == PredBB;
4110   };
4111 
4112   // Iteratively sink the scalarized operands of the predicated instruction
4113   // into the block we created for it. When an instruction is sunk, it's
4114   // operands are then added to the worklist. The algorithm ends after one pass
4115   // through the worklist doesn't sink a single instruction.
4116   bool Changed;
4117   do {
4118     // Add the instructions that need to be reanalyzed to the worklist, and
4119     // reset the changed indicator.
4120     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4121     InstsToReanalyze.clear();
4122     Changed = false;
4123 
4124     while (!Worklist.empty()) {
4125       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4126 
4127       // We can't sink an instruction if it is a phi node, is not in the loop,
4128       // or may have side effects.
4129       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4130           I->mayHaveSideEffects())
4131         continue;
4132 
4133       // If the instruction is already in PredBB, check if we can sink its
4134       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4135       // sinking the scalar instruction I, hence it appears in PredBB; but it
4136       // may have failed to sink I's operands (recursively), which we try
4137       // (again) here.
4138       if (I->getParent() == PredBB) {
4139         Worklist.insert(I->op_begin(), I->op_end());
4140         continue;
4141       }
4142 
4143       // It's legal to sink the instruction if all its uses occur in the
4144       // predicated block. Otherwise, there's nothing to do yet, and we may
4145       // need to reanalyze the instruction.
4146       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4147         InstsToReanalyze.push_back(I);
4148         continue;
4149       }
4150 
4151       // Move the instruction to the beginning of the predicated block, and add
4152       // it's operands to the worklist.
4153       I->moveBefore(&*PredBB->getFirstInsertionPt());
4154       Worklist.insert(I->op_begin(), I->op_end());
4155 
4156       // The sinking may have enabled other instructions to be sunk, so we will
4157       // need to iterate.
4158       Changed = true;
4159     }
4160   } while (Changed);
4161 }
4162 
4163 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4164                                               VPTransformState &State) {
4165   auto Iter = vp_depth_first_deep(Plan.getEntry());
4166   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4167     for (VPRecipeBase &P : VPBB->phis()) {
4168       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4169       if (!VPPhi)
4170         continue;
4171       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4172       // Make sure the builder has a valid insert point.
4173       Builder.SetInsertPoint(NewPhi);
4174       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4175         VPValue *Inc = VPPhi->getIncomingValue(i);
4176         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4177         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4178       }
4179     }
4180   }
4181 }
4182 
4183 bool InnerLoopVectorizer::useOrderedReductions(
4184     const RecurrenceDescriptor &RdxDesc) {
4185   return Cost->useOrderedReductions(RdxDesc);
4186 }
4187 
4188 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4189   // We should not collect Scalars more than once per VF. Right now, this
4190   // function is called from collectUniformsAndScalars(), which already does
4191   // this check. Collecting Scalars for VF=1 does not make any sense.
4192   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4193          "This function should not be visited twice for the same VF");
4194 
4195   // This avoids any chances of creating a REPLICATE recipe during planning
4196   // since that would result in generation of scalarized code during execution,
4197   // which is not supported for scalable vectors.
4198   if (VF.isScalable()) {
4199     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4200     return;
4201   }
4202 
4203   SmallSetVector<Instruction *, 8> Worklist;
4204 
4205   // These sets are used to seed the analysis with pointers used by memory
4206   // accesses that will remain scalar.
4207   SmallSetVector<Instruction *, 8> ScalarPtrs;
4208   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4209   auto *Latch = TheLoop->getLoopLatch();
4210 
4211   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4212   // The pointer operands of loads and stores will be scalar as long as the
4213   // memory access is not a gather or scatter operation. The value operand of a
4214   // store will remain scalar if the store is scalarized.
4215   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4216     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4217     assert(WideningDecision != CM_Unknown &&
4218            "Widening decision should be ready at this moment");
4219     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4220       if (Ptr == Store->getValueOperand())
4221         return WideningDecision == CM_Scalarize;
4222     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4223            "Ptr is neither a value or pointer operand");
4224     return WideningDecision != CM_GatherScatter;
4225   };
4226 
4227   // A helper that returns true if the given value is a bitcast or
4228   // getelementptr instruction contained in the loop.
4229   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4230     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4231             isa<GetElementPtrInst>(V)) &&
4232            !TheLoop->isLoopInvariant(V);
4233   };
4234 
4235   // A helper that evaluates a memory access's use of a pointer. If the use will
4236   // be a scalar use and the pointer is only used by memory accesses, we place
4237   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4238   // PossibleNonScalarPtrs.
4239   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4240     // We only care about bitcast and getelementptr instructions contained in
4241     // the loop.
4242     if (!isLoopVaryingBitCastOrGEP(Ptr))
4243       return;
4244 
4245     // If the pointer has already been identified as scalar (e.g., if it was
4246     // also identified as uniform), there's nothing to do.
4247     auto *I = cast<Instruction>(Ptr);
4248     if (Worklist.count(I))
4249       return;
4250 
4251     // If the use of the pointer will be a scalar use, and all users of the
4252     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4253     // place the pointer in PossibleNonScalarPtrs.
4254     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4255           return isa<LoadInst>(U) || isa<StoreInst>(U);
4256         }))
4257       ScalarPtrs.insert(I);
4258     else
4259       PossibleNonScalarPtrs.insert(I);
4260   };
4261 
4262   // We seed the scalars analysis with three classes of instructions: (1)
4263   // instructions marked uniform-after-vectorization and (2) bitcast,
4264   // getelementptr and (pointer) phi instructions used by memory accesses
4265   // requiring a scalar use.
4266   //
4267   // (1) Add to the worklist all instructions that have been identified as
4268   // uniform-after-vectorization.
4269   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4270 
4271   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4272   // memory accesses requiring a scalar use. The pointer operands of loads and
4273   // stores will be scalar as long as the memory accesses is not a gather or
4274   // scatter operation. The value operand of a store will remain scalar if the
4275   // store is scalarized.
4276   for (auto *BB : TheLoop->blocks())
4277     for (auto &I : *BB) {
4278       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4279         evaluatePtrUse(Load, Load->getPointerOperand());
4280       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4281         evaluatePtrUse(Store, Store->getPointerOperand());
4282         evaluatePtrUse(Store, Store->getValueOperand());
4283       }
4284     }
4285   for (auto *I : ScalarPtrs)
4286     if (!PossibleNonScalarPtrs.count(I)) {
4287       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4288       Worklist.insert(I);
4289     }
4290 
4291   // Insert the forced scalars.
4292   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4293   // induction variable when the PHI user is scalarized.
4294   auto ForcedScalar = ForcedScalars.find(VF);
4295   if (ForcedScalar != ForcedScalars.end())
4296     for (auto *I : ForcedScalar->second) {
4297       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
4298       Worklist.insert(I);
4299     }
4300 
4301   // Expand the worklist by looking through any bitcasts and getelementptr
4302   // instructions we've already identified as scalar. This is similar to the
4303   // expansion step in collectLoopUniforms(); however, here we're only
4304   // expanding to include additional bitcasts and getelementptr instructions.
4305   unsigned Idx = 0;
4306   while (Idx != Worklist.size()) {
4307     Instruction *Dst = Worklist[Idx++];
4308     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4309       continue;
4310     auto *Src = cast<Instruction>(Dst->getOperand(0));
4311     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4312           auto *J = cast<Instruction>(U);
4313           return !TheLoop->contains(J) || Worklist.count(J) ||
4314                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4315                   isScalarUse(J, Src));
4316         })) {
4317       Worklist.insert(Src);
4318       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4319     }
4320   }
4321 
4322   // An induction variable will remain scalar if all users of the induction
4323   // variable and induction variable update remain scalar.
4324   for (const auto &Induction : Legal->getInductionVars()) {
4325     auto *Ind = Induction.first;
4326     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4327 
4328     // If tail-folding is applied, the primary induction variable will be used
4329     // to feed a vector compare.
4330     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4331       continue;
4332 
4333     // Returns true if \p Indvar is a pointer induction that is used directly by
4334     // load/store instruction \p I.
4335     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4336                                               Instruction *I) {
4337       return Induction.second.getKind() ==
4338                  InductionDescriptor::IK_PtrInduction &&
4339              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4340              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4341     };
4342 
4343     // Determine if all users of the induction variable are scalar after
4344     // vectorization.
4345     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4346       auto *I = cast<Instruction>(U);
4347       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4348              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4349     });
4350     if (!ScalarInd)
4351       continue;
4352 
4353     // Determine if all users of the induction variable update instruction are
4354     // scalar after vectorization.
4355     auto ScalarIndUpdate =
4356         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4357           auto *I = cast<Instruction>(U);
4358           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4359                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4360         });
4361     if (!ScalarIndUpdate)
4362       continue;
4363 
4364     // The induction variable and its update instruction will remain scalar.
4365     Worklist.insert(Ind);
4366     Worklist.insert(IndUpdate);
4367     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4368     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4369                       << "\n");
4370   }
4371 
4372   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4373 }
4374 
4375 bool LoopVectorizationCostModel::isScalarWithPredication(
4376     Instruction *I, ElementCount VF) const {
4377   if (!isPredicatedInst(I))
4378     return false;
4379 
4380   // Do we have a non-scalar lowering for this predicated
4381   // instruction? No - it is scalar with predication.
4382   switch(I->getOpcode()) {
4383   default:
4384     return true;
4385   case Instruction::Load:
4386   case Instruction::Store: {
4387     auto *Ptr = getLoadStorePointerOperand(I);
4388     auto *Ty = getLoadStoreType(I);
4389     Type *VTy = Ty;
4390     if (VF.isVector())
4391       VTy = VectorType::get(Ty, VF);
4392     const Align Alignment = getLoadStoreAlignment(I);
4393     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4394                                 TTI.isLegalMaskedGather(VTy, Alignment))
4395                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4396                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4397   }
4398   case Instruction::UDiv:
4399   case Instruction::SDiv:
4400   case Instruction::SRem:
4401   case Instruction::URem: {
4402     // We have the option to use the safe-divisor idiom to avoid predication.
4403     // The cost based decision here will always select safe-divisor for
4404     // scalable vectors as scalarization isn't legal.
4405     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4406     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4407   }
4408   }
4409 }
4410 
4411 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4412   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4413     return false;
4414 
4415   // Can we prove this instruction is safe to unconditionally execute?
4416   // If not, we must use some form of predication.
4417   switch(I->getOpcode()) {
4418   default:
4419     return false;
4420   case Instruction::Load:
4421   case Instruction::Store: {
4422     if (!Legal->isMaskRequired(I))
4423       return false;
4424     // When we know the load's address is loop invariant and the instruction
4425     // in the original scalar loop was unconditionally executed then we
4426     // don't need to mark it as a predicated instruction. Tail folding may
4427     // introduce additional predication, but we're guaranteed to always have
4428     // at least one active lane.  We call Legal->blockNeedsPredication here
4429     // because it doesn't query tail-folding.  For stores, we need to prove
4430     // both speculation safety (which follows from the same argument as loads),
4431     // but also must prove the value being stored is correct.  The easiest
4432     // form of the later is to require that all values stored are the same.
4433     if (Legal->isUniformMemOp(*I) &&
4434       (isa<LoadInst>(I) ||
4435        (isa<StoreInst>(I) &&
4436         TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4437         !Legal->blockNeedsPredication(I->getParent()))
4438       return false;
4439     return true;
4440   }
4441   case Instruction::UDiv:
4442   case Instruction::SDiv:
4443   case Instruction::SRem:
4444   case Instruction::URem:
4445     // TODO: We can use the loop-preheader as context point here and get
4446     // context sensitive reasoning
4447     return !isSafeToSpeculativelyExecute(I);
4448   }
4449 }
4450 
4451 std::pair<InstructionCost, InstructionCost>
4452 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4453                                                     ElementCount VF) const {
4454   assert(I->getOpcode() == Instruction::UDiv ||
4455          I->getOpcode() == Instruction::SDiv ||
4456          I->getOpcode() == Instruction::SRem ||
4457          I->getOpcode() == Instruction::URem);
4458   assert(!isSafeToSpeculativelyExecute(I));
4459 
4460   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4461 
4462   // Scalarization isn't legal for scalable vector types
4463   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4464   if (!VF.isScalable()) {
4465     // Get the scalarization cost and scale this amount by the probability of
4466     // executing the predicated block. If the instruction is not predicated,
4467     // we fall through to the next case.
4468     ScalarizationCost = 0;
4469 
4470     // These instructions have a non-void type, so account for the phi nodes
4471     // that we will create. This cost is likely to be zero. The phi node
4472     // cost, if any, should be scaled by the block probability because it
4473     // models a copy at the end of each predicated block.
4474     ScalarizationCost += VF.getKnownMinValue() *
4475       TTI.getCFInstrCost(Instruction::PHI, CostKind);
4476 
4477     // The cost of the non-predicated instruction.
4478     ScalarizationCost += VF.getKnownMinValue() *
4479       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4480 
4481     // The cost of insertelement and extractelement instructions needed for
4482     // scalarization.
4483     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4484 
4485     // Scale the cost by the probability of executing the predicated blocks.
4486     // This assumes the predicated block for each vector lane is equally
4487     // likely.
4488     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4489   }
4490   InstructionCost SafeDivisorCost = 0;
4491 
4492   auto *VecTy = ToVectorTy(I->getType(), VF);
4493 
4494   // The cost of the select guard to ensure all lanes are well defined
4495   // after we speculate above any internal control flow.
4496   SafeDivisorCost += TTI.getCmpSelInstrCost(
4497     Instruction::Select, VecTy,
4498     ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4499     CmpInst::BAD_ICMP_PREDICATE, CostKind);
4500 
4501   // Certain instructions can be cheaper to vectorize if they have a constant
4502   // second vector operand. One example of this are shifts on x86.
4503   Value *Op2 = I->getOperand(1);
4504   auto Op2Info = TTI.getOperandInfo(Op2);
4505   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
4506     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4507 
4508   SmallVector<const Value *, 4> Operands(I->operand_values());
4509   SafeDivisorCost += TTI.getArithmeticInstrCost(
4510     I->getOpcode(), VecTy, CostKind,
4511     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4512     Op2Info, Operands, I);
4513   return {ScalarizationCost, SafeDivisorCost};
4514 }
4515 
4516 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4517     Instruction *I, ElementCount VF) {
4518   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4519   assert(getWideningDecision(I, VF) == CM_Unknown &&
4520          "Decision should not be set yet.");
4521   auto *Group = getInterleavedAccessGroup(I);
4522   assert(Group && "Must have a group.");
4523 
4524   // If the instruction's allocated size doesn't equal it's type size, it
4525   // requires padding and will be scalarized.
4526   auto &DL = I->getModule()->getDataLayout();
4527   auto *ScalarTy = getLoadStoreType(I);
4528   if (hasIrregularType(ScalarTy, DL))
4529     return false;
4530 
4531   // If the group involves a non-integral pointer, we may not be able to
4532   // losslessly cast all values to a common type.
4533   unsigned InterleaveFactor = Group->getFactor();
4534   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4535   for (unsigned i = 0; i < InterleaveFactor; i++) {
4536     Instruction *Member = Group->getMember(i);
4537     if (!Member)
4538       continue;
4539     auto *MemberTy = getLoadStoreType(Member);
4540     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4541     // Don't coerce non-integral pointers to integers or vice versa.
4542     if (MemberNI != ScalarNI) {
4543       // TODO: Consider adding special nullptr value case here
4544       return false;
4545     } else if (MemberNI && ScalarNI &&
4546                ScalarTy->getPointerAddressSpace() !=
4547                MemberTy->getPointerAddressSpace()) {
4548       return false;
4549     }
4550   }
4551 
4552   // Check if masking is required.
4553   // A Group may need masking for one of two reasons: it resides in a block that
4554   // needs predication, or it was decided to use masking to deal with gaps
4555   // (either a gap at the end of a load-access that may result in a speculative
4556   // load, or any gaps in a store-access).
4557   bool PredicatedAccessRequiresMasking =
4558       blockNeedsPredicationForAnyReason(I->getParent()) &&
4559       Legal->isMaskRequired(I);
4560   bool LoadAccessWithGapsRequiresEpilogMasking =
4561       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4562       !isScalarEpilogueAllowed();
4563   bool StoreAccessWithGapsRequiresMasking =
4564       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4565   if (!PredicatedAccessRequiresMasking &&
4566       !LoadAccessWithGapsRequiresEpilogMasking &&
4567       !StoreAccessWithGapsRequiresMasking)
4568     return true;
4569 
4570   // If masked interleaving is required, we expect that the user/target had
4571   // enabled it, because otherwise it either wouldn't have been created or
4572   // it should have been invalidated by the CostModel.
4573   assert(useMaskedInterleavedAccesses(TTI) &&
4574          "Masked interleave-groups for predicated accesses are not enabled.");
4575 
4576   if (Group->isReverse())
4577     return false;
4578 
4579   auto *Ty = getLoadStoreType(I);
4580   const Align Alignment = getLoadStoreAlignment(I);
4581   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4582                           : TTI.isLegalMaskedStore(Ty, Alignment);
4583 }
4584 
4585 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4586     Instruction *I, ElementCount VF) {
4587   // Get and ensure we have a valid memory instruction.
4588   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4589 
4590   auto *Ptr = getLoadStorePointerOperand(I);
4591   auto *ScalarTy = getLoadStoreType(I);
4592 
4593   // In order to be widened, the pointer should be consecutive, first of all.
4594   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4595     return false;
4596 
4597   // If the instruction is a store located in a predicated block, it will be
4598   // scalarized.
4599   if (isScalarWithPredication(I, VF))
4600     return false;
4601 
4602   // If the instruction's allocated size doesn't equal it's type size, it
4603   // requires padding and will be scalarized.
4604   auto &DL = I->getModule()->getDataLayout();
4605   if (hasIrregularType(ScalarTy, DL))
4606     return false;
4607 
4608   return true;
4609 }
4610 
4611 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4612   // We should not collect Uniforms more than once per VF. Right now,
4613   // this function is called from collectUniformsAndScalars(), which
4614   // already does this check. Collecting Uniforms for VF=1 does not make any
4615   // sense.
4616 
4617   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4618          "This function should not be visited twice for the same VF");
4619 
4620   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4621   // not analyze again.  Uniforms.count(VF) will return 1.
4622   Uniforms[VF].clear();
4623 
4624   // We now know that the loop is vectorizable!
4625   // Collect instructions inside the loop that will remain uniform after
4626   // vectorization.
4627 
4628   // Global values, params and instructions outside of current loop are out of
4629   // scope.
4630   auto isOutOfScope = [&](Value *V) -> bool {
4631     Instruction *I = dyn_cast<Instruction>(V);
4632     return (!I || !TheLoop->contains(I));
4633   };
4634 
4635   // Worklist containing uniform instructions demanding lane 0.
4636   SetVector<Instruction *> Worklist;
4637   BasicBlock *Latch = TheLoop->getLoopLatch();
4638 
4639   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4640   // that are scalar with predication must not be considered uniform after
4641   // vectorization, because that would create an erroneous replicating region
4642   // where only a single instance out of VF should be formed.
4643   // TODO: optimize such seldom cases if found important, see PR40816.
4644   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4645     if (isOutOfScope(I)) {
4646       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4647                         << *I << "\n");
4648       return;
4649     }
4650     if (isScalarWithPredication(I, VF)) {
4651       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4652                         << *I << "\n");
4653       return;
4654     }
4655     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4656     Worklist.insert(I);
4657   };
4658 
4659   // Start with the conditional branch. If the branch condition is an
4660   // instruction contained in the loop that is only used by the branch, it is
4661   // uniform.
4662   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4663   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4664     addToWorklistIfAllowed(Cmp);
4665 
4666   // Return true if all lanes perform the same memory operation, and we can
4667   // thus chose to execute only one.
4668   auto isUniformMemOpUse = [&](Instruction *I) {
4669     if (!Legal->isUniformMemOp(*I))
4670       return false;
4671     if (isa<LoadInst>(I))
4672       // Loading the same address always produces the same result - at least
4673       // assuming aliasing and ordering which have already been checked.
4674       return true;
4675     // Storing the same value on every iteration.
4676     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4677   };
4678 
4679   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4680     InstWidening WideningDecision = getWideningDecision(I, VF);
4681     assert(WideningDecision != CM_Unknown &&
4682            "Widening decision should be ready at this moment");
4683 
4684     if (isUniformMemOpUse(I))
4685       return true;
4686 
4687     return (WideningDecision == CM_Widen ||
4688             WideningDecision == CM_Widen_Reverse ||
4689             WideningDecision == CM_Interleave);
4690   };
4691 
4692   // Returns true if Ptr is the pointer operand of a memory access instruction
4693   // I, I is known to not require scalarization, and the pointer is not also
4694   // stored.
4695   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4696     auto GetStoredValue = [I]() -> Value * {
4697       if (!isa<StoreInst>(I))
4698         return nullptr;
4699       return I->getOperand(0);
4700     };
4701     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF) &&
4702            GetStoredValue() != Ptr;
4703   };
4704 
4705   // Holds a list of values which are known to have at least one uniform use.
4706   // Note that there may be other uses which aren't uniform.  A "uniform use"
4707   // here is something which only demands lane 0 of the unrolled iterations;
4708   // it does not imply that all lanes produce the same value (e.g. this is not
4709   // the usual meaning of uniform)
4710   SetVector<Value *> HasUniformUse;
4711 
4712   // Scan the loop for instructions which are either a) known to have only
4713   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4714   for (auto *BB : TheLoop->blocks())
4715     for (auto &I : *BB) {
4716       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4717         switch (II->getIntrinsicID()) {
4718         case Intrinsic::sideeffect:
4719         case Intrinsic::experimental_noalias_scope_decl:
4720         case Intrinsic::assume:
4721         case Intrinsic::lifetime_start:
4722         case Intrinsic::lifetime_end:
4723           if (TheLoop->hasLoopInvariantOperands(&I))
4724             addToWorklistIfAllowed(&I);
4725           break;
4726         default:
4727           break;
4728         }
4729       }
4730 
4731       // ExtractValue instructions must be uniform, because the operands are
4732       // known to be loop-invariant.
4733       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4734         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4735                "Expected aggregate value to be loop invariant");
4736         addToWorklistIfAllowed(EVI);
4737         continue;
4738       }
4739 
4740       // If there's no pointer operand, there's nothing to do.
4741       auto *Ptr = getLoadStorePointerOperand(&I);
4742       if (!Ptr)
4743         continue;
4744 
4745       if (isUniformMemOpUse(&I))
4746         addToWorklistIfAllowed(&I);
4747 
4748       if (isVectorizedMemAccessUse(&I, Ptr)) {
4749         assert(isUniformDecision(&I, VF) && "consistency check");
4750         HasUniformUse.insert(Ptr);
4751       }
4752     }
4753 
4754   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4755   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4756   // disallows uses outside the loop as well.
4757   for (auto *V : HasUniformUse) {
4758     if (isOutOfScope(V))
4759       continue;
4760     auto *I = cast<Instruction>(V);
4761     auto UsersAreMemAccesses =
4762       llvm::all_of(I->users(), [&](User *U) -> bool {
4763         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4764       });
4765     if (UsersAreMemAccesses)
4766       addToWorklistIfAllowed(I);
4767   }
4768 
4769   // Expand Worklist in topological order: whenever a new instruction
4770   // is added , its users should be already inside Worklist.  It ensures
4771   // a uniform instruction will only be used by uniform instructions.
4772   unsigned idx = 0;
4773   while (idx != Worklist.size()) {
4774     Instruction *I = Worklist[idx++];
4775 
4776     for (auto *OV : I->operand_values()) {
4777       // isOutOfScope operands cannot be uniform instructions.
4778       if (isOutOfScope(OV))
4779         continue;
4780       // First order recurrence Phi's should typically be considered
4781       // non-uniform.
4782       auto *OP = dyn_cast<PHINode>(OV);
4783       if (OP && Legal->isFixedOrderRecurrence(OP))
4784         continue;
4785       // If all the users of the operand are uniform, then add the
4786       // operand into the uniform worklist.
4787       auto *OI = cast<Instruction>(OV);
4788       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4789             auto *J = cast<Instruction>(U);
4790             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4791           }))
4792         addToWorklistIfAllowed(OI);
4793     }
4794   }
4795 
4796   // For an instruction to be added into Worklist above, all its users inside
4797   // the loop should also be in Worklist. However, this condition cannot be
4798   // true for phi nodes that form a cyclic dependence. We must process phi
4799   // nodes separately. An induction variable will remain uniform if all users
4800   // of the induction variable and induction variable update remain uniform.
4801   // The code below handles both pointer and non-pointer induction variables.
4802   for (const auto &Induction : Legal->getInductionVars()) {
4803     auto *Ind = Induction.first;
4804     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4805 
4806     // Determine if all users of the induction variable are uniform after
4807     // vectorization.
4808     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4809       auto *I = cast<Instruction>(U);
4810       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4811              isVectorizedMemAccessUse(I, Ind);
4812     });
4813     if (!UniformInd)
4814       continue;
4815 
4816     // Determine if all users of the induction variable update instruction are
4817     // uniform after vectorization.
4818     auto UniformIndUpdate =
4819         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4820           auto *I = cast<Instruction>(U);
4821           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4822                  isVectorizedMemAccessUse(I, IndUpdate);
4823         });
4824     if (!UniformIndUpdate)
4825       continue;
4826 
4827     // The induction variable and its update instruction will remain uniform.
4828     addToWorklistIfAllowed(Ind);
4829     addToWorklistIfAllowed(IndUpdate);
4830   }
4831 
4832   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4833 }
4834 
4835 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4836   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4837 
4838   if (Legal->getRuntimePointerChecking()->Need) {
4839     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4840         "runtime pointer checks needed. Enable vectorization of this "
4841         "loop with '#pragma clang loop vectorize(enable)' when "
4842         "compiling with -Os/-Oz",
4843         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4844     return true;
4845   }
4846 
4847   if (!PSE.getPredicate().isAlwaysTrue()) {
4848     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4849         "runtime SCEV checks needed. Enable vectorization of this "
4850         "loop with '#pragma clang loop vectorize(enable)' when "
4851         "compiling with -Os/-Oz",
4852         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4853     return true;
4854   }
4855 
4856   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4857   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4858     reportVectorizationFailure("Runtime stride check for small trip count",
4859         "runtime stride == 1 checks needed. Enable vectorization of "
4860         "this loop without such check by compiling with -Os/-Oz",
4861         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4862     return true;
4863   }
4864 
4865   return false;
4866 }
4867 
4868 ElementCount
4869 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4870   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4871     return ElementCount::getScalable(0);
4872 
4873   if (Hints->isScalableVectorizationDisabled()) {
4874     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4875                             "ScalableVectorizationDisabled", ORE, TheLoop);
4876     return ElementCount::getScalable(0);
4877   }
4878 
4879   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4880 
4881   auto MaxScalableVF = ElementCount::getScalable(
4882       std::numeric_limits<ElementCount::ScalarTy>::max());
4883 
4884   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4885   // FIXME: While for scalable vectors this is currently sufficient, this should
4886   // be replaced by a more detailed mechanism that filters out specific VFs,
4887   // instead of invalidating vectorization for a whole set of VFs based on the
4888   // MaxVF.
4889 
4890   // Disable scalable vectorization if the loop contains unsupported reductions.
4891   if (!canVectorizeReductions(MaxScalableVF)) {
4892     reportVectorizationInfo(
4893         "Scalable vectorization not supported for the reduction "
4894         "operations found in this loop.",
4895         "ScalableVFUnfeasible", ORE, TheLoop);
4896     return ElementCount::getScalable(0);
4897   }
4898 
4899   // Disable scalable vectorization if the loop contains any instructions
4900   // with element types not supported for scalable vectors.
4901   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4902         return !Ty->isVoidTy() &&
4903                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4904       })) {
4905     reportVectorizationInfo("Scalable vectorization is not supported "
4906                             "for all element types found in this loop.",
4907                             "ScalableVFUnfeasible", ORE, TheLoop);
4908     return ElementCount::getScalable(0);
4909   }
4910 
4911   if (Legal->isSafeForAnyVectorWidth())
4912     return MaxScalableVF;
4913 
4914   // Limit MaxScalableVF by the maximum safe dependence distance.
4915   std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
4916   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4917     MaxVScale =
4918         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4919   MaxScalableVF =
4920       ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
4921   if (!MaxScalableVF)
4922     reportVectorizationInfo(
4923         "Max legal vector width too small, scalable vectorization "
4924         "unfeasible.",
4925         "ScalableVFUnfeasible", ORE, TheLoop);
4926 
4927   return MaxScalableVF;
4928 }
4929 
4930 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4931     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4932   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4933   unsigned SmallestType, WidestType;
4934   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4935 
4936   // Get the maximum safe dependence distance in bits computed by LAA.
4937   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4938   // the memory accesses that is most restrictive (involved in the smallest
4939   // dependence distance).
4940   unsigned MaxSafeElements =
4941       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4942 
4943   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4944   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4945 
4946   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4947                     << ".\n");
4948   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4949                     << ".\n");
4950 
4951   // First analyze the UserVF, fall back if the UserVF should be ignored.
4952   if (UserVF) {
4953     auto MaxSafeUserVF =
4954         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4955 
4956     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4957       // If `VF=vscale x N` is safe, then so is `VF=N`
4958       if (UserVF.isScalable())
4959         return FixedScalableVFPair(
4960             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4961       else
4962         return UserVF;
4963     }
4964 
4965     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4966 
4967     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4968     // is better to ignore the hint and let the compiler choose a suitable VF.
4969     if (!UserVF.isScalable()) {
4970       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4971                         << " is unsafe, clamping to max safe VF="
4972                         << MaxSafeFixedVF << ".\n");
4973       ORE->emit([&]() {
4974         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4975                                           TheLoop->getStartLoc(),
4976                                           TheLoop->getHeader())
4977                << "User-specified vectorization factor "
4978                << ore::NV("UserVectorizationFactor", UserVF)
4979                << " is unsafe, clamping to maximum safe vectorization factor "
4980                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4981       });
4982       return MaxSafeFixedVF;
4983     }
4984 
4985     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4986       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4987                         << " is ignored because scalable vectors are not "
4988                            "available.\n");
4989       ORE->emit([&]() {
4990         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4991                                           TheLoop->getStartLoc(),
4992                                           TheLoop->getHeader())
4993                << "User-specified vectorization factor "
4994                << ore::NV("UserVectorizationFactor", UserVF)
4995                << " is ignored because the target does not support scalable "
4996                   "vectors. The compiler will pick a more suitable value.";
4997       });
4998     } else {
4999       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5000                         << " is unsafe. Ignoring scalable UserVF.\n");
5001       ORE->emit([&]() {
5002         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5003                                           TheLoop->getStartLoc(),
5004                                           TheLoop->getHeader())
5005                << "User-specified vectorization factor "
5006                << ore::NV("UserVectorizationFactor", UserVF)
5007                << " is unsafe. Ignoring the hint to let the compiler pick a "
5008                   "more suitable value.";
5009       });
5010     }
5011   }
5012 
5013   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5014                     << " / " << WidestType << " bits.\n");
5015 
5016   FixedScalableVFPair Result(ElementCount::getFixed(1),
5017                              ElementCount::getScalable(0));
5018   if (auto MaxVF =
5019           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5020                                   MaxSafeFixedVF, FoldTailByMasking))
5021     Result.FixedVF = MaxVF;
5022 
5023   if (auto MaxVF =
5024           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5025                                   MaxSafeScalableVF, FoldTailByMasking))
5026     if (MaxVF.isScalable()) {
5027       Result.ScalableVF = MaxVF;
5028       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5029                         << "\n");
5030     }
5031 
5032   return Result;
5033 }
5034 
5035 FixedScalableVFPair
5036 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5037   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5038     // TODO: It may by useful to do since it's still likely to be dynamically
5039     // uniform if the target can skip.
5040     reportVectorizationFailure(
5041         "Not inserting runtime ptr check for divergent target",
5042         "runtime pointer checks needed. Not enabled for divergent target",
5043         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5044     return FixedScalableVFPair::getNone();
5045   }
5046 
5047   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5048   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5049   if (TC == 1) {
5050     reportVectorizationFailure("Single iteration (non) loop",
5051         "loop trip count is one, irrelevant for vectorization",
5052         "SingleIterationLoop", ORE, TheLoop);
5053     return FixedScalableVFPair::getNone();
5054   }
5055 
5056   switch (ScalarEpilogueStatus) {
5057   case CM_ScalarEpilogueAllowed:
5058     return computeFeasibleMaxVF(TC, UserVF, false);
5059   case CM_ScalarEpilogueNotAllowedUsePredicate:
5060     [[fallthrough]];
5061   case CM_ScalarEpilogueNotNeededUsePredicate:
5062     LLVM_DEBUG(
5063         dbgs() << "LV: vector predicate hint/switch found.\n"
5064                << "LV: Not allowing scalar epilogue, creating predicated "
5065                << "vector loop.\n");
5066     break;
5067   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5068     // fallthrough as a special case of OptForSize
5069   case CM_ScalarEpilogueNotAllowedOptSize:
5070     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5071       LLVM_DEBUG(
5072           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5073     else
5074       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5075                         << "count.\n");
5076 
5077     // Bail if runtime checks are required, which are not good when optimising
5078     // for size.
5079     if (runtimeChecksRequired())
5080       return FixedScalableVFPair::getNone();
5081 
5082     break;
5083   }
5084 
5085   // The only loops we can vectorize without a scalar epilogue, are loops with
5086   // a bottom-test and a single exiting block. We'd have to handle the fact
5087   // that not every instruction executes on the last iteration.  This will
5088   // require a lane mask which varies through the vector loop body.  (TODO)
5089   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5090     // If there was a tail-folding hint/switch, but we can't fold the tail by
5091     // masking, fallback to a vectorization with a scalar epilogue.
5092     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5093       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5094                            "scalar epilogue instead.\n");
5095       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5096       return computeFeasibleMaxVF(TC, UserVF, false);
5097     }
5098     return FixedScalableVFPair::getNone();
5099   }
5100 
5101   // Now try the tail folding
5102 
5103   // Invalidate interleave groups that require an epilogue if we can't mask
5104   // the interleave-group.
5105   if (!useMaskedInterleavedAccesses(TTI)) {
5106     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5107            "No decisions should have been taken at this point");
5108     // Note: There is no need to invalidate any cost modeling decisions here, as
5109     // non where taken so far.
5110     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5111   }
5112 
5113   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5114   // Avoid tail folding if the trip count is known to be a multiple of any VF
5115   // we chose.
5116   // FIXME: The condition below pessimises the case for fixed-width vectors,
5117   // when scalable VFs are also candidates for vectorization.
5118   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5119     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5120     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5121            "MaxFixedVF must be a power of 2");
5122     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5123                                    : MaxFixedVF.getFixedValue();
5124     ScalarEvolution *SE = PSE.getSE();
5125     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5126     const SCEV *ExitCount = SE->getAddExpr(
5127         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5128     const SCEV *Rem = SE->getURemExpr(
5129         SE->applyLoopGuards(ExitCount, TheLoop),
5130         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5131     if (Rem->isZero()) {
5132       // Accept MaxFixedVF if we do not have a tail.
5133       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5134       return MaxFactors;
5135     }
5136   }
5137 
5138   // If we don't know the precise trip count, or if the trip count that we
5139   // found modulo the vectorization factor is not zero, try to fold the tail
5140   // by masking.
5141   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5142   if (Legal->prepareToFoldTailByMasking()) {
5143     FoldTailByMasking = true;
5144     return MaxFactors;
5145   }
5146 
5147   // If there was a tail-folding hint/switch, but we can't fold the tail by
5148   // masking, fallback to a vectorization with a scalar epilogue.
5149   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5150     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5151                          "scalar epilogue instead.\n");
5152     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5153     return MaxFactors;
5154   }
5155 
5156   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5157     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5158     return FixedScalableVFPair::getNone();
5159   }
5160 
5161   if (TC == 0) {
5162     reportVectorizationFailure(
5163         "Unable to calculate the loop count due to complex control flow",
5164         "unable to calculate the loop count due to complex control flow",
5165         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5166     return FixedScalableVFPair::getNone();
5167   }
5168 
5169   reportVectorizationFailure(
5170       "Cannot optimize for size and vectorize at the same time.",
5171       "cannot optimize for size and vectorize at the same time. "
5172       "Enable vectorization of this loop with '#pragma clang loop "
5173       "vectorize(enable)' when compiling with -Os/-Oz",
5174       "NoTailLoopWithOptForSize", ORE, TheLoop);
5175   return FixedScalableVFPair::getNone();
5176 }
5177 
5178 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5179     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5180     ElementCount MaxSafeVF, bool FoldTailByMasking) {
5181   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5182   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
5183       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5184                            : TargetTransformInfo::RGK_FixedWidthVector);
5185 
5186   // Convenience function to return the minimum of two ElementCounts.
5187   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5188     assert((LHS.isScalable() == RHS.isScalable()) &&
5189            "Scalable flags must match");
5190     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5191   };
5192 
5193   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5194   // Note that both WidestRegister and WidestType may not be a powers of 2.
5195   auto MaxVectorElementCount = ElementCount::get(
5196       PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType),
5197       ComputeScalableMaxVF);
5198   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5199   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5200                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5201 
5202   if (!MaxVectorElementCount) {
5203     LLVM_DEBUG(dbgs() << "LV: The target has no "
5204                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5205                       << " vector registers.\n");
5206     return ElementCount::getFixed(1);
5207   }
5208 
5209   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
5210   if (MaxVectorElementCount.isScalable() &&
5211       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5212     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5213     auto Min = Attr.getVScaleRangeMin();
5214     WidestRegisterMinEC *= Min;
5215   }
5216   if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5217       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5218     // If loop trip count (TC) is known at compile time there is no point in
5219     // choosing VF greater than TC (as done in the loop below). Select maximum
5220     // power of two which doesn't exceed TC.
5221     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5222     // when the TC is less than or equal to the known number of lanes.
5223     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5224     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5225                          "exceeding the constant trip count: "
5226                       << ClampedConstTripCount << "\n");
5227     return ElementCount::getFixed(ClampedConstTripCount);
5228   }
5229 
5230   TargetTransformInfo::RegisterKind RegKind =
5231       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5232                            : TargetTransformInfo::RGK_FixedWidthVector;
5233   ElementCount MaxVF = MaxVectorElementCount;
5234   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5235                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5236     auto MaxVectorElementCountMaxBW = ElementCount::get(
5237         PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType),
5238         ComputeScalableMaxVF);
5239     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5240 
5241     // Collect all viable vectorization factors larger than the default MaxVF
5242     // (i.e. MaxVectorElementCount).
5243     SmallVector<ElementCount, 8> VFs;
5244     for (ElementCount VS = MaxVectorElementCount * 2;
5245          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5246       VFs.push_back(VS);
5247 
5248     // For each VF calculate its register usage.
5249     auto RUs = calculateRegisterUsage(VFs);
5250 
5251     // Select the largest VF which doesn't require more registers than existing
5252     // ones.
5253     for (int i = RUs.size() - 1; i >= 0; --i) {
5254       bool Selected = true;
5255       for (auto &pair : RUs[i].MaxLocalUsers) {
5256         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5257         if (pair.second > TargetNumRegisters)
5258           Selected = false;
5259       }
5260       if (Selected) {
5261         MaxVF = VFs[i];
5262         break;
5263       }
5264     }
5265     if (ElementCount MinVF =
5266             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5267       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5268         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5269                           << ") with target's minimum: " << MinVF << '\n');
5270         MaxVF = MinVF;
5271       }
5272     }
5273 
5274     // Invalidate any widening decisions we might have made, in case the loop
5275     // requires prediction (decided later), but we have already made some
5276     // load/store widening decisions.
5277     invalidateCostModelingDecisions();
5278   }
5279   return MaxVF;
5280 }
5281 
5282 std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5283   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5284     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5285     auto Min = Attr.getVScaleRangeMin();
5286     auto Max = Attr.getVScaleRangeMax();
5287     if (Max && Min == Max)
5288       return Max;
5289   }
5290 
5291   return TTI.getVScaleForTuning();
5292 }
5293 
5294 bool LoopVectorizationCostModel::isMoreProfitable(
5295     const VectorizationFactor &A, const VectorizationFactor &B) const {
5296   InstructionCost CostA = A.Cost;
5297   InstructionCost CostB = B.Cost;
5298 
5299   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5300 
5301   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5302       MaxTripCount) {
5303     // If we are folding the tail and the trip count is a known (possibly small)
5304     // constant, the trip count will be rounded up to an integer number of
5305     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5306     // which we compare directly. When not folding the tail, the total cost will
5307     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5308     // approximated with the per-lane cost below instead of using the tripcount
5309     // as here.
5310     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5311     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5312     return RTCostA < RTCostB;
5313   }
5314 
5315   // Improve estimate for the vector width if it is scalable.
5316   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5317   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5318   if (std::optional<unsigned> VScale = getVScaleForTuning()) {
5319     if (A.Width.isScalable())
5320       EstimatedWidthA *= *VScale;
5321     if (B.Width.isScalable())
5322       EstimatedWidthB *= *VScale;
5323   }
5324 
5325   // Assume vscale may be larger than 1 (or the value being tuned for),
5326   // so that scalable vectorization is slightly favorable over fixed-width
5327   // vectorization.
5328   if (A.Width.isScalable() && !B.Width.isScalable())
5329     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5330 
5331   // To avoid the need for FP division:
5332   //      (CostA / A.Width) < (CostB / B.Width)
5333   // <=>  (CostA * B.Width) < (CostB * A.Width)
5334   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5335 }
5336 
5337 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5338     const ElementCountSet &VFCandidates) {
5339   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5340   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5341   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5342   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5343          "Expected Scalar VF to be a candidate");
5344 
5345   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5346                                        ExpectedCost);
5347   VectorizationFactor ChosenFactor = ScalarCost;
5348 
5349   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5350   if (ForceVectorization && VFCandidates.size() > 1) {
5351     // Ignore scalar width, because the user explicitly wants vectorization.
5352     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5353     // evaluation.
5354     ChosenFactor.Cost = InstructionCost::getMax();
5355   }
5356 
5357   SmallVector<InstructionVFPair> InvalidCosts;
5358   for (const auto &i : VFCandidates) {
5359     // The cost for scalar VF=1 is already calculated, so ignore it.
5360     if (i.isScalar())
5361       continue;
5362 
5363     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5364     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5365 
5366 #ifndef NDEBUG
5367     unsigned AssumedMinimumVscale = 1;
5368     if (std::optional<unsigned> VScale = getVScaleForTuning())
5369       AssumedMinimumVscale = *VScale;
5370     unsigned Width =
5371         Candidate.Width.isScalable()
5372             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5373             : Candidate.Width.getFixedValue();
5374     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5375                       << " costs: " << (Candidate.Cost / Width));
5376     if (i.isScalable())
5377       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5378                         << AssumedMinimumVscale << ")");
5379     LLVM_DEBUG(dbgs() << ".\n");
5380 #endif
5381 
5382     if (!C.second && !ForceVectorization) {
5383       LLVM_DEBUG(
5384           dbgs() << "LV: Not considering vector loop of width " << i
5385                  << " because it will not generate any vector instructions.\n");
5386       continue;
5387     }
5388 
5389     // If profitable add it to ProfitableVF list.
5390     if (isMoreProfitable(Candidate, ScalarCost))
5391       ProfitableVFs.push_back(Candidate);
5392 
5393     if (isMoreProfitable(Candidate, ChosenFactor))
5394       ChosenFactor = Candidate;
5395   }
5396 
5397   // Emit a report of VFs with invalid costs in the loop.
5398   if (!InvalidCosts.empty()) {
5399     // Group the remarks per instruction, keeping the instruction order from
5400     // InvalidCosts.
5401     std::map<Instruction *, unsigned> Numbering;
5402     unsigned I = 0;
5403     for (auto &Pair : InvalidCosts)
5404       if (!Numbering.count(Pair.first))
5405         Numbering[Pair.first] = I++;
5406 
5407     // Sort the list, first on instruction(number) then on VF.
5408     llvm::sort(InvalidCosts,
5409                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5410                  if (Numbering[A.first] != Numbering[B.first])
5411                    return Numbering[A.first] < Numbering[B.first];
5412                  ElementCountComparator ECC;
5413                  return ECC(A.second, B.second);
5414                });
5415 
5416     // For a list of ordered instruction-vf pairs:
5417     //   [(load, vf1), (load, vf2), (store, vf1)]
5418     // Group the instructions together to emit separate remarks for:
5419     //   load  (vf1, vf2)
5420     //   store (vf1)
5421     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5422     auto Subset = ArrayRef<InstructionVFPair>();
5423     do {
5424       if (Subset.empty())
5425         Subset = Tail.take_front(1);
5426 
5427       Instruction *I = Subset.front().first;
5428 
5429       // If the next instruction is different, or if there are no other pairs,
5430       // emit a remark for the collated subset. e.g.
5431       //   [(load, vf1), (load, vf2))]
5432       // to emit:
5433       //  remark: invalid costs for 'load' at VF=(vf, vf2)
5434       if (Subset == Tail || Tail[Subset.size()].first != I) {
5435         std::string OutString;
5436         raw_string_ostream OS(OutString);
5437         assert(!Subset.empty() && "Unexpected empty range");
5438         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5439         for (const auto &Pair : Subset)
5440           OS << (Pair.second == Subset.front().second ? "" : ", ")
5441              << Pair.second;
5442         OS << "):";
5443         if (auto *CI = dyn_cast<CallInst>(I))
5444           OS << " call to " << CI->getCalledFunction()->getName();
5445         else
5446           OS << " " << I->getOpcodeName();
5447         OS.flush();
5448         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5449         Tail = Tail.drop_front(Subset.size());
5450         Subset = {};
5451       } else
5452         // Grow the subset by one element
5453         Subset = Tail.take_front(Subset.size() + 1);
5454     } while (!Tail.empty());
5455   }
5456 
5457   if (!EnableCondStoresVectorization && NumPredStores) {
5458     reportVectorizationFailure("There are conditional stores.",
5459         "store that is conditionally executed prevents vectorization",
5460         "ConditionalStore", ORE, TheLoop);
5461     ChosenFactor = ScalarCost;
5462   }
5463 
5464   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5465                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5466              << "LV: Vectorization seems to be not beneficial, "
5467              << "but was forced by a user.\n");
5468   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5469   return ChosenFactor;
5470 }
5471 
5472 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5473     const Loop &L, ElementCount VF) const {
5474   // Cross iteration phis such as reductions need special handling and are
5475   // currently unsupported.
5476   if (any_of(L.getHeader()->phis(),
5477              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5478     return false;
5479 
5480   // Phis with uses outside of the loop require special handling and are
5481   // currently unsupported.
5482   for (const auto &Entry : Legal->getInductionVars()) {
5483     // Look for uses of the value of the induction at the last iteration.
5484     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5485     for (User *U : PostInc->users())
5486       if (!L.contains(cast<Instruction>(U)))
5487         return false;
5488     // Look for uses of penultimate value of the induction.
5489     for (User *U : Entry.first->users())
5490       if (!L.contains(cast<Instruction>(U)))
5491         return false;
5492   }
5493 
5494   // Epilogue vectorization code has not been auditted to ensure it handles
5495   // non-latch exits properly.  It may be fine, but it needs auditted and
5496   // tested.
5497   if (L.getExitingBlock() != L.getLoopLatch())
5498     return false;
5499 
5500   return true;
5501 }
5502 
5503 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5504     const ElementCount VF) const {
5505   // FIXME: We need a much better cost-model to take different parameters such
5506   // as register pressure, code size increase and cost of extra branches into
5507   // account. For now we apply a very crude heuristic and only consider loops
5508   // with vectorization factors larger than a certain value.
5509 
5510   // Allow the target to opt out entirely.
5511   if (!TTI.preferEpilogueVectorization())
5512     return false;
5513 
5514   // We also consider epilogue vectorization unprofitable for targets that don't
5515   // consider interleaving beneficial (eg. MVE).
5516   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5517     return false;
5518   // FIXME: We should consider changing the threshold for scalable
5519   // vectors to take VScaleForTuning into account.
5520   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5521     return true;
5522   return false;
5523 }
5524 
5525 VectorizationFactor
5526 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5527     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5528   VectorizationFactor Result = VectorizationFactor::Disabled();
5529   if (!EnableEpilogueVectorization) {
5530     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5531     return Result;
5532   }
5533 
5534   if (!isScalarEpilogueAllowed()) {
5535     LLVM_DEBUG(
5536         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5537                   "allowed.\n";);
5538     return Result;
5539   }
5540 
5541   // Not really a cost consideration, but check for unsupported cases here to
5542   // simplify the logic.
5543   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5544     LLVM_DEBUG(
5545         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5546                   "not a supported candidate.\n";);
5547     return Result;
5548   }
5549 
5550   if (EpilogueVectorizationForceVF > 1) {
5551     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5552     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5553     if (LVP.hasPlanWithVF(ForcedEC))
5554       return {ForcedEC, 0, 0};
5555     else {
5556       LLVM_DEBUG(
5557           dbgs()
5558               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5559       return Result;
5560     }
5561   }
5562 
5563   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5564       TheLoop->getHeader()->getParent()->hasMinSize()) {
5565     LLVM_DEBUG(
5566         dbgs()
5567             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5568     return Result;
5569   }
5570 
5571   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5572     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5573                          "this loop\n");
5574     return Result;
5575   }
5576 
5577   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5578   // the main loop handles 8 lanes per iteration. We could still benefit from
5579   // vectorizing the epilogue loop with VF=4.
5580   ElementCount EstimatedRuntimeVF = MainLoopVF;
5581   if (MainLoopVF.isScalable()) {
5582     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5583     if (std::optional<unsigned> VScale = getVScaleForTuning())
5584       EstimatedRuntimeVF *= *VScale;
5585   }
5586 
5587   for (auto &NextVF : ProfitableVFs)
5588     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5589           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5590          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5591         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5592         LVP.hasPlanWithVF(NextVF.Width))
5593       Result = NextVF;
5594 
5595   if (Result != VectorizationFactor::Disabled())
5596     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5597                       << Result.Width << "\n";);
5598   return Result;
5599 }
5600 
5601 std::pair<unsigned, unsigned>
5602 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5603   unsigned MinWidth = -1U;
5604   unsigned MaxWidth = 8;
5605   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5606   // For in-loop reductions, no element types are added to ElementTypesInLoop
5607   // if there are no loads/stores in the loop. In this case, check through the
5608   // reduction variables to determine the maximum width.
5609   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5610     // Reset MaxWidth so that we can find the smallest type used by recurrences
5611     // in the loop.
5612     MaxWidth = -1U;
5613     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5614       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5615       // When finding the min width used by the recurrence we need to account
5616       // for casts on the input operands of the recurrence.
5617       MaxWidth = std::min<unsigned>(
5618           MaxWidth, std::min<unsigned>(
5619                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5620                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5621     }
5622   } else {
5623     for (Type *T : ElementTypesInLoop) {
5624       MinWidth = std::min<unsigned>(
5625           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5626       MaxWidth = std::max<unsigned>(
5627           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5628     }
5629   }
5630   return {MinWidth, MaxWidth};
5631 }
5632 
5633 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5634   ElementTypesInLoop.clear();
5635   // For each block.
5636   for (BasicBlock *BB : TheLoop->blocks()) {
5637     // For each instruction in the loop.
5638     for (Instruction &I : BB->instructionsWithoutDebug()) {
5639       Type *T = I.getType();
5640 
5641       // Skip ignored values.
5642       if (ValuesToIgnore.count(&I))
5643         continue;
5644 
5645       // Only examine Loads, Stores and PHINodes.
5646       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5647         continue;
5648 
5649       // Examine PHI nodes that are reduction variables. Update the type to
5650       // account for the recurrence type.
5651       if (auto *PN = dyn_cast<PHINode>(&I)) {
5652         if (!Legal->isReductionVariable(PN))
5653           continue;
5654         const RecurrenceDescriptor &RdxDesc =
5655             Legal->getReductionVars().find(PN)->second;
5656         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5657             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5658                                       RdxDesc.getRecurrenceType(),
5659                                       TargetTransformInfo::ReductionFlags()))
5660           continue;
5661         T = RdxDesc.getRecurrenceType();
5662       }
5663 
5664       // Examine the stored values.
5665       if (auto *ST = dyn_cast<StoreInst>(&I))
5666         T = ST->getValueOperand()->getType();
5667 
5668       assert(T->isSized() &&
5669              "Expected the load/store/recurrence type to be sized");
5670 
5671       ElementTypesInLoop.insert(T);
5672     }
5673   }
5674 }
5675 
5676 unsigned
5677 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5678                                                   InstructionCost LoopCost) {
5679   // -- The interleave heuristics --
5680   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5681   // There are many micro-architectural considerations that we can't predict
5682   // at this level. For example, frontend pressure (on decode or fetch) due to
5683   // code size, or the number and capabilities of the execution ports.
5684   //
5685   // We use the following heuristics to select the interleave count:
5686   // 1. If the code has reductions, then we interleave to break the cross
5687   // iteration dependency.
5688   // 2. If the loop is really small, then we interleave to reduce the loop
5689   // overhead.
5690   // 3. We don't interleave if we think that we will spill registers to memory
5691   // due to the increased register pressure.
5692 
5693   if (!isScalarEpilogueAllowed())
5694     return 1;
5695 
5696   // We used the distance for the interleave count.
5697   if (Legal->getMaxSafeDepDistBytes() != -1U)
5698     return 1;
5699 
5700   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5701   const bool HasReductions = !Legal->getReductionVars().empty();
5702   // Do not interleave loops with a relatively small known or estimated trip
5703   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5704   // enabled, and the code has scalar reductions(HasReductions && VF = 1),
5705   // because with the above conditions interleaving can expose ILP and break
5706   // cross iteration dependences for reductions.
5707   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5708       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5709     return 1;
5710 
5711   // If we did not calculate the cost for VF (because the user selected the VF)
5712   // then we calculate the cost of VF here.
5713   if (LoopCost == 0) {
5714     LoopCost = expectedCost(VF).first;
5715     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5716 
5717     // Loop body is free and there is no need for interleaving.
5718     if (LoopCost == 0)
5719       return 1;
5720   }
5721 
5722   RegisterUsage R = calculateRegisterUsage({VF})[0];
5723   // We divide by these constants so assume that we have at least one
5724   // instruction that uses at least one register.
5725   for (auto& pair : R.MaxLocalUsers) {
5726     pair.second = std::max(pair.second, 1U);
5727   }
5728 
5729   // We calculate the interleave count using the following formula.
5730   // Subtract the number of loop invariants from the number of available
5731   // registers. These registers are used by all of the interleaved instances.
5732   // Next, divide the remaining registers by the number of registers that is
5733   // required by the loop, in order to estimate how many parallel instances
5734   // fit without causing spills. All of this is rounded down if necessary to be
5735   // a power of two. We want power of two interleave count to simplify any
5736   // addressing operations or alignment considerations.
5737   // We also want power of two interleave counts to ensure that the induction
5738   // variable of the vector loop wraps to zero, when tail is folded by masking;
5739   // this currently happens when OptForSize, in which case IC is set to 1 above.
5740   unsigned IC = UINT_MAX;
5741 
5742   for (auto& pair : R.MaxLocalUsers) {
5743     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5744     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5745                       << " registers of "
5746                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5747     if (VF.isScalar()) {
5748       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5749         TargetNumRegisters = ForceTargetNumScalarRegs;
5750     } else {
5751       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5752         TargetNumRegisters = ForceTargetNumVectorRegs;
5753     }
5754     unsigned MaxLocalUsers = pair.second;
5755     unsigned LoopInvariantRegs = 0;
5756     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5757       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5758 
5759     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5760     // Don't count the induction variable as interleaved.
5761     if (EnableIndVarRegisterHeur) {
5762       TmpIC =
5763           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5764                         std::max(1U, (MaxLocalUsers - 1)));
5765     }
5766 
5767     IC = std::min(IC, TmpIC);
5768   }
5769 
5770   // Clamp the interleave ranges to reasonable counts.
5771   unsigned MaxInterleaveCount =
5772       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5773 
5774   // Check if the user has overridden the max.
5775   if (VF.isScalar()) {
5776     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5777       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5778   } else {
5779     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5780       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5781   }
5782 
5783   // If trip count is known or estimated compile time constant, limit the
5784   // interleave count to be less than the trip count divided by VF, provided it
5785   // is at least 1.
5786   //
5787   // For scalable vectors we can't know if interleaving is beneficial. It may
5788   // not be beneficial for small loops if none of the lanes in the second vector
5789   // iterations is enabled. However, for larger loops, there is likely to be a
5790   // similar benefit as for fixed-width vectors. For now, we choose to leave
5791   // the InterleaveCount as if vscale is '1', although if some information about
5792   // the vector is known (e.g. min vector size), we can make a better decision.
5793   if (BestKnownTC) {
5794     MaxInterleaveCount =
5795         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5796     // Make sure MaxInterleaveCount is greater than 0.
5797     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5798   }
5799 
5800   assert(MaxInterleaveCount > 0 &&
5801          "Maximum interleave count must be greater than 0");
5802 
5803   // Clamp the calculated IC to be between the 1 and the max interleave count
5804   // that the target and trip count allows.
5805   if (IC > MaxInterleaveCount)
5806     IC = MaxInterleaveCount;
5807   else
5808     // Make sure IC is greater than 0.
5809     IC = std::max(1u, IC);
5810 
5811   assert(IC > 0 && "Interleave count must be greater than 0.");
5812 
5813   // Interleave if we vectorized this loop and there is a reduction that could
5814   // benefit from interleaving.
5815   if (VF.isVector() && HasReductions) {
5816     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5817     return IC;
5818   }
5819 
5820   // For any scalar loop that either requires runtime checks or predication we
5821   // are better off leaving this to the unroller. Note that if we've already
5822   // vectorized the loop we will have done the runtime check and so interleaving
5823   // won't require further checks.
5824   bool ScalarInterleavingRequiresPredication =
5825       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5826          return Legal->blockNeedsPredication(BB);
5827        }));
5828   bool ScalarInterleavingRequiresRuntimePointerCheck =
5829       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5830 
5831   // We want to interleave small loops in order to reduce the loop overhead and
5832   // potentially expose ILP opportunities.
5833   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5834                     << "LV: IC is " << IC << '\n'
5835                     << "LV: VF is " << VF << '\n');
5836   const bool AggressivelyInterleaveReductions =
5837       TTI.enableAggressiveInterleaving(HasReductions);
5838   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5839       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5840     // We assume that the cost overhead is 1 and we use the cost model
5841     // to estimate the cost of the loop and interleave until the cost of the
5842     // loop overhead is about 5% of the cost of the loop.
5843     unsigned SmallIC = std::min(
5844         IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue()));
5845 
5846     // Interleave until store/load ports (estimated by max interleave count) are
5847     // saturated.
5848     unsigned NumStores = Legal->getNumStores();
5849     unsigned NumLoads = Legal->getNumLoads();
5850     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5851     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5852 
5853     // There is little point in interleaving for reductions containing selects
5854     // and compares when VF=1 since it may just create more overhead than it's
5855     // worth for loops with small trip counts. This is because we still have to
5856     // do the final reduction after the loop.
5857     bool HasSelectCmpReductions =
5858         HasReductions &&
5859         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5860           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5861           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5862               RdxDesc.getRecurrenceKind());
5863         });
5864     if (HasSelectCmpReductions) {
5865       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5866       return 1;
5867     }
5868 
5869     // If we have a scalar reduction (vector reductions are already dealt with
5870     // by this point), we can increase the critical path length if the loop
5871     // we're interleaving is inside another loop. For tree-wise reductions
5872     // set the limit to 2, and for ordered reductions it's best to disable
5873     // interleaving entirely.
5874     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5875       bool HasOrderedReductions =
5876           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5877             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5878             return RdxDesc.isOrdered();
5879           });
5880       if (HasOrderedReductions) {
5881         LLVM_DEBUG(
5882             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5883         return 1;
5884       }
5885 
5886       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5887       SmallIC = std::min(SmallIC, F);
5888       StoresIC = std::min(StoresIC, F);
5889       LoadsIC = std::min(LoadsIC, F);
5890     }
5891 
5892     if (EnableLoadStoreRuntimeInterleave &&
5893         std::max(StoresIC, LoadsIC) > SmallIC) {
5894       LLVM_DEBUG(
5895           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5896       return std::max(StoresIC, LoadsIC);
5897     }
5898 
5899     // If there are scalar reductions and TTI has enabled aggressive
5900     // interleaving for reductions, we will interleave to expose ILP.
5901     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5902         AggressivelyInterleaveReductions) {
5903       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5904       // Interleave no less than SmallIC but not as aggressive as the normal IC
5905       // to satisfy the rare situation when resources are too limited.
5906       return std::max(IC / 2, SmallIC);
5907     } else {
5908       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5909       return SmallIC;
5910     }
5911   }
5912 
5913   // Interleave if this is a large loop (small loops are already dealt with by
5914   // this point) that could benefit from interleaving.
5915   if (AggressivelyInterleaveReductions) {
5916     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5917     return IC;
5918   }
5919 
5920   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5921   return 1;
5922 }
5923 
5924 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5925 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5926   // This function calculates the register usage by measuring the highest number
5927   // of values that are alive at a single location. Obviously, this is a very
5928   // rough estimation. We scan the loop in a topological order in order and
5929   // assign a number to each instruction. We use RPO to ensure that defs are
5930   // met before their users. We assume that each instruction that has in-loop
5931   // users starts an interval. We record every time that an in-loop value is
5932   // used, so we have a list of the first and last occurrences of each
5933   // instruction. Next, we transpose this data structure into a multi map that
5934   // holds the list of intervals that *end* at a specific location. This multi
5935   // map allows us to perform a linear search. We scan the instructions linearly
5936   // and record each time that a new interval starts, by placing it in a set.
5937   // If we find this value in the multi-map then we remove it from the set.
5938   // The max register usage is the maximum size of the set.
5939   // We also search for instructions that are defined outside the loop, but are
5940   // used inside the loop. We need this number separately from the max-interval
5941   // usage number because when we unroll, loop-invariant values do not take
5942   // more register.
5943   LoopBlocksDFS DFS(TheLoop);
5944   DFS.perform(LI);
5945 
5946   RegisterUsage RU;
5947 
5948   // Each 'key' in the map opens a new interval. The values
5949   // of the map are the index of the 'last seen' usage of the
5950   // instruction that is the key.
5951   using IntervalMap = DenseMap<Instruction *, unsigned>;
5952 
5953   // Maps instruction to its index.
5954   SmallVector<Instruction *, 64> IdxToInstr;
5955   // Marks the end of each interval.
5956   IntervalMap EndPoint;
5957   // Saves the list of instruction indices that are used in the loop.
5958   SmallPtrSet<Instruction *, 8> Ends;
5959   // Saves the list of values that are used in the loop but are defined outside
5960   // the loop (not including non-instruction values such as arguments and
5961   // constants).
5962   SmallPtrSet<Instruction *, 8> LoopInvariants;
5963 
5964   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5965     for (Instruction &I : BB->instructionsWithoutDebug()) {
5966       IdxToInstr.push_back(&I);
5967 
5968       // Save the end location of each USE.
5969       for (Value *U : I.operands()) {
5970         auto *Instr = dyn_cast<Instruction>(U);
5971 
5972         // Ignore non-instruction values such as arguments, constants, etc.
5973         // FIXME: Might need some motivation why these values are ignored. If
5974         // for example an argument is used inside the loop it will increase the
5975         // register pressure (so shouldn't we add it to LoopInvariants).
5976         if (!Instr)
5977           continue;
5978 
5979         // If this instruction is outside the loop then record it and continue.
5980         if (!TheLoop->contains(Instr)) {
5981           LoopInvariants.insert(Instr);
5982           continue;
5983         }
5984 
5985         // Overwrite previous end points.
5986         EndPoint[Instr] = IdxToInstr.size();
5987         Ends.insert(Instr);
5988       }
5989     }
5990   }
5991 
5992   // Saves the list of intervals that end with the index in 'key'.
5993   using InstrList = SmallVector<Instruction *, 2>;
5994   DenseMap<unsigned, InstrList> TransposeEnds;
5995 
5996   // Transpose the EndPoints to a list of values that end at each index.
5997   for (auto &Interval : EndPoint)
5998     TransposeEnds[Interval.second].push_back(Interval.first);
5999 
6000   SmallPtrSet<Instruction *, 8> OpenIntervals;
6001   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6002   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6003 
6004   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6005 
6006   const auto &TTICapture = TTI;
6007   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6008     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6009       return 0;
6010     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6011   };
6012 
6013   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6014     Instruction *I = IdxToInstr[i];
6015 
6016     // Remove all of the instructions that end at this location.
6017     InstrList &List = TransposeEnds[i];
6018     for (Instruction *ToRemove : List)
6019       OpenIntervals.erase(ToRemove);
6020 
6021     // Ignore instructions that are never used within the loop.
6022     if (!Ends.count(I))
6023       continue;
6024 
6025     // Skip ignored values.
6026     if (ValuesToIgnore.count(I))
6027       continue;
6028 
6029     // For each VF find the maximum usage of registers.
6030     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6031       // Count the number of registers used, per register class, given all open
6032       // intervals.
6033       // Note that elements in this SmallMapVector will be default constructed
6034       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
6035       // there is no previous entry for ClassID.
6036       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6037 
6038       if (VFs[j].isScalar()) {
6039         for (auto *Inst : OpenIntervals) {
6040           unsigned ClassID =
6041               TTI.getRegisterClassForType(false, Inst->getType());
6042           // FIXME: The target might use more than one register for the type
6043           // even in the scalar case.
6044           RegUsage[ClassID] += 1;
6045         }
6046       } else {
6047         collectUniformsAndScalars(VFs[j]);
6048         for (auto *Inst : OpenIntervals) {
6049           // Skip ignored values for VF > 1.
6050           if (VecValuesToIgnore.count(Inst))
6051             continue;
6052           if (isScalarAfterVectorization(Inst, VFs[j])) {
6053             unsigned ClassID =
6054                 TTI.getRegisterClassForType(false, Inst->getType());
6055             // FIXME: The target might use more than one register for the type
6056             // even in the scalar case.
6057             RegUsage[ClassID] += 1;
6058           } else {
6059             unsigned ClassID =
6060                 TTI.getRegisterClassForType(true, Inst->getType());
6061             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6062           }
6063         }
6064       }
6065 
6066       for (auto& pair : RegUsage) {
6067         auto &Entry = MaxUsages[j][pair.first];
6068         Entry = std::max(Entry, pair.second);
6069       }
6070     }
6071 
6072     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6073                       << OpenIntervals.size() << '\n');
6074 
6075     // Add the current instruction to the list of open intervals.
6076     OpenIntervals.insert(I);
6077   }
6078 
6079   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6080     // Note that elements in this SmallMapVector will be default constructed
6081     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
6082     // there is no previous entry for ClassID.
6083     SmallMapVector<unsigned, unsigned, 4> Invariant;
6084 
6085     for (auto *Inst : LoopInvariants) {
6086       // FIXME: The target might use more than one register for the type
6087       // even in the scalar case.
6088       bool IsScalar = all_of(Inst->users(), [&](User *U) {
6089         auto *I = cast<Instruction>(U);
6090         return TheLoop != LI->getLoopFor(I->getParent()) ||
6091                isScalarAfterVectorization(I, VFs[i]);
6092       });
6093 
6094       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
6095       unsigned ClassID =
6096           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
6097       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
6098     }
6099 
6100     LLVM_DEBUG({
6101       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6102       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6103              << " item\n";
6104       for (const auto &pair : MaxUsages[i]) {
6105         dbgs() << "LV(REG): RegisterClass: "
6106                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6107                << " registers\n";
6108       }
6109       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6110              << " item\n";
6111       for (const auto &pair : Invariant) {
6112         dbgs() << "LV(REG): RegisterClass: "
6113                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6114                << " registers\n";
6115       }
6116     });
6117 
6118     RU.LoopInvariantRegs = Invariant;
6119     RU.MaxLocalUsers = MaxUsages[i];
6120     RUs[i] = RU;
6121   }
6122 
6123   return RUs;
6124 }
6125 
6126 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6127                                                            ElementCount VF) {
6128   // TODO: Cost model for emulated masked load/store is completely
6129   // broken. This hack guides the cost model to use an artificially
6130   // high enough value to practically disable vectorization with such
6131   // operations, except where previously deployed legality hack allowed
6132   // using very low cost values. This is to avoid regressions coming simply
6133   // from moving "masked load/store" check from legality to cost model.
6134   // Masked Load/Gather emulation was previously never allowed.
6135   // Limited number of Masked Store/Scatter emulation was allowed.
6136   assert((isPredicatedInst(I)) &&
6137          "Expecting a scalar emulated instruction");
6138   return isa<LoadInst>(I) ||
6139          (isa<StoreInst>(I) &&
6140           NumPredStores > NumberOfStoresToPredicate);
6141 }
6142 
6143 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6144   // If we aren't vectorizing the loop, or if we've already collected the
6145   // instructions to scalarize, there's nothing to do. Collection may already
6146   // have occurred if we have a user-selected VF and are now computing the
6147   // expected cost for interleaving.
6148   if (VF.isScalar() || VF.isZero() ||
6149       InstsToScalarize.find(VF) != InstsToScalarize.end())
6150     return;
6151 
6152   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
6153   // not profitable to scalarize any instructions, the presence of VF in the
6154   // map will indicate that we've analyzed it already.
6155   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6156 
6157   PredicatedBBsAfterVectorization[VF].clear();
6158 
6159   // Find all the instructions that are scalar with predication in the loop and
6160   // determine if it would be better to not if-convert the blocks they are in.
6161   // If so, we also record the instructions to scalarize.
6162   for (BasicBlock *BB : TheLoop->blocks()) {
6163     if (!blockNeedsPredicationForAnyReason(BB))
6164       continue;
6165     for (Instruction &I : *BB)
6166       if (isScalarWithPredication(&I, VF)) {
6167         ScalarCostsTy ScalarCosts;
6168         // Do not apply discount if scalable, because that would lead to
6169         // invalid scalarization costs.
6170         // Do not apply discount logic if hacked cost is needed
6171         // for emulated masked memrefs.
6172         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6173             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6174           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6175         // Remember that BB will remain after vectorization.
6176         PredicatedBBsAfterVectorization[VF].insert(BB);
6177       }
6178   }
6179 }
6180 
6181 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
6182     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6183   assert(!isUniformAfterVectorization(PredInst, VF) &&
6184          "Instruction marked uniform-after-vectorization will be predicated");
6185 
6186   // Initialize the discount to zero, meaning that the scalar version and the
6187   // vector version cost the same.
6188   InstructionCost Discount = 0;
6189 
6190   // Holds instructions to analyze. The instructions we visit are mapped in
6191   // ScalarCosts. Those instructions are the ones that would be scalarized if
6192   // we find that the scalar version costs less.
6193   SmallVector<Instruction *, 8> Worklist;
6194 
6195   // Returns true if the given instruction can be scalarized.
6196   auto canBeScalarized = [&](Instruction *I) -> bool {
6197     // We only attempt to scalarize instructions forming a single-use chain
6198     // from the original predicated block that would otherwise be vectorized.
6199     // Although not strictly necessary, we give up on instructions we know will
6200     // already be scalar to avoid traversing chains that are unlikely to be
6201     // beneficial.
6202     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6203         isScalarAfterVectorization(I, VF))
6204       return false;
6205 
6206     // If the instruction is scalar with predication, it will be analyzed
6207     // separately. We ignore it within the context of PredInst.
6208     if (isScalarWithPredication(I, VF))
6209       return false;
6210 
6211     // If any of the instruction's operands are uniform after vectorization,
6212     // the instruction cannot be scalarized. This prevents, for example, a
6213     // masked load from being scalarized.
6214     //
6215     // We assume we will only emit a value for lane zero of an instruction
6216     // marked uniform after vectorization, rather than VF identical values.
6217     // Thus, if we scalarize an instruction that uses a uniform, we would
6218     // create uses of values corresponding to the lanes we aren't emitting code
6219     // for. This behavior can be changed by allowing getScalarValue to clone
6220     // the lane zero values for uniforms rather than asserting.
6221     for (Use &U : I->operands())
6222       if (auto *J = dyn_cast<Instruction>(U.get()))
6223         if (isUniformAfterVectorization(J, VF))
6224           return false;
6225 
6226     // Otherwise, we can scalarize the instruction.
6227     return true;
6228   };
6229 
6230   // Compute the expected cost discount from scalarizing the entire expression
6231   // feeding the predicated instruction. We currently only consider expressions
6232   // that are single-use instruction chains.
6233   Worklist.push_back(PredInst);
6234   while (!Worklist.empty()) {
6235     Instruction *I = Worklist.pop_back_val();
6236 
6237     // If we've already analyzed the instruction, there's nothing to do.
6238     if (ScalarCosts.find(I) != ScalarCosts.end())
6239       continue;
6240 
6241     // Compute the cost of the vector instruction. Note that this cost already
6242     // includes the scalarization overhead of the predicated instruction.
6243     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6244 
6245     // Compute the cost of the scalarized instruction. This cost is the cost of
6246     // the instruction as if it wasn't if-converted and instead remained in the
6247     // predicated block. We will scale this cost by block probability after
6248     // computing the scalarization overhead.
6249     InstructionCost ScalarCost =
6250         VF.getFixedValue() *
6251         getInstructionCost(I, ElementCount::getFixed(1)).first;
6252 
6253     // Compute the scalarization overhead of needed insertelement instructions
6254     // and phi nodes.
6255     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6256     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6257       ScalarCost += TTI.getScalarizationOverhead(
6258           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6259           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
6260           /*Extract*/ false, CostKind);
6261       ScalarCost +=
6262           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6263     }
6264 
6265     // Compute the scalarization overhead of needed extractelement
6266     // instructions. For each of the instruction's operands, if the operand can
6267     // be scalarized, add it to the worklist; otherwise, account for the
6268     // overhead.
6269     for (Use &U : I->operands())
6270       if (auto *J = dyn_cast<Instruction>(U.get())) {
6271         assert(VectorType::isValidElementType(J->getType()) &&
6272                "Instruction has non-scalar type");
6273         if (canBeScalarized(J))
6274           Worklist.push_back(J);
6275         else if (needsExtract(J, VF)) {
6276           ScalarCost += TTI.getScalarizationOverhead(
6277               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6278               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
6279               /*Extract*/ true, CostKind);
6280         }
6281       }
6282 
6283     // Scale the total scalar cost by block probability.
6284     ScalarCost /= getReciprocalPredBlockProb();
6285 
6286     // Compute the discount. A non-negative discount means the vector version
6287     // of the instruction costs more, and scalarizing would be beneficial.
6288     Discount += VectorCost - ScalarCost;
6289     ScalarCosts[I] = ScalarCost;
6290   }
6291 
6292   return Discount;
6293 }
6294 
6295 LoopVectorizationCostModel::VectorizationCostTy
6296 LoopVectorizationCostModel::expectedCost(
6297     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6298   VectorizationCostTy Cost;
6299 
6300   // For each block.
6301   for (BasicBlock *BB : TheLoop->blocks()) {
6302     VectorizationCostTy BlockCost;
6303 
6304     // For each instruction in the old loop.
6305     for (Instruction &I : BB->instructionsWithoutDebug()) {
6306       // Skip ignored values.
6307       if (ValuesToIgnore.count(&I) ||
6308           (VF.isVector() && VecValuesToIgnore.count(&I)))
6309         continue;
6310 
6311       VectorizationCostTy C = getInstructionCost(&I, VF);
6312 
6313       // Check if we should override the cost.
6314       if (C.first.isValid() &&
6315           ForceTargetInstructionCost.getNumOccurrences() > 0)
6316         C.first = InstructionCost(ForceTargetInstructionCost);
6317 
6318       // Keep a list of instructions with invalid costs.
6319       if (Invalid && !C.first.isValid())
6320         Invalid->emplace_back(&I, VF);
6321 
6322       BlockCost.first += C.first;
6323       BlockCost.second |= C.second;
6324       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6325                         << " for VF " << VF << " For instruction: " << I
6326                         << '\n');
6327     }
6328 
6329     // If we are vectorizing a predicated block, it will have been
6330     // if-converted. This means that the block's instructions (aside from
6331     // stores and instructions that may divide by zero) will now be
6332     // unconditionally executed. For the scalar case, we may not always execute
6333     // the predicated block, if it is an if-else block. Thus, scale the block's
6334     // cost by the probability of executing it. blockNeedsPredication from
6335     // Legal is used so as to not include all blocks in tail folded loops.
6336     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6337       BlockCost.first /= getReciprocalPredBlockProb();
6338 
6339     Cost.first += BlockCost.first;
6340     Cost.second |= BlockCost.second;
6341   }
6342 
6343   return Cost;
6344 }
6345 
6346 /// Gets Address Access SCEV after verifying that the access pattern
6347 /// is loop invariant except the induction variable dependence.
6348 ///
6349 /// This SCEV can be sent to the Target in order to estimate the address
6350 /// calculation cost.
6351 static const SCEV *getAddressAccessSCEV(
6352               Value *Ptr,
6353               LoopVectorizationLegality *Legal,
6354               PredicatedScalarEvolution &PSE,
6355               const Loop *TheLoop) {
6356 
6357   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6358   if (!Gep)
6359     return nullptr;
6360 
6361   // We are looking for a gep with all loop invariant indices except for one
6362   // which should be an induction variable.
6363   auto SE = PSE.getSE();
6364   unsigned NumOperands = Gep->getNumOperands();
6365   for (unsigned i = 1; i < NumOperands; ++i) {
6366     Value *Opd = Gep->getOperand(i);
6367     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6368         !Legal->isInductionVariable(Opd))
6369       return nullptr;
6370   }
6371 
6372   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6373   return PSE.getSCEV(Ptr);
6374 }
6375 
6376 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6377   return Legal->hasStride(I->getOperand(0)) ||
6378          Legal->hasStride(I->getOperand(1));
6379 }
6380 
6381 InstructionCost
6382 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6383                                                         ElementCount VF) {
6384   assert(VF.isVector() &&
6385          "Scalarization cost of instruction implies vectorization.");
6386   if (VF.isScalable())
6387     return InstructionCost::getInvalid();
6388 
6389   Type *ValTy = getLoadStoreType(I);
6390   auto SE = PSE.getSE();
6391 
6392   unsigned AS = getLoadStoreAddressSpace(I);
6393   Value *Ptr = getLoadStorePointerOperand(I);
6394   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6395   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6396   //       that it is being called from this specific place.
6397 
6398   // Figure out whether the access is strided and get the stride value
6399   // if it's known in compile time
6400   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6401 
6402   // Get the cost of the scalar memory instruction and address computation.
6403   InstructionCost Cost =
6404       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6405 
6406   // Don't pass *I here, since it is scalar but will actually be part of a
6407   // vectorized loop where the user of it is a vectorized instruction.
6408   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6409   const Align Alignment = getLoadStoreAlignment(I);
6410   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6411                                                       ValTy->getScalarType(),
6412                                                       Alignment, AS, CostKind);
6413 
6414   // Get the overhead of the extractelement and insertelement instructions
6415   // we might create due to scalarization.
6416   Cost += getScalarizationOverhead(I, VF, CostKind);
6417 
6418   // If we have a predicated load/store, it will need extra i1 extracts and
6419   // conditional branches, but may not be executed for each vector lane. Scale
6420   // the cost by the probability of executing the predicated block.
6421   if (isPredicatedInst(I)) {
6422     Cost /= getReciprocalPredBlockProb();
6423 
6424     // Add the cost of an i1 extract and a branch
6425     auto *Vec_i1Ty =
6426         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6427     Cost += TTI.getScalarizationOverhead(
6428         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6429         /*Insert=*/false, /*Extract=*/true, CostKind);
6430     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6431 
6432     if (useEmulatedMaskMemRefHack(I, VF))
6433       // Artificially setting to a high enough value to practically disable
6434       // vectorization with such operations.
6435       Cost = 3000000;
6436   }
6437 
6438   return Cost;
6439 }
6440 
6441 InstructionCost
6442 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6443                                                     ElementCount VF) {
6444   Type *ValTy = getLoadStoreType(I);
6445   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6446   Value *Ptr = getLoadStorePointerOperand(I);
6447   unsigned AS = getLoadStoreAddressSpace(I);
6448   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6449   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6450 
6451   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6452          "Stride should be 1 or -1 for consecutive memory access");
6453   const Align Alignment = getLoadStoreAlignment(I);
6454   InstructionCost Cost = 0;
6455   if (Legal->isMaskRequired(I)) {
6456     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6457                                       CostKind);
6458   } else {
6459     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6460     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6461                                 CostKind, OpInfo, I);
6462   }
6463 
6464   bool Reverse = ConsecutiveStride < 0;
6465   if (Reverse)
6466     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6467                                std::nullopt, CostKind, 0);
6468   return Cost;
6469 }
6470 
6471 InstructionCost
6472 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6473                                                 ElementCount VF) {
6474   assert(Legal->isUniformMemOp(*I));
6475 
6476   Type *ValTy = getLoadStoreType(I);
6477   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6478   const Align Alignment = getLoadStoreAlignment(I);
6479   unsigned AS = getLoadStoreAddressSpace(I);
6480   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6481   if (isa<LoadInst>(I)) {
6482     return TTI.getAddressComputationCost(ValTy) +
6483            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6484                                CostKind) +
6485            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6486   }
6487   StoreInst *SI = cast<StoreInst>(I);
6488 
6489   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6490   return TTI.getAddressComputationCost(ValTy) +
6491          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6492                              CostKind) +
6493          (isLoopInvariantStoreValue
6494               ? 0
6495               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6496                                        CostKind, VF.getKnownMinValue() - 1));
6497 }
6498 
6499 InstructionCost
6500 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6501                                                  ElementCount VF) {
6502   Type *ValTy = getLoadStoreType(I);
6503   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6504   const Align Alignment = getLoadStoreAlignment(I);
6505   const Value *Ptr = getLoadStorePointerOperand(I);
6506 
6507   return TTI.getAddressComputationCost(VectorTy) +
6508          TTI.getGatherScatterOpCost(
6509              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6510              TargetTransformInfo::TCK_RecipThroughput, I);
6511 }
6512 
6513 InstructionCost
6514 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6515                                                    ElementCount VF) {
6516   // TODO: Once we have support for interleaving with scalable vectors
6517   // we can calculate the cost properly here.
6518   if (VF.isScalable())
6519     return InstructionCost::getInvalid();
6520 
6521   Type *ValTy = getLoadStoreType(I);
6522   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6523   unsigned AS = getLoadStoreAddressSpace(I);
6524   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6525 
6526   auto Group = getInterleavedAccessGroup(I);
6527   assert(Group && "Fail to get an interleaved access group.");
6528 
6529   unsigned InterleaveFactor = Group->getFactor();
6530   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6531 
6532   // Holds the indices of existing members in the interleaved group.
6533   SmallVector<unsigned, 4> Indices;
6534   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6535     if (Group->getMember(IF))
6536       Indices.push_back(IF);
6537 
6538   // Calculate the cost of the whole interleaved group.
6539   bool UseMaskForGaps =
6540       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6541       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6542   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6543       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6544       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6545 
6546   if (Group->isReverse()) {
6547     // TODO: Add support for reversed masked interleaved access.
6548     assert(!Legal->isMaskRequired(I) &&
6549            "Reverse masked interleaved access not supported.");
6550     Cost += Group->getNumMembers() *
6551             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6552                                std::nullopt, CostKind, 0);
6553   }
6554   return Cost;
6555 }
6556 
6557 std::optional<InstructionCost>
6558 LoopVectorizationCostModel::getReductionPatternCost(
6559     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6560   using namespace llvm::PatternMatch;
6561   // Early exit for no inloop reductions
6562   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6563     return std::nullopt;
6564   auto *VectorTy = cast<VectorType>(Ty);
6565 
6566   // We are looking for a pattern of, and finding the minimal acceptable cost:
6567   //  reduce(mul(ext(A), ext(B))) or
6568   //  reduce(mul(A, B)) or
6569   //  reduce(ext(A)) or
6570   //  reduce(A).
6571   // The basic idea is that we walk down the tree to do that, finding the root
6572   // reduction instruction in InLoopReductionImmediateChains. From there we find
6573   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6574   // of the components. If the reduction cost is lower then we return it for the
6575   // reduction instruction and 0 for the other instructions in the pattern. If
6576   // it is not we return an invalid cost specifying the orignal cost method
6577   // should be used.
6578   Instruction *RetI = I;
6579   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6580     if (!RetI->hasOneUser())
6581       return std::nullopt;
6582     RetI = RetI->user_back();
6583   }
6584 
6585   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6586       RetI->user_back()->getOpcode() == Instruction::Add) {
6587     RetI = RetI->user_back();
6588   }
6589 
6590   // Test if the found instruction is a reduction, and if not return an invalid
6591   // cost specifying the parent to use the original cost modelling.
6592   if (!InLoopReductionImmediateChains.count(RetI))
6593     return std::nullopt;
6594 
6595   // Find the reduction this chain is a part of and calculate the basic cost of
6596   // the reduction on its own.
6597   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6598   Instruction *ReductionPhi = LastChain;
6599   while (!isa<PHINode>(ReductionPhi))
6600     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6601 
6602   const RecurrenceDescriptor &RdxDesc =
6603       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6604 
6605   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6606       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6607 
6608   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6609   // normal fmul instruction to the cost of the fadd reduction.
6610   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6611     BaseCost +=
6612         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6613 
6614   // If we're using ordered reductions then we can just return the base cost
6615   // here, since getArithmeticReductionCost calculates the full ordered
6616   // reduction cost when FP reassociation is not allowed.
6617   if (useOrderedReductions(RdxDesc))
6618     return BaseCost;
6619 
6620   // Get the operand that was not the reduction chain and match it to one of the
6621   // patterns, returning the better cost if it is found.
6622   Instruction *RedOp = RetI->getOperand(1) == LastChain
6623                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6624                            : dyn_cast<Instruction>(RetI->getOperand(1));
6625 
6626   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6627 
6628   Instruction *Op0, *Op1;
6629   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6630       match(RedOp,
6631             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6632       match(Op0, m_ZExtOrSExt(m_Value())) &&
6633       Op0->getOpcode() == Op1->getOpcode() &&
6634       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6635       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6636       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6637 
6638     // Matched reduce.add(ext(mul(ext(A), ext(B)))
6639     // Note that the extend opcodes need to all match, or if A==B they will have
6640     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6641     // which is equally fine.
6642     bool IsUnsigned = isa<ZExtInst>(Op0);
6643     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6644     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6645 
6646     InstructionCost ExtCost =
6647         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6648                              TTI::CastContextHint::None, CostKind, Op0);
6649     InstructionCost MulCost =
6650         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6651     InstructionCost Ext2Cost =
6652         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6653                              TTI::CastContextHint::None, CostKind, RedOp);
6654 
6655     InstructionCost RedCost = TTI.getMulAccReductionCost(
6656         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6657 
6658     if (RedCost.isValid() &&
6659         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6660       return I == RetI ? RedCost : 0;
6661   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6662              !TheLoop->isLoopInvariant(RedOp)) {
6663     // Matched reduce(ext(A))
6664     bool IsUnsigned = isa<ZExtInst>(RedOp);
6665     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6666     InstructionCost RedCost = TTI.getExtendedReductionCost(
6667         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6668         RdxDesc.getFastMathFlags(), CostKind);
6669 
6670     InstructionCost ExtCost =
6671         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6672                              TTI::CastContextHint::None, CostKind, RedOp);
6673     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6674       return I == RetI ? RedCost : 0;
6675   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6676              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6677     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6678         Op0->getOpcode() == Op1->getOpcode() &&
6679         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6680       bool IsUnsigned = isa<ZExtInst>(Op0);
6681       Type *Op0Ty = Op0->getOperand(0)->getType();
6682       Type *Op1Ty = Op1->getOperand(0)->getType();
6683       Type *LargestOpTy =
6684           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6685                                                                     : Op0Ty;
6686       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6687 
6688       // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6689       // different sizes. We take the largest type as the ext to reduce, and add
6690       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6691       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6692           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6693           TTI::CastContextHint::None, CostKind, Op0);
6694       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6695           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6696           TTI::CastContextHint::None, CostKind, Op1);
6697       InstructionCost MulCost =
6698           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6699 
6700       InstructionCost RedCost = TTI.getMulAccReductionCost(
6701           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6702       InstructionCost ExtraExtCost = 0;
6703       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6704         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6705         ExtraExtCost = TTI.getCastInstrCost(
6706             ExtraExtOp->getOpcode(), ExtType,
6707             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6708             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6709       }
6710 
6711       if (RedCost.isValid() &&
6712           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6713         return I == RetI ? RedCost : 0;
6714     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6715       // Matched reduce.add(mul())
6716       InstructionCost MulCost =
6717           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6718 
6719       InstructionCost RedCost = TTI.getMulAccReductionCost(
6720           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6721 
6722       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6723         return I == RetI ? RedCost : 0;
6724     }
6725   }
6726 
6727   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6728 }
6729 
6730 InstructionCost
6731 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6732                                                      ElementCount VF) {
6733   // Calculate scalar cost only. Vectorization cost should be ready at this
6734   // moment.
6735   if (VF.isScalar()) {
6736     Type *ValTy = getLoadStoreType(I);
6737     const Align Alignment = getLoadStoreAlignment(I);
6738     unsigned AS = getLoadStoreAddressSpace(I);
6739 
6740     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6741     return TTI.getAddressComputationCost(ValTy) +
6742            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6743                                TTI::TCK_RecipThroughput, OpInfo, I);
6744   }
6745   return getWideningCost(I, VF);
6746 }
6747 
6748 LoopVectorizationCostModel::VectorizationCostTy
6749 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6750                                                ElementCount VF) {
6751   // If we know that this instruction will remain uniform, check the cost of
6752   // the scalar version.
6753   if (isUniformAfterVectorization(I, VF))
6754     VF = ElementCount::getFixed(1);
6755 
6756   if (VF.isVector() && isProfitableToScalarize(I, VF))
6757     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6758 
6759   // Forced scalars do not have any scalarization overhead.
6760   auto ForcedScalar = ForcedScalars.find(VF);
6761   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6762     auto InstSet = ForcedScalar->second;
6763     if (InstSet.count(I))
6764       return VectorizationCostTy(
6765           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6766            VF.getKnownMinValue()),
6767           false);
6768   }
6769 
6770   Type *VectorTy;
6771   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6772 
6773   bool TypeNotScalarized = false;
6774   if (VF.isVector() && VectorTy->isVectorTy()) {
6775     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6776       if (VF.isScalable())
6777         // <vscale x 1 x iN> is assumed to be profitable over iN because
6778         // scalable registers are a distinct register class from scalar ones.
6779         // If we ever find a target which wants to lower scalable vectors
6780         // back to scalars, we'll need to update this code to explicitly
6781         // ask TTI about the register class uses for each part.
6782         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6783       else
6784         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6785     } else
6786       C = InstructionCost::getInvalid();
6787   }
6788   return VectorizationCostTy(C, TypeNotScalarized);
6789 }
6790 
6791 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6792     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6793 
6794   // There is no mechanism yet to create a scalable scalarization loop,
6795   // so this is currently Invalid.
6796   if (VF.isScalable())
6797     return InstructionCost::getInvalid();
6798 
6799   if (VF.isScalar())
6800     return 0;
6801 
6802   InstructionCost Cost = 0;
6803   Type *RetTy = ToVectorTy(I->getType(), VF);
6804   if (!RetTy->isVoidTy() &&
6805       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6806     Cost += TTI.getScalarizationOverhead(
6807         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6808         /*Insert*/ true,
6809         /*Extract*/ false, CostKind);
6810 
6811   // Some targets keep addresses scalar.
6812   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6813     return Cost;
6814 
6815   // Some targets support efficient element stores.
6816   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6817     return Cost;
6818 
6819   // Collect operands to consider.
6820   CallInst *CI = dyn_cast<CallInst>(I);
6821   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6822 
6823   // Skip operands that do not require extraction/scalarization and do not incur
6824   // any overhead.
6825   SmallVector<Type *> Tys;
6826   for (auto *V : filterExtractingOperands(Ops, VF))
6827     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6828   return Cost + TTI.getOperandsScalarizationOverhead(
6829                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6830 }
6831 
6832 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6833   if (VF.isScalar())
6834     return;
6835   NumPredStores = 0;
6836   for (BasicBlock *BB : TheLoop->blocks()) {
6837     // For each instruction in the old loop.
6838     for (Instruction &I : *BB) {
6839       Value *Ptr =  getLoadStorePointerOperand(&I);
6840       if (!Ptr)
6841         continue;
6842 
6843       // TODO: We should generate better code and update the cost model for
6844       // predicated uniform stores. Today they are treated as any other
6845       // predicated store (see added test cases in
6846       // invariant-store-vectorization.ll).
6847       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6848         NumPredStores++;
6849 
6850       if (Legal->isUniformMemOp(I)) {
6851         auto isLegalToScalarize = [&]() {
6852           if (!VF.isScalable())
6853             // Scalarization of fixed length vectors "just works".
6854             return true;
6855 
6856           // We have dedicated lowering for unpredicated uniform loads and
6857           // stores.  Note that even with tail folding we know that at least
6858           // one lane is active (i.e. generalized predication is not possible
6859           // here), and the logic below depends on this fact.
6860           if (!foldTailByMasking())
6861             return true;
6862 
6863           // For scalable vectors, a uniform memop load is always
6864           // uniform-by-parts  and we know how to scalarize that.
6865           if (isa<LoadInst>(I))
6866             return true;
6867 
6868           // A uniform store isn't neccessarily uniform-by-part
6869           // and we can't assume scalarization.
6870           auto &SI = cast<StoreInst>(I);
6871           return TheLoop->isLoopInvariant(SI.getValueOperand());
6872         };
6873 
6874         const InstructionCost GatherScatterCost =
6875           isLegalGatherOrScatter(&I, VF) ?
6876           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6877 
6878         // Load: Scalar load + broadcast
6879         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6880         // FIXME: This cost is a significant under-estimate for tail folded
6881         // memory ops.
6882         const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6883           getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6884 
6885         // Choose better solution for the current VF,  Note that Invalid
6886         // costs compare as maximumal large.  If both are invalid, we get
6887         // scalable invalid which signals a failure and a vectorization abort.
6888         if (GatherScatterCost < ScalarizationCost)
6889           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6890         else
6891           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6892         continue;
6893       }
6894 
6895       // We assume that widening is the best solution when possible.
6896       if (memoryInstructionCanBeWidened(&I, VF)) {
6897         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6898         int ConsecutiveStride = Legal->isConsecutivePtr(
6899             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6900         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6901                "Expected consecutive stride.");
6902         InstWidening Decision =
6903             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6904         setWideningDecision(&I, VF, Decision, Cost);
6905         continue;
6906       }
6907 
6908       // Choose between Interleaving, Gather/Scatter or Scalarization.
6909       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6910       unsigned NumAccesses = 1;
6911       if (isAccessInterleaved(&I)) {
6912         auto Group = getInterleavedAccessGroup(&I);
6913         assert(Group && "Fail to get an interleaved access group.");
6914 
6915         // Make one decision for the whole group.
6916         if (getWideningDecision(&I, VF) != CM_Unknown)
6917           continue;
6918 
6919         NumAccesses = Group->getNumMembers();
6920         if (interleavedAccessCanBeWidened(&I, VF))
6921           InterleaveCost = getInterleaveGroupCost(&I, VF);
6922       }
6923 
6924       InstructionCost GatherScatterCost =
6925           isLegalGatherOrScatter(&I, VF)
6926               ? getGatherScatterCost(&I, VF) * NumAccesses
6927               : InstructionCost::getInvalid();
6928 
6929       InstructionCost ScalarizationCost =
6930           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6931 
6932       // Choose better solution for the current VF,
6933       // write down this decision and use it during vectorization.
6934       InstructionCost Cost;
6935       InstWidening Decision;
6936       if (InterleaveCost <= GatherScatterCost &&
6937           InterleaveCost < ScalarizationCost) {
6938         Decision = CM_Interleave;
6939         Cost = InterleaveCost;
6940       } else if (GatherScatterCost < ScalarizationCost) {
6941         Decision = CM_GatherScatter;
6942         Cost = GatherScatterCost;
6943       } else {
6944         Decision = CM_Scalarize;
6945         Cost = ScalarizationCost;
6946       }
6947       // If the instructions belongs to an interleave group, the whole group
6948       // receives the same decision. The whole group receives the cost, but
6949       // the cost will actually be assigned to one instruction.
6950       if (auto Group = getInterleavedAccessGroup(&I))
6951         setWideningDecision(Group, VF, Decision, Cost);
6952       else
6953         setWideningDecision(&I, VF, Decision, Cost);
6954     }
6955   }
6956 
6957   // Make sure that any load of address and any other address computation
6958   // remains scalar unless there is gather/scatter support. This avoids
6959   // inevitable extracts into address registers, and also has the benefit of
6960   // activating LSR more, since that pass can't optimize vectorized
6961   // addresses.
6962   if (TTI.prefersVectorizedAddressing())
6963     return;
6964 
6965   // Start with all scalar pointer uses.
6966   SmallPtrSet<Instruction *, 8> AddrDefs;
6967   for (BasicBlock *BB : TheLoop->blocks())
6968     for (Instruction &I : *BB) {
6969       Instruction *PtrDef =
6970         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6971       if (PtrDef && TheLoop->contains(PtrDef) &&
6972           getWideningDecision(&I, VF) != CM_GatherScatter)
6973         AddrDefs.insert(PtrDef);
6974     }
6975 
6976   // Add all instructions used to generate the addresses.
6977   SmallVector<Instruction *, 4> Worklist;
6978   append_range(Worklist, AddrDefs);
6979   while (!Worklist.empty()) {
6980     Instruction *I = Worklist.pop_back_val();
6981     for (auto &Op : I->operands())
6982       if (auto *InstOp = dyn_cast<Instruction>(Op))
6983         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6984             AddrDefs.insert(InstOp).second)
6985           Worklist.push_back(InstOp);
6986   }
6987 
6988   for (auto *I : AddrDefs) {
6989     if (isa<LoadInst>(I)) {
6990       // Setting the desired widening decision should ideally be handled in
6991       // by cost functions, but since this involves the task of finding out
6992       // if the loaded register is involved in an address computation, it is
6993       // instead changed here when we know this is the case.
6994       InstWidening Decision = getWideningDecision(I, VF);
6995       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6996         // Scalarize a widened load of address.
6997         setWideningDecision(
6998             I, VF, CM_Scalarize,
6999             (VF.getKnownMinValue() *
7000              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7001       else if (auto Group = getInterleavedAccessGroup(I)) {
7002         // Scalarize an interleave group of address loads.
7003         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7004           if (Instruction *Member = Group->getMember(I))
7005             setWideningDecision(
7006                 Member, VF, CM_Scalarize,
7007                 (VF.getKnownMinValue() *
7008                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7009         }
7010       }
7011     } else
7012       // Make sure I gets scalarized and a cost estimate without
7013       // scalarization overhead.
7014       ForcedScalars[VF].insert(I);
7015   }
7016 }
7017 
7018 InstructionCost
7019 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7020                                                Type *&VectorTy) {
7021   Type *RetTy = I->getType();
7022   if (canTruncateToMinimalBitwidth(I, VF))
7023     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7024   auto SE = PSE.getSE();
7025   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7026 
7027   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7028                                                 ElementCount VF) -> bool {
7029     if (VF.isScalar())
7030       return true;
7031 
7032     auto Scalarized = InstsToScalarize.find(VF);
7033     assert(Scalarized != InstsToScalarize.end() &&
7034            "VF not yet analyzed for scalarization profitability");
7035     return !Scalarized->second.count(I) &&
7036            llvm::all_of(I->users(), [&](User *U) {
7037              auto *UI = cast<Instruction>(U);
7038              return !Scalarized->second.count(UI);
7039            });
7040   };
7041   (void) hasSingleCopyAfterVectorization;
7042 
7043   if (isScalarAfterVectorization(I, VF)) {
7044     // With the exception of GEPs and PHIs, after scalarization there should
7045     // only be one copy of the instruction generated in the loop. This is
7046     // because the VF is either 1, or any instructions that need scalarizing
7047     // have already been dealt with by the the time we get here. As a result,
7048     // it means we don't have to multiply the instruction cost by VF.
7049     assert(I->getOpcode() == Instruction::GetElementPtr ||
7050            I->getOpcode() == Instruction::PHI ||
7051            (I->getOpcode() == Instruction::BitCast &&
7052             I->getType()->isPointerTy()) ||
7053            hasSingleCopyAfterVectorization(I, VF));
7054     VectorTy = RetTy;
7055   } else
7056     VectorTy = ToVectorTy(RetTy, VF);
7057 
7058   // TODO: We need to estimate the cost of intrinsic calls.
7059   switch (I->getOpcode()) {
7060   case Instruction::GetElementPtr:
7061     // We mark this instruction as zero-cost because the cost of GEPs in
7062     // vectorized code depends on whether the corresponding memory instruction
7063     // is scalarized or not. Therefore, we handle GEPs with the memory
7064     // instruction cost.
7065     return 0;
7066   case Instruction::Br: {
7067     // In cases of scalarized and predicated instructions, there will be VF
7068     // predicated blocks in the vectorized loop. Each branch around these
7069     // blocks requires also an extract of its vector compare i1 element.
7070     bool ScalarPredicatedBB = false;
7071     BranchInst *BI = cast<BranchInst>(I);
7072     if (VF.isVector() && BI->isConditional() &&
7073         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
7074          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
7075       ScalarPredicatedBB = true;
7076 
7077     if (ScalarPredicatedBB) {
7078       // Not possible to scalarize scalable vector with predicated instructions.
7079       if (VF.isScalable())
7080         return InstructionCost::getInvalid();
7081       // Return cost for branches around scalarized and predicated blocks.
7082       auto *Vec_i1Ty =
7083           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7084       return (
7085           TTI.getScalarizationOverhead(
7086               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
7087               /*Insert*/ false, /*Extract*/ true, CostKind) +
7088           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7089     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7090       // The back-edge branch will remain, as will all scalar branches.
7091       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7092     else
7093       // This branch will be eliminated by if-conversion.
7094       return 0;
7095     // Note: We currently assume zero cost for an unconditional branch inside
7096     // a predicated block since it will become a fall-through, although we
7097     // may decide in the future to call TTI for all branches.
7098   }
7099   case Instruction::PHI: {
7100     auto *Phi = cast<PHINode>(I);
7101 
7102     // First-order recurrences are replaced by vector shuffles inside the loop.
7103     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
7104       SmallVector<int> Mask(VF.getKnownMinValue());
7105       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
7106       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
7107                                 cast<VectorType>(VectorTy), Mask, CostKind,
7108                                 VF.getKnownMinValue() - 1);
7109     }
7110 
7111     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7112     // converted into select instructions. We require N - 1 selects per phi
7113     // node, where N is the number of incoming values.
7114     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7115       return (Phi->getNumIncomingValues() - 1) *
7116              TTI.getCmpSelInstrCost(
7117                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7118                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7119                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7120 
7121     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7122   }
7123   case Instruction::UDiv:
7124   case Instruction::SDiv:
7125   case Instruction::URem:
7126   case Instruction::SRem:
7127     if (VF.isVector() && isPredicatedInst(I)) {
7128       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
7129       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
7130         ScalarCost : SafeDivisorCost;
7131     }
7132     // We've proven all lanes safe to speculate, fall through.
7133     [[fallthrough]];
7134   case Instruction::Add:
7135   case Instruction::FAdd:
7136   case Instruction::Sub:
7137   case Instruction::FSub:
7138   case Instruction::Mul:
7139   case Instruction::FMul:
7140   case Instruction::FDiv:
7141   case Instruction::FRem:
7142   case Instruction::Shl:
7143   case Instruction::LShr:
7144   case Instruction::AShr:
7145   case Instruction::And:
7146   case Instruction::Or:
7147   case Instruction::Xor: {
7148     // Since we will replace the stride by 1 the multiplication should go away.
7149     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7150       return 0;
7151 
7152     // Detect reduction patterns
7153     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7154       return *RedCost;
7155 
7156     // Certain instructions can be cheaper to vectorize if they have a constant
7157     // second vector operand. One example of this are shifts on x86.
7158     Value *Op2 = I->getOperand(1);
7159     auto Op2Info = TTI.getOperandInfo(Op2);
7160     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7161       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
7162 
7163     SmallVector<const Value *, 4> Operands(I->operand_values());
7164     return TTI.getArithmeticInstrCost(
7165         I->getOpcode(), VectorTy, CostKind,
7166         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7167         Op2Info, Operands, I);
7168   }
7169   case Instruction::FNeg: {
7170     return TTI.getArithmeticInstrCost(
7171         I->getOpcode(), VectorTy, CostKind,
7172         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7173         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7174         I->getOperand(0), I);
7175   }
7176   case Instruction::Select: {
7177     SelectInst *SI = cast<SelectInst>(I);
7178     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7179     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7180 
7181     const Value *Op0, *Op1;
7182     using namespace llvm::PatternMatch;
7183     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7184                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7185       // select x, y, false --> x & y
7186       // select x, true, y --> x | y
7187       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7188       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7189       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7190               Op1->getType()->getScalarSizeInBits() == 1);
7191 
7192       SmallVector<const Value *, 2> Operands{Op0, Op1};
7193       return TTI.getArithmeticInstrCost(
7194           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7195           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7196     }
7197 
7198     Type *CondTy = SI->getCondition()->getType();
7199     if (!ScalarCond)
7200       CondTy = VectorType::get(CondTy, VF);
7201 
7202     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7203     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7204       Pred = Cmp->getPredicate();
7205     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7206                                   CostKind, I);
7207   }
7208   case Instruction::ICmp:
7209   case Instruction::FCmp: {
7210     Type *ValTy = I->getOperand(0)->getType();
7211     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7212     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7213       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7214     VectorTy = ToVectorTy(ValTy, VF);
7215     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7216                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7217                                   I);
7218   }
7219   case Instruction::Store:
7220   case Instruction::Load: {
7221     ElementCount Width = VF;
7222     if (Width.isVector()) {
7223       InstWidening Decision = getWideningDecision(I, Width);
7224       assert(Decision != CM_Unknown &&
7225              "CM decision should be taken at this point");
7226       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7227         return InstructionCost::getInvalid();
7228       if (Decision == CM_Scalarize)
7229         Width = ElementCount::getFixed(1);
7230     }
7231     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7232     return getMemoryInstructionCost(I, VF);
7233   }
7234   case Instruction::BitCast:
7235     if (I->getType()->isPointerTy())
7236       return 0;
7237     [[fallthrough]];
7238   case Instruction::ZExt:
7239   case Instruction::SExt:
7240   case Instruction::FPToUI:
7241   case Instruction::FPToSI:
7242   case Instruction::FPExt:
7243   case Instruction::PtrToInt:
7244   case Instruction::IntToPtr:
7245   case Instruction::SIToFP:
7246   case Instruction::UIToFP:
7247   case Instruction::Trunc:
7248   case Instruction::FPTrunc: {
7249     // Computes the CastContextHint from a Load/Store instruction.
7250     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7251       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7252              "Expected a load or a store!");
7253 
7254       if (VF.isScalar() || !TheLoop->contains(I))
7255         return TTI::CastContextHint::Normal;
7256 
7257       switch (getWideningDecision(I, VF)) {
7258       case LoopVectorizationCostModel::CM_GatherScatter:
7259         return TTI::CastContextHint::GatherScatter;
7260       case LoopVectorizationCostModel::CM_Interleave:
7261         return TTI::CastContextHint::Interleave;
7262       case LoopVectorizationCostModel::CM_Scalarize:
7263       case LoopVectorizationCostModel::CM_Widen:
7264         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7265                                         : TTI::CastContextHint::Normal;
7266       case LoopVectorizationCostModel::CM_Widen_Reverse:
7267         return TTI::CastContextHint::Reversed;
7268       case LoopVectorizationCostModel::CM_Unknown:
7269         llvm_unreachable("Instr did not go through cost modelling?");
7270       }
7271 
7272       llvm_unreachable("Unhandled case!");
7273     };
7274 
7275     unsigned Opcode = I->getOpcode();
7276     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7277     // For Trunc, the context is the only user, which must be a StoreInst.
7278     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7279       if (I->hasOneUse())
7280         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7281           CCH = ComputeCCH(Store);
7282     }
7283     // For Z/Sext, the context is the operand, which must be a LoadInst.
7284     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7285              Opcode == Instruction::FPExt) {
7286       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7287         CCH = ComputeCCH(Load);
7288     }
7289 
7290     // We optimize the truncation of induction variables having constant
7291     // integer steps. The cost of these truncations is the same as the scalar
7292     // operation.
7293     if (isOptimizableIVTruncate(I, VF)) {
7294       auto *Trunc = cast<TruncInst>(I);
7295       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7296                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7297     }
7298 
7299     // Detect reduction patterns
7300     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7301       return *RedCost;
7302 
7303     Type *SrcScalarTy = I->getOperand(0)->getType();
7304     Type *SrcVecTy =
7305         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7306     if (canTruncateToMinimalBitwidth(I, VF)) {
7307       // This cast is going to be shrunk. This may remove the cast or it might
7308       // turn it into slightly different cast. For example, if MinBW == 16,
7309       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7310       //
7311       // Calculate the modified src and dest types.
7312       Type *MinVecTy = VectorTy;
7313       if (Opcode == Instruction::Trunc) {
7314         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7315         VectorTy =
7316             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7317       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7318         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7319         VectorTy =
7320             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7321       }
7322     }
7323 
7324     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7325   }
7326   case Instruction::Call: {
7327     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7328       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7329         return *RedCost;
7330     bool NeedToScalarize;
7331     CallInst *CI = cast<CallInst>(I);
7332     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7333     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7334       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7335       return std::min(CallCost, IntrinsicCost);
7336     }
7337     return CallCost;
7338   }
7339   case Instruction::ExtractValue:
7340     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7341   case Instruction::Alloca:
7342     // We cannot easily widen alloca to a scalable alloca, as
7343     // the result would need to be a vector of pointers.
7344     if (VF.isScalable())
7345       return InstructionCost::getInvalid();
7346     [[fallthrough]];
7347   default:
7348     // This opcode is unknown. Assume that it is the same as 'mul'.
7349     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7350   } // end of switch.
7351 }
7352 
7353 char LoopVectorize::ID = 0;
7354 
7355 static const char lv_name[] = "Loop Vectorization";
7356 
7357 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7358 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7359 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7360 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7361 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7362 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7363 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7364 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7365 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7366 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7367 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7368 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7369 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7370 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7371 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7372 
7373 namespace llvm {
7374 
7375 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7376 
7377 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7378                               bool VectorizeOnlyWhenForced) {
7379   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7380 }
7381 
7382 } // end namespace llvm
7383 
7384 void LoopVectorizationCostModel::collectValuesToIgnore() {
7385   // Ignore ephemeral values.
7386   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7387 
7388   // Find all stores to invariant variables. Since they are going to sink
7389   // outside the loop we do not need calculate cost for them.
7390   for (BasicBlock *BB : TheLoop->blocks())
7391     for (Instruction &I : *BB) {
7392       StoreInst *SI;
7393       if ((SI = dyn_cast<StoreInst>(&I)) &&
7394           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7395         ValuesToIgnore.insert(&I);
7396     }
7397 
7398   // Ignore type-promoting instructions we identified during reduction
7399   // detection.
7400   for (const auto &Reduction : Legal->getReductionVars()) {
7401     const RecurrenceDescriptor &RedDes = Reduction.second;
7402     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7403     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7404   }
7405   // Ignore type-casting instructions we identified during induction
7406   // detection.
7407   for (const auto &Induction : Legal->getInductionVars()) {
7408     const InductionDescriptor &IndDes = Induction.second;
7409     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7410     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7411   }
7412 }
7413 
7414 void LoopVectorizationCostModel::collectInLoopReductions() {
7415   for (const auto &Reduction : Legal->getReductionVars()) {
7416     PHINode *Phi = Reduction.first;
7417     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7418 
7419     // We don't collect reductions that are type promoted (yet).
7420     if (RdxDesc.getRecurrenceType() != Phi->getType())
7421       continue;
7422 
7423     // If the target would prefer this reduction to happen "in-loop", then we
7424     // want to record it as such.
7425     unsigned Opcode = RdxDesc.getOpcode();
7426     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7427         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7428                                    TargetTransformInfo::ReductionFlags()))
7429       continue;
7430 
7431     // Check that we can correctly put the reductions into the loop, by
7432     // finding the chain of operations that leads from the phi to the loop
7433     // exit value.
7434     SmallVector<Instruction *, 4> ReductionOperations =
7435         RdxDesc.getReductionOpChain(Phi, TheLoop);
7436     bool InLoop = !ReductionOperations.empty();
7437     if (InLoop) {
7438       InLoopReductionChains[Phi] = ReductionOperations;
7439       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7440       Instruction *LastChain = Phi;
7441       for (auto *I : ReductionOperations) {
7442         InLoopReductionImmediateChains[I] = LastChain;
7443         LastChain = I;
7444       }
7445     }
7446     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7447                       << " reduction for phi: " << *Phi << "\n");
7448   }
7449 }
7450 
7451 // TODO: we could return a pair of values that specify the max VF and
7452 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7453 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7454 // doesn't have a cost model that can choose which plan to execute if
7455 // more than one is generated.
7456 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7457                                  LoopVectorizationCostModel &CM) {
7458   unsigned WidestType;
7459   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7460   return WidestVectorRegBits / WidestType;
7461 }
7462 
7463 VectorizationFactor
7464 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7465   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7466   ElementCount VF = UserVF;
7467   // Outer loop handling: They may require CFG and instruction level
7468   // transformations before even evaluating whether vectorization is profitable.
7469   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7470   // the vectorization pipeline.
7471   if (!OrigLoop->isInnermost()) {
7472     // If the user doesn't provide a vectorization factor, determine a
7473     // reasonable one.
7474     if (UserVF.isZero()) {
7475       VF = ElementCount::getFixed(determineVPlanVF(
7476           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7477               .getFixedValue(),
7478           CM));
7479       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7480 
7481       // Make sure we have a VF > 1 for stress testing.
7482       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7483         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7484                           << "overriding computed VF.\n");
7485         VF = ElementCount::getFixed(4);
7486       }
7487     }
7488     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7489     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7490            "VF needs to be a power of two");
7491     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7492                       << "VF " << VF << " to build VPlans.\n");
7493     buildVPlans(VF, VF);
7494 
7495     // For VPlan build stress testing, we bail out after VPlan construction.
7496     if (VPlanBuildStressTest)
7497       return VectorizationFactor::Disabled();
7498 
7499     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7500   }
7501 
7502   LLVM_DEBUG(
7503       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7504                 "VPlan-native path.\n");
7505   return VectorizationFactor::Disabled();
7506 }
7507 
7508 std::optional<VectorizationFactor>
7509 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7510   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7511   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7512   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
7513     return std::nullopt;
7514 
7515   // Invalidate interleave groups if all blocks of loop will be predicated.
7516   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7517       !useMaskedInterleavedAccesses(*TTI)) {
7518     LLVM_DEBUG(
7519         dbgs()
7520         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7521            "which requires masked-interleaved support.\n");
7522     if (CM.InterleaveInfo.invalidateGroups())
7523       // Invalidating interleave groups also requires invalidating all decisions
7524       // based on them, which includes widening decisions and uniform and scalar
7525       // values.
7526       CM.invalidateCostModelingDecisions();
7527   }
7528 
7529   ElementCount MaxUserVF =
7530       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7531   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7532   if (!UserVF.isZero() && UserVFIsLegal) {
7533     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7534            "VF needs to be a power of two");
7535     // Collect the instructions (and their associated costs) that will be more
7536     // profitable to scalarize.
7537     if (CM.selectUserVectorizationFactor(UserVF)) {
7538       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7539       CM.collectInLoopReductions();
7540       buildVPlansWithVPRecipes(UserVF, UserVF);
7541       LLVM_DEBUG(printPlans(dbgs()));
7542       return {{UserVF, 0, 0}};
7543     } else
7544       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7545                               "InvalidCost", ORE, OrigLoop);
7546   }
7547 
7548   // Populate the set of Vectorization Factor Candidates.
7549   ElementCountSet VFCandidates;
7550   for (auto VF = ElementCount::getFixed(1);
7551        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7552     VFCandidates.insert(VF);
7553   for (auto VF = ElementCount::getScalable(1);
7554        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7555     VFCandidates.insert(VF);
7556 
7557   for (const auto &VF : VFCandidates) {
7558     // Collect Uniform and Scalar instructions after vectorization with VF.
7559     CM.collectUniformsAndScalars(VF);
7560 
7561     // Collect the instructions (and their associated costs) that will be more
7562     // profitable to scalarize.
7563     if (VF.isVector())
7564       CM.collectInstsToScalarize(VF);
7565   }
7566 
7567   CM.collectInLoopReductions();
7568   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7569   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7570 
7571   LLVM_DEBUG(printPlans(dbgs()));
7572   if (!MaxFactors.hasVector())
7573     return VectorizationFactor::Disabled();
7574 
7575   // Select the optimal vectorization factor.
7576   VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
7577   assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7578   return VF;
7579 }
7580 
7581 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7582   assert(count_if(VPlans,
7583                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7584              1 &&
7585          "Best VF has not a single VPlan.");
7586 
7587   for (const VPlanPtr &Plan : VPlans) {
7588     if (Plan->hasVF(VF))
7589       return *Plan.get();
7590   }
7591   llvm_unreachable("No plan found!");
7592 }
7593 
7594 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7595   SmallVector<Metadata *, 4> MDs;
7596   // Reserve first location for self reference to the LoopID metadata node.
7597   MDs.push_back(nullptr);
7598   bool IsUnrollMetadata = false;
7599   MDNode *LoopID = L->getLoopID();
7600   if (LoopID) {
7601     // First find existing loop unrolling disable metadata.
7602     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7603       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7604       if (MD) {
7605         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7606         IsUnrollMetadata =
7607             S && S->getString().startswith("llvm.loop.unroll.disable");
7608       }
7609       MDs.push_back(LoopID->getOperand(i));
7610     }
7611   }
7612 
7613   if (!IsUnrollMetadata) {
7614     // Add runtime unroll disable metadata.
7615     LLVMContext &Context = L->getHeader()->getContext();
7616     SmallVector<Metadata *, 1> DisableOperands;
7617     DisableOperands.push_back(
7618         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7619     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7620     MDs.push_back(DisableNode);
7621     MDNode *NewLoopID = MDNode::get(Context, MDs);
7622     // Set operand 0 to refer to the loop id itself.
7623     NewLoopID->replaceOperandWith(0, NewLoopID);
7624     L->setLoopID(NewLoopID);
7625   }
7626 }
7627 
7628 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7629                                            VPlan &BestVPlan,
7630                                            InnerLoopVectorizer &ILV,
7631                                            DominatorTree *DT,
7632                                            bool IsEpilogueVectorization) {
7633   assert(BestVPlan.hasVF(BestVF) &&
7634          "Trying to execute plan with unsupported VF");
7635   assert(BestVPlan.hasUF(BestUF) &&
7636          "Trying to execute plan with unsupported UF");
7637 
7638   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7639                     << '\n');
7640 
7641   // Workaround!  Compute the trip count of the original loop and cache it
7642   // before we start modifying the CFG.  This code has a systemic problem
7643   // wherein it tries to run analysis over partially constructed IR; this is
7644   // wrong, and not simply for SCEV.  The trip count of the original loop
7645   // simply happens to be prone to hitting this in practice.  In theory, we
7646   // can hit the same issue for any SCEV, or ValueTracking query done during
7647   // mutation.  See PR49900.
7648   ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
7649 
7650   if (!IsEpilogueVectorization)
7651     VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7652 
7653   // Perform the actual loop transformation.
7654 
7655   // 1. Set up the skeleton for vectorization, including vector pre-header and
7656   // middle block. The vector loop is created during VPlan execution.
7657   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7658   Value *CanonicalIVStartValue;
7659   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7660       ILV.createVectorizedLoopSkeleton();
7661 
7662   // Only use noalias metadata when using memory checks guaranteeing no overlap
7663   // across all iterations.
7664   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7665   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7666       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7667 
7668     //  We currently don't use LoopVersioning for the actual loop cloning but we
7669     //  still use it to add the noalias metadata.
7670     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7671     //        metadata.
7672     State.LVer = std::make_unique<LoopVersioning>(
7673         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7674         PSE.getSE());
7675     State.LVer->prepareNoAliasMetadata();
7676   }
7677 
7678   ILV.collectPoisonGeneratingRecipes(State);
7679 
7680   ILV.printDebugTracesAtStart();
7681 
7682   //===------------------------------------------------===//
7683   //
7684   // Notice: any optimization or new instruction that go
7685   // into the code below should also be implemented in
7686   // the cost-model.
7687   //
7688   //===------------------------------------------------===//
7689 
7690   // 2. Copy and widen instructions from the old loop into the new loop.
7691   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7692                              ILV.getOrCreateVectorTripCount(nullptr),
7693                              CanonicalIVStartValue, State,
7694                              IsEpilogueVectorization);
7695 
7696   BestVPlan.execute(&State);
7697 
7698   // Keep all loop hints from the original loop on the vector loop (we'll
7699   // replace the vectorizer-specific hints below).
7700   MDNode *OrigLoopID = OrigLoop->getLoopID();
7701 
7702   std::optional<MDNode *> VectorizedLoopID =
7703       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7704                                       LLVMLoopVectorizeFollowupVectorized});
7705 
7706   VPBasicBlock *HeaderVPBB =
7707       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7708   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7709   if (VectorizedLoopID)
7710     L->setLoopID(*VectorizedLoopID);
7711   else {
7712     // Keep all loop hints from the original loop on the vector loop (we'll
7713     // replace the vectorizer-specific hints below).
7714     if (MDNode *LID = OrigLoop->getLoopID())
7715       L->setLoopID(LID);
7716 
7717     LoopVectorizeHints Hints(L, true, *ORE);
7718     Hints.setAlreadyVectorized();
7719   }
7720   AddRuntimeUnrollDisableMetaData(L);
7721 
7722   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7723   //    predication, updating analyses.
7724   ILV.fixVectorizedLoop(State, BestVPlan);
7725 
7726   ILV.printDebugTracesAtEnd();
7727 }
7728 
7729 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7730 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7731   for (const auto &Plan : VPlans)
7732     if (PrintVPlansInDotFormat)
7733       Plan->printDOT(O);
7734     else
7735       Plan->print(O);
7736 }
7737 #endif
7738 
7739 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7740 
7741 //===--------------------------------------------------------------------===//
7742 // EpilogueVectorizerMainLoop
7743 //===--------------------------------------------------------------------===//
7744 
7745 /// This function is partially responsible for generating the control flow
7746 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7747 std::pair<BasicBlock *, Value *>
7748 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7749   createVectorLoopSkeleton("");
7750 
7751   // Generate the code to check the minimum iteration count of the vector
7752   // epilogue (see below).
7753   EPI.EpilogueIterationCountCheck =
7754       emitIterationCountCheck(LoopScalarPreHeader, true);
7755   EPI.EpilogueIterationCountCheck->setName("iter.check");
7756 
7757   // Generate the code to check any assumptions that we've made for SCEV
7758   // expressions.
7759   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7760 
7761   // Generate the code that checks at runtime if arrays overlap. We put the
7762   // checks into a separate block to make the more common case of few elements
7763   // faster.
7764   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7765 
7766   // Generate the iteration count check for the main loop, *after* the check
7767   // for the epilogue loop, so that the path-length is shorter for the case
7768   // that goes directly through the vector epilogue. The longer-path length for
7769   // the main loop is compensated for, by the gain from vectorizing the larger
7770   // trip count. Note: the branch will get updated later on when we vectorize
7771   // the epilogue.
7772   EPI.MainLoopIterationCountCheck =
7773       emitIterationCountCheck(LoopScalarPreHeader, false);
7774 
7775   // Generate the induction variable.
7776   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7777 
7778   // Skip induction resume value creation here because they will be created in
7779   // the second pass for the scalar loop. The induction resume values for the
7780   // inductions in the epilogue loop are created before executing the plan for
7781   // the epilogue loop.
7782 
7783   return {completeLoopSkeleton(), nullptr};
7784 }
7785 
7786 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7787   LLVM_DEBUG({
7788     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7789            << "Main Loop VF:" << EPI.MainLoopVF
7790            << ", Main Loop UF:" << EPI.MainLoopUF
7791            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7792            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7793   });
7794 }
7795 
7796 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7797   DEBUG_WITH_TYPE(VerboseDebug, {
7798     dbgs() << "intermediate fn:\n"
7799            << *OrigLoop->getHeader()->getParent() << "\n";
7800   });
7801 }
7802 
7803 BasicBlock *
7804 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7805                                                     bool ForEpilogue) {
7806   assert(Bypass && "Expected valid bypass basic block.");
7807   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7808   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7809   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7810   // Reuse existing vector loop preheader for TC checks.
7811   // Note that new preheader block is generated for vector loop.
7812   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7813   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7814 
7815   // Generate code to check if the loop's trip count is less than VF * UF of the
7816   // main vector loop.
7817   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7818       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7819 
7820   Value *CheckMinIters = Builder.CreateICmp(
7821       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7822       "min.iters.check");
7823 
7824   if (!ForEpilogue)
7825     TCCheckBlock->setName("vector.main.loop.iter.check");
7826 
7827   // Create new preheader for vector loop.
7828   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7829                                    DT, LI, nullptr, "vector.ph");
7830 
7831   if (ForEpilogue) {
7832     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7833                                  DT->getNode(Bypass)->getIDom()) &&
7834            "TC check is expected to dominate Bypass");
7835 
7836     // Update dominator for Bypass & LoopExit.
7837     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7838     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7839       // For loops with multiple exits, there's no edge from the middle block
7840       // to exit blocks (as the epilogue must run) and thus no need to update
7841       // the immediate dominator of the exit blocks.
7842       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7843 
7844     LoopBypassBlocks.push_back(TCCheckBlock);
7845 
7846     // Save the trip count so we don't have to regenerate it in the
7847     // vec.epilog.iter.check. This is safe to do because the trip count
7848     // generated here dominates the vector epilog iter check.
7849     EPI.TripCount = Count;
7850   }
7851 
7852   ReplaceInstWithInst(
7853       TCCheckBlock->getTerminator(),
7854       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7855 
7856   return TCCheckBlock;
7857 }
7858 
7859 //===--------------------------------------------------------------------===//
7860 // EpilogueVectorizerEpilogueLoop
7861 //===--------------------------------------------------------------------===//
7862 
7863 /// This function is partially responsible for generating the control flow
7864 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7865 std::pair<BasicBlock *, Value *>
7866 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7867   createVectorLoopSkeleton("vec.epilog.");
7868 
7869   // Now, compare the remaining count and if there aren't enough iterations to
7870   // execute the vectorized epilogue skip to the scalar part.
7871   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7872   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7873   LoopVectorPreHeader =
7874       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7875                  LI, nullptr, "vec.epilog.ph");
7876   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7877                                           VecEpilogueIterationCountCheck);
7878 
7879   // Adjust the control flow taking the state info from the main loop
7880   // vectorization into account.
7881   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7882          "expected this to be saved from the previous pass.");
7883   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7884       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7885 
7886   DT->changeImmediateDominator(LoopVectorPreHeader,
7887                                EPI.MainLoopIterationCountCheck);
7888 
7889   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7890       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7891 
7892   if (EPI.SCEVSafetyCheck)
7893     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7894         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7895   if (EPI.MemSafetyCheck)
7896     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7897         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7898 
7899   DT->changeImmediateDominator(
7900       VecEpilogueIterationCountCheck,
7901       VecEpilogueIterationCountCheck->getSinglePredecessor());
7902 
7903   DT->changeImmediateDominator(LoopScalarPreHeader,
7904                                EPI.EpilogueIterationCountCheck);
7905   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7906     // If there is an epilogue which must run, there's no edge from the
7907     // middle block to exit blocks  and thus no need to update the immediate
7908     // dominator of the exit blocks.
7909     DT->changeImmediateDominator(LoopExitBlock,
7910                                  EPI.EpilogueIterationCountCheck);
7911 
7912   // Keep track of bypass blocks, as they feed start values to the induction and
7913   // reduction phis in the scalar loop preheader.
7914   if (EPI.SCEVSafetyCheck)
7915     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7916   if (EPI.MemSafetyCheck)
7917     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7918   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7919 
7920   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7921   // reductions which merge control-flow from the latch block and the middle
7922   // block. Update the incoming values here and move the Phi into the preheader.
7923   SmallVector<PHINode *, 4> PhisInBlock;
7924   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7925     PhisInBlock.push_back(&Phi);
7926 
7927   for (PHINode *Phi : PhisInBlock) {
7928     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7929     Phi->replaceIncomingBlockWith(
7930         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7931         VecEpilogueIterationCountCheck);
7932 
7933     // If the phi doesn't have an incoming value from the
7934     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7935     // value and also those from other check blocks. This is needed for
7936     // reduction phis only.
7937     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7938           return EPI.EpilogueIterationCountCheck == IncB;
7939         }))
7940       continue;
7941     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7942     if (EPI.SCEVSafetyCheck)
7943       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7944     if (EPI.MemSafetyCheck)
7945       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7946   }
7947 
7948   // Generate a resume induction for the vector epilogue and put it in the
7949   // vector epilogue preheader
7950   Type *IdxTy = Legal->getWidestInductionType();
7951   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7952                                          LoopVectorPreHeader->getFirstNonPHI());
7953   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7954   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7955                            EPI.MainLoopIterationCountCheck);
7956 
7957   // Generate induction resume values. These variables save the new starting
7958   // indexes for the scalar loop. They are used to test if there are any tail
7959   // iterations left once the vector loop has completed.
7960   // Note that when the vectorized epilogue is skipped due to iteration count
7961   // check, then the resume value for the induction variable comes from
7962   // the trip count of the main vector loop, hence passing the AdditionalBypass
7963   // argument.
7964   createInductionResumeValues({VecEpilogueIterationCountCheck,
7965                                EPI.VectorTripCount} /* AdditionalBypass */);
7966 
7967   return {completeLoopSkeleton(), EPResumeVal};
7968 }
7969 
7970 BasicBlock *
7971 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7972     BasicBlock *Bypass, BasicBlock *Insert) {
7973 
7974   assert(EPI.TripCount &&
7975          "Expected trip count to have been safed in the first pass.");
7976   assert(
7977       (!isa<Instruction>(EPI.TripCount) ||
7978        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7979       "saved trip count does not dominate insertion point.");
7980   Value *TC = EPI.TripCount;
7981   IRBuilder<> Builder(Insert->getTerminator());
7982   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7983 
7984   // Generate code to check if the loop's trip count is less than VF * UF of the
7985   // vector epilogue loop.
7986   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7987       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7988 
7989   Value *CheckMinIters =
7990       Builder.CreateICmp(P, Count,
7991                          createStepForVF(Builder, Count->getType(),
7992                                          EPI.EpilogueVF, EPI.EpilogueUF),
7993                          "min.epilog.iters.check");
7994 
7995   ReplaceInstWithInst(
7996       Insert->getTerminator(),
7997       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7998 
7999   LoopBypassBlocks.push_back(Insert);
8000   return Insert;
8001 }
8002 
8003 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8004   LLVM_DEBUG({
8005     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8006            << "Epilogue Loop VF:" << EPI.EpilogueVF
8007            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8008   });
8009 }
8010 
8011 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8012   DEBUG_WITH_TYPE(VerboseDebug, {
8013     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8014   });
8015 }
8016 
8017 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8018     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8019   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8020   bool PredicateAtRangeStart = Predicate(Range.Start);
8021 
8022   for (ElementCount TmpVF = Range.Start * 2;
8023        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8024     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8025       Range.End = TmpVF;
8026       break;
8027     }
8028 
8029   return PredicateAtRangeStart;
8030 }
8031 
8032 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8033 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8034 /// of VF's starting at a given VF and extending it as much as possible. Each
8035 /// vectorization decision can potentially shorten this sub-range during
8036 /// buildVPlan().
8037 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8038                                            ElementCount MaxVF) {
8039   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8040   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8041     VFRange SubRange = {VF, MaxVFPlusOne};
8042     VPlans.push_back(buildVPlan(SubRange));
8043     VF = SubRange.End;
8044   }
8045 }
8046 
8047 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8048                                          VPlanPtr &Plan) {
8049   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8050 
8051   // Look for cached value.
8052   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8053   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8054   if (ECEntryIt != EdgeMaskCache.end())
8055     return ECEntryIt->second;
8056 
8057   VPValue *SrcMask = createBlockInMask(Src, Plan);
8058 
8059   // The terminator has to be a branch inst!
8060   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8061   assert(BI && "Unexpected terminator found");
8062 
8063   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8064     return EdgeMaskCache[Edge] = SrcMask;
8065 
8066   // If source is an exiting block, we know the exit edge is dynamically dead
8067   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8068   // adding uses of an otherwise potentially dead instruction.
8069   if (OrigLoop->isLoopExiting(Src))
8070     return EdgeMaskCache[Edge] = SrcMask;
8071 
8072   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8073   assert(EdgeMask && "No Edge Mask found for condition");
8074 
8075   if (BI->getSuccessor(0) != Dst)
8076     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8077 
8078   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8079     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8080     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8081     // The select version does not introduce new UB if SrcMask is false and
8082     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8083     VPValue *False = Plan->getOrAddVPValue(
8084         ConstantInt::getFalse(BI->getCondition()->getType()));
8085     EdgeMask =
8086         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8087   }
8088 
8089   return EdgeMaskCache[Edge] = EdgeMask;
8090 }
8091 
8092 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8093   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8094 
8095   // Look for cached value.
8096   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8097   if (BCEntryIt != BlockMaskCache.end())
8098     return BCEntryIt->second;
8099 
8100   // All-one mask is modelled as no-mask following the convention for masked
8101   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8102   VPValue *BlockMask = nullptr;
8103 
8104   if (OrigLoop->getHeader() == BB) {
8105     if (!CM.blockNeedsPredicationForAnyReason(BB))
8106       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8107 
8108     assert(CM.foldTailByMasking() && "must fold the tail");
8109 
8110     // If we're using the active lane mask for control flow, then we get the
8111     // mask from the active lane mask PHI that is cached in the VPlan.
8112     PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
8113     if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
8114       return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
8115 
8116     // Introduce the early-exit compare IV <= BTC to form header block mask.
8117     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8118     // constructing the desired canonical IV in the header block as its first
8119     // non-phi instructions.
8120 
8121     VPBasicBlock *HeaderVPBB =
8122         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8123     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8124     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8125     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8126 
8127     VPBuilder::InsertPointGuard Guard(Builder);
8128     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8129     if (EmitGetActiveLaneMask != PredicationStyle::None) {
8130       VPValue *TC = Plan->getOrCreateTripCount();
8131       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8132                                        nullptr, "active.lane.mask");
8133     } else {
8134       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8135       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8136     }
8137     return BlockMaskCache[BB] = BlockMask;
8138   }
8139 
8140   // This is the block mask. We OR all incoming edges.
8141   for (auto *Predecessor : predecessors(BB)) {
8142     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8143     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8144       return BlockMaskCache[BB] = EdgeMask;
8145 
8146     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8147       BlockMask = EdgeMask;
8148       continue;
8149     }
8150 
8151     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8152   }
8153 
8154   return BlockMaskCache[BB] = BlockMask;
8155 }
8156 
8157 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8158                                                 ArrayRef<VPValue *> Operands,
8159                                                 VFRange &Range,
8160                                                 VPlanPtr &Plan) {
8161   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8162          "Must be called with either a load or store");
8163 
8164   auto willWiden = [&](ElementCount VF) -> bool {
8165     LoopVectorizationCostModel::InstWidening Decision =
8166         CM.getWideningDecision(I, VF);
8167     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8168            "CM decision should be taken at this point.");
8169     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8170       return true;
8171     if (CM.isScalarAfterVectorization(I, VF) ||
8172         CM.isProfitableToScalarize(I, VF))
8173       return false;
8174     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8175   };
8176 
8177   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8178     return nullptr;
8179 
8180   VPValue *Mask = nullptr;
8181   if (Legal->isMaskRequired(I))
8182     Mask = createBlockInMask(I->getParent(), Plan);
8183 
8184   // Determine if the pointer operand of the access is either consecutive or
8185   // reverse consecutive.
8186   LoopVectorizationCostModel::InstWidening Decision =
8187       CM.getWideningDecision(I, Range.Start);
8188   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8189   bool Consecutive =
8190       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8191 
8192   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8193     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8194                                               Consecutive, Reverse);
8195 
8196   StoreInst *Store = cast<StoreInst>(I);
8197   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8198                                             Mask, Consecutive, Reverse);
8199 }
8200 
8201 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
8202 /// insert a recipe to expand the step for the induction recipe.
8203 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8204     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8205     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8206     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8207   // Returns true if an instruction \p I should be scalarized instead of
8208   // vectorized for the chosen vectorization factor.
8209   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8210     return CM.isScalarAfterVectorization(I, VF) ||
8211            CM.isProfitableToScalarize(I, VF);
8212   };
8213 
8214   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8215       [&](ElementCount VF) {
8216         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8217       },
8218       Range);
8219   assert(IndDesc.getStartValue() ==
8220          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8221   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8222          "step must be loop invariant");
8223 
8224   VPValue *Step =
8225       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8226   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8227     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8228                                              !NeedsScalarIVOnly);
8229   }
8230   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8231   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8232                                            !NeedsScalarIVOnly);
8233 }
8234 
8235 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8236     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8237 
8238   // Check if this is an integer or fp induction. If so, build the recipe that
8239   // produces its scalar and vector values.
8240   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8241     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8242                                        *PSE.getSE(), *OrigLoop, Range);
8243 
8244   // Check if this is pointer induction. If so, build the recipe for it.
8245   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8246     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8247                                                            *PSE.getSE());
8248     assert(isa<SCEVConstant>(II->getStep()));
8249     return new VPWidenPointerInductionRecipe(
8250         Phi, Operands[0], Step, *II,
8251         LoopVectorizationPlanner::getDecisionAndClampRange(
8252             [&](ElementCount VF) {
8253               return CM.isScalarAfterVectorization(Phi, VF);
8254             },
8255             Range));
8256   }
8257   return nullptr;
8258 }
8259 
8260 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8261     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8262   // Optimize the special case where the source is a constant integer
8263   // induction variable. Notice that we can only optimize the 'trunc' case
8264   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8265   // (c) other casts depend on pointer size.
8266 
8267   // Determine whether \p K is a truncation based on an induction variable that
8268   // can be optimized.
8269   auto isOptimizableIVTruncate =
8270       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8271     return [=](ElementCount VF) -> bool {
8272       return CM.isOptimizableIVTruncate(K, VF);
8273     };
8274   };
8275 
8276   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8277           isOptimizableIVTruncate(I), Range)) {
8278 
8279     auto *Phi = cast<PHINode>(I->getOperand(0));
8280     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8281     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8282     return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8283                                        *PSE.getSE(), *OrigLoop, Range);
8284   }
8285   return nullptr;
8286 }
8287 
8288 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8289                                                 ArrayRef<VPValue *> Operands,
8290                                                 VPlanPtr &Plan) {
8291   // If all incoming values are equal, the incoming VPValue can be used directly
8292   // instead of creating a new VPBlendRecipe.
8293   if (llvm::all_equal(Operands))
8294     return Operands[0];
8295 
8296   unsigned NumIncoming = Phi->getNumIncomingValues();
8297   // For in-loop reductions, we do not need to create an additional select.
8298   VPValue *InLoopVal = nullptr;
8299   for (unsigned In = 0; In < NumIncoming; In++) {
8300     PHINode *PhiOp =
8301         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8302     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8303       assert(!InLoopVal && "Found more than one in-loop reduction!");
8304       InLoopVal = Operands[In];
8305     }
8306   }
8307 
8308   assert((!InLoopVal || NumIncoming == 2) &&
8309          "Found an in-loop reduction for PHI with unexpected number of "
8310          "incoming values");
8311   if (InLoopVal)
8312     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8313 
8314   // We know that all PHIs in non-header blocks are converted into selects, so
8315   // we don't have to worry about the insertion order and we can just use the
8316   // builder. At this point we generate the predication tree. There may be
8317   // duplications since this is a simple recursive scan, but future
8318   // optimizations will clean it up.
8319   SmallVector<VPValue *, 2> OperandsWithMask;
8320 
8321   for (unsigned In = 0; In < NumIncoming; In++) {
8322     VPValue *EdgeMask =
8323       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8324     assert((EdgeMask || NumIncoming == 1) &&
8325            "Multiple predecessors with one having a full mask");
8326     OperandsWithMask.push_back(Operands[In]);
8327     if (EdgeMask)
8328       OperandsWithMask.push_back(EdgeMask);
8329   }
8330   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8331 }
8332 
8333 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8334                                                    ArrayRef<VPValue *> Operands,
8335                                                    VFRange &Range) const {
8336 
8337   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8338       [this, CI](ElementCount VF) {
8339         return CM.isScalarWithPredication(CI, VF);
8340       },
8341       Range);
8342 
8343   if (IsPredicated)
8344     return nullptr;
8345 
8346   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8347   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8348              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8349              ID == Intrinsic::pseudoprobe ||
8350              ID == Intrinsic::experimental_noalias_scope_decl))
8351     return nullptr;
8352 
8353   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8354 
8355   // Is it beneficial to perform intrinsic call compared to lib call?
8356   bool ShouldUseVectorIntrinsic =
8357       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8358                 [&](ElementCount VF) -> bool {
8359                   bool NeedToScalarize = false;
8360                   // Is it beneficial to perform intrinsic call compared to lib
8361                   // call?
8362                   InstructionCost CallCost =
8363                       CM.getVectorCallCost(CI, VF, NeedToScalarize);
8364                   InstructionCost IntrinsicCost =
8365                       CM.getVectorIntrinsicCost(CI, VF);
8366                   return IntrinsicCost <= CallCost;
8367                 },
8368                 Range);
8369   if (ShouldUseVectorIntrinsic)
8370     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
8371 
8372   // Is better to call a vectorized version of the function than to to scalarize
8373   // the call?
8374   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8375       [&](ElementCount VF) -> bool {
8376         // The following case may be scalarized depending on the VF.
8377         // The flag shows whether we can use a usual Call for vectorized
8378         // version of the instruction.
8379         bool NeedToScalarize = false;
8380         CM.getVectorCallCost(CI, VF, NeedToScalarize);
8381         return !NeedToScalarize;
8382       },
8383       Range);
8384   if (ShouldUseVectorCall)
8385     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8386                                  Intrinsic::not_intrinsic);
8387 
8388   return nullptr;
8389 }
8390 
8391 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8392   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8393          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8394   // Instruction should be widened, unless it is scalar after vectorization,
8395   // scalarization is profitable or it is predicated.
8396   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8397     return CM.isScalarAfterVectorization(I, VF) ||
8398            CM.isProfitableToScalarize(I, VF) ||
8399            CM.isScalarWithPredication(I, VF);
8400   };
8401   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8402                                                              Range);
8403 }
8404 
8405 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8406                                           ArrayRef<VPValue *> Operands,
8407                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
8408   switch (I->getOpcode()) {
8409   default:
8410     return nullptr;
8411   case Instruction::SDiv:
8412   case Instruction::UDiv:
8413   case Instruction::SRem:
8414   case Instruction::URem: {
8415     // If not provably safe, use a select to form a safe divisor before widening the
8416     // div/rem operation itself.  Otherwise fall through to general handling below.
8417     if (CM.isPredicatedInst(I)) {
8418       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8419       VPValue *Mask = createBlockInMask(I->getParent(), Plan);
8420       VPValue *One =
8421         Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false));
8422       auto *SafeRHS =
8423          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8424                            I->getDebugLoc());
8425       VPBB->appendRecipe(SafeRHS);
8426       Ops[1] = SafeRHS;
8427       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8428     }
8429     LLVM_FALLTHROUGH;
8430   }
8431   case Instruction::Add:
8432   case Instruction::And:
8433   case Instruction::AShr:
8434   case Instruction::BitCast:
8435   case Instruction::FAdd:
8436   case Instruction::FCmp:
8437   case Instruction::FDiv:
8438   case Instruction::FMul:
8439   case Instruction::FNeg:
8440   case Instruction::FPExt:
8441   case Instruction::FPToSI:
8442   case Instruction::FPToUI:
8443   case Instruction::FPTrunc:
8444   case Instruction::FRem:
8445   case Instruction::FSub:
8446   case Instruction::ICmp:
8447   case Instruction::IntToPtr:
8448   case Instruction::LShr:
8449   case Instruction::Mul:
8450   case Instruction::Or:
8451   case Instruction::PtrToInt:
8452   case Instruction::Select:
8453   case Instruction::SExt:
8454   case Instruction::Shl:
8455   case Instruction::SIToFP:
8456   case Instruction::Sub:
8457   case Instruction::Trunc:
8458   case Instruction::UIToFP:
8459   case Instruction::Xor:
8460   case Instruction::ZExt:
8461   case Instruction::Freeze:
8462     return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8463   };
8464 }
8465 
8466 void VPRecipeBuilder::fixHeaderPhis() {
8467   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8468   for (VPHeaderPHIRecipe *R : PhisToFix) {
8469     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8470     VPRecipeBase *IncR =
8471         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8472     R->addOperand(IncR->getVPSingleValue());
8473   }
8474 }
8475 
8476 VPBasicBlock *VPRecipeBuilder::handleReplication(
8477     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8478     VPlanPtr &Plan) {
8479   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8480       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8481       Range);
8482 
8483   bool IsPredicated = CM.isPredicatedInst(I);
8484 
8485   // Even if the instruction is not marked as uniform, there are certain
8486   // intrinsic calls that can be effectively treated as such, so we check for
8487   // them here. Conservatively, we only do this for scalable vectors, since
8488   // for fixed-width VFs we can always fall back on full scalarization.
8489   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8490     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8491     case Intrinsic::assume:
8492     case Intrinsic::lifetime_start:
8493     case Intrinsic::lifetime_end:
8494       // For scalable vectors if one of the operands is variant then we still
8495       // want to mark as uniform, which will generate one instruction for just
8496       // the first lane of the vector. We can't scalarize the call in the same
8497       // way as for fixed-width vectors because we don't know how many lanes
8498       // there are.
8499       //
8500       // The reasons for doing it this way for scalable vectors are:
8501       //   1. For the assume intrinsic generating the instruction for the first
8502       //      lane is still be better than not generating any at all. For
8503       //      example, the input may be a splat across all lanes.
8504       //   2. For the lifetime start/end intrinsics the pointer operand only
8505       //      does anything useful when the input comes from a stack object,
8506       //      which suggests it should always be uniform. For non-stack objects
8507       //      the effect is to poison the object, which still allows us to
8508       //      remove the call.
8509       IsUniform = true;
8510       break;
8511     default:
8512       break;
8513     }
8514   }
8515 
8516   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8517                                        IsUniform, IsPredicated);
8518 
8519   // Find if I uses a predicated instruction. If so, it will use its scalar
8520   // value. Avoid hoisting the insert-element which packs the scalar value into
8521   // a vector value, as that happens iff all users use the vector value.
8522   for (VPValue *Op : Recipe->operands()) {
8523     auto *PredR =
8524         dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
8525     if (!PredR)
8526       continue;
8527     auto *RepR = cast<VPReplicateRecipe>(
8528         PredR->getOperand(0)->getDefiningRecipe());
8529     assert(RepR->isPredicated() &&
8530            "expected Replicate recipe to be predicated");
8531     RepR->setAlsoPack(false);
8532   }
8533 
8534   // Finalize the recipe for Instr, first if it is not predicated.
8535   if (!IsPredicated) {
8536     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8537     setRecipe(I, Recipe);
8538     Plan->addVPValue(I, Recipe);
8539     VPBB->appendRecipe(Recipe);
8540     return VPBB;
8541   }
8542   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8543 
8544   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8545   assert(SingleSucc && "VPBB must have a single successor when handling "
8546                        "predicated replication.");
8547   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8548   // Record predicated instructions for above packing optimizations.
8549   VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
8550   VPBlockUtils::insertBlockAfter(Region, VPBB);
8551   auto *RegSucc = new VPBasicBlock();
8552   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8553   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8554   return RegSucc;
8555 }
8556 
8557 VPRegionBlock *
8558 VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
8559                                        VPlanPtr &Plan) {
8560   Instruction *Instr = PredRecipe->getUnderlyingInstr();
8561   // Instructions marked for predication are replicated and placed under an
8562   // if-then construct to prevent side-effects.
8563   // Generate recipes to compute the block mask for this region.
8564   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8565 
8566   // Build the triangular if-then region.
8567   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8568   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8569   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8570   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8571   auto *PHIRecipe = Instr->getType()->isVoidTy()
8572                         ? nullptr
8573                         : new VPPredInstPHIRecipe(PredRecipe);
8574   if (PHIRecipe) {
8575     setRecipe(Instr, PHIRecipe);
8576     Plan->addVPValue(Instr, PHIRecipe);
8577   } else {
8578     setRecipe(Instr, PredRecipe);
8579     Plan->addVPValue(Instr, PredRecipe);
8580   }
8581 
8582   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8583   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8584   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8585 
8586   // Note: first set Entry as region entry and then connect successors starting
8587   // from it in order, to propagate the "parent" of each VPBasicBlock.
8588   VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8589   VPBlockUtils::connectBlocks(Pred, Exiting);
8590 
8591   return Region;
8592 }
8593 
8594 VPRecipeOrVPValueTy
8595 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8596                                         ArrayRef<VPValue *> Operands,
8597                                         VFRange &Range, VPBasicBlock *VPBB,
8598                                         VPlanPtr &Plan) {
8599   // First, check for specific widening recipes that deal with inductions, Phi
8600   // nodes, calls and memory operations.
8601   VPRecipeBase *Recipe;
8602   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8603     if (Phi->getParent() != OrigLoop->getHeader())
8604       return tryToBlend(Phi, Operands, Plan);
8605 
8606     // Always record recipes for header phis. Later first-order recurrence phis
8607     // can have earlier phis as incoming values.
8608     recordRecipeOf(Phi);
8609 
8610     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8611       return toVPRecipeResult(Recipe);
8612 
8613     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8614     assert((Legal->isReductionVariable(Phi) ||
8615             Legal->isFixedOrderRecurrence(Phi)) &&
8616            "can only widen reductions and fixed-order recurrences here");
8617     VPValue *StartV = Operands[0];
8618     if (Legal->isReductionVariable(Phi)) {
8619       const RecurrenceDescriptor &RdxDesc =
8620           Legal->getReductionVars().find(Phi)->second;
8621       assert(RdxDesc.getRecurrenceStartValue() ==
8622              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8623       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8624                                            CM.isInLoopReduction(Phi),
8625                                            CM.useOrderedReductions(RdxDesc));
8626     } else {
8627       // TODO: Currently fixed-order recurrences are modeled as chains of
8628       // first-order recurrences. If there are no users of the intermediate
8629       // recurrences in the chain, the fixed order recurrence should be modeled
8630       // directly, enabling more efficient codegen.
8631       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8632     }
8633 
8634     // Record the incoming value from the backedge, so we can add the incoming
8635     // value from the backedge after all recipes have been created.
8636     auto *Inc = cast<Instruction>(
8637         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8638     auto RecipeIter = Ingredient2Recipe.find(Inc);
8639     if (RecipeIter == Ingredient2Recipe.end())
8640       recordRecipeOf(Inc);
8641 
8642     PhisToFix.push_back(PhiRecipe);
8643     return toVPRecipeResult(PhiRecipe);
8644   }
8645 
8646   if (isa<TruncInst>(Instr) &&
8647       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8648                                                Range, *Plan)))
8649     return toVPRecipeResult(Recipe);
8650 
8651   // All widen recipes below deal only with VF > 1.
8652   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8653           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8654     return nullptr;
8655 
8656   if (auto *CI = dyn_cast<CallInst>(Instr))
8657     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8658 
8659   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8660     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8661 
8662   if (!shouldWiden(Instr, Range))
8663     return nullptr;
8664 
8665   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8666     return toVPRecipeResult(new VPWidenGEPRecipe(
8667         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8668 
8669   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8670     bool InvariantCond =
8671         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8672     return toVPRecipeResult(new VPWidenSelectRecipe(
8673         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8674   }
8675 
8676   return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8677 }
8678 
8679 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8680                                                         ElementCount MaxVF) {
8681   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8682 
8683   // Add assume instructions we need to drop to DeadInstructions, to prevent
8684   // them from being added to the VPlan.
8685   // TODO: We only need to drop assumes in blocks that get flattend. If the
8686   // control flow is preserved, we should keep them.
8687   SmallPtrSet<Instruction *, 4> DeadInstructions;
8688   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8689   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8690 
8691   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8692   // Dead instructions do not need sinking. Remove them from SinkAfter.
8693   for (Instruction *I : DeadInstructions)
8694     SinkAfter.erase(I);
8695 
8696   // Cannot sink instructions after dead instructions (there won't be any
8697   // recipes for them). Instead, find the first non-dead previous instruction.
8698   for (auto &P : Legal->getSinkAfter()) {
8699     Instruction *SinkTarget = P.second;
8700     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8701     (void)FirstInst;
8702     while (DeadInstructions.contains(SinkTarget)) {
8703       assert(
8704           SinkTarget != FirstInst &&
8705           "Must find a live instruction (at least the one feeding the "
8706           "fixed-order recurrence PHI) before reaching beginning of the block");
8707       SinkTarget = SinkTarget->getPrevNode();
8708       assert(SinkTarget != P.first &&
8709              "sink source equals target, no sinking required");
8710     }
8711     P.second = SinkTarget;
8712   }
8713 
8714   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8715   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8716     VFRange SubRange = {VF, MaxVFPlusOne};
8717     VPlans.push_back(
8718         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8719     VF = SubRange.End;
8720   }
8721 }
8722 
8723 // Add the necessary canonical IV and branch recipes required to control the
8724 // loop.
8725 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8726                                   bool HasNUW,
8727                                   bool UseLaneMaskForLoopControlFlow) {
8728   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8729   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8730 
8731   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8732   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8733   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8734   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8735   Header->insert(CanonicalIVPHI, Header->begin());
8736 
8737   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8738   // IV by VF * UF.
8739   auto *CanonicalIVIncrement =
8740       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8741                                : VPInstruction::CanonicalIVIncrement,
8742                         {CanonicalIVPHI}, DL, "index.next");
8743   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8744 
8745   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8746   EB->appendRecipe(CanonicalIVIncrement);
8747 
8748   if (UseLaneMaskForLoopControlFlow) {
8749     // Create the active lane mask instruction in the vplan preheader.
8750     VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
8751 
8752     // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
8753     // we have to take unrolling into account. Each part needs to start at
8754     //   Part * VF
8755     auto *CanonicalIVIncrementParts =
8756         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8757                                  : VPInstruction::CanonicalIVIncrementForPart,
8758                           {StartV}, DL, "index.part.next");
8759     Preheader->appendRecipe(CanonicalIVIncrementParts);
8760 
8761     // Create the ActiveLaneMask instruction using the correct start values.
8762     VPValue *TC = Plan.getOrCreateTripCount();
8763     auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8764                                        {CanonicalIVIncrementParts, TC}, DL,
8765                                        "active.lane.mask.entry");
8766     Preheader->appendRecipe(EntryALM);
8767 
8768     // Now create the ActiveLaneMaskPhi recipe in the main loop using the
8769     // preheader ActiveLaneMask instruction.
8770     auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
8771     Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
8772 
8773     // Create the active lane mask for the next iteration of the loop.
8774     CanonicalIVIncrementParts =
8775         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8776                                  : VPInstruction::CanonicalIVIncrementForPart,
8777                           {CanonicalIVIncrement}, DL);
8778     EB->appendRecipe(CanonicalIVIncrementParts);
8779 
8780     auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8781                                   {CanonicalIVIncrementParts, TC}, DL,
8782                                   "active.lane.mask.next");
8783     EB->appendRecipe(ALM);
8784     LaneMaskPhi->addOperand(ALM);
8785 
8786     // We have to invert the mask here because a true condition means jumping
8787     // to the exit block.
8788     auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
8789     EB->appendRecipe(NotMask);
8790 
8791     VPInstruction *BranchBack =
8792         new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
8793     EB->appendRecipe(BranchBack);
8794   } else {
8795     // Add the BranchOnCount VPInstruction to the latch.
8796     VPInstruction *BranchBack = new VPInstruction(
8797         VPInstruction::BranchOnCount,
8798         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8799     EB->appendRecipe(BranchBack);
8800   }
8801 }
8802 
8803 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8804 // original exit block.
8805 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8806                                 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8807                                 VPlan &Plan) {
8808   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8809   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8810   // Only handle single-exit loops with unique exit blocks for now.
8811   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8812     return;
8813 
8814   // Introduce VPUsers modeling the exit values.
8815   for (PHINode &ExitPhi : ExitBB->phis()) {
8816     Value *IncomingValue =
8817         ExitPhi.getIncomingValueForBlock(ExitingBB);
8818     VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8819     Plan.addLiveOut(&ExitPhi, V);
8820   }
8821 }
8822 
8823 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8824     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8825     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8826 
8827   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8828 
8829   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8830 
8831   // ---------------------------------------------------------------------------
8832   // Pre-construction: record ingredients whose recipes we'll need to further
8833   // process after constructing the initial VPlan.
8834   // ---------------------------------------------------------------------------
8835 
8836   // Mark instructions we'll need to sink later and their targets as
8837   // ingredients whose recipe we'll need to record.
8838   for (const auto &Entry : SinkAfter) {
8839     RecipeBuilder.recordRecipeOf(Entry.first);
8840     RecipeBuilder.recordRecipeOf(Entry.second);
8841   }
8842   for (const auto &Reduction : CM.getInLoopReductionChains()) {
8843     PHINode *Phi = Reduction.first;
8844     RecurKind Kind =
8845         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8846     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8847 
8848     RecipeBuilder.recordRecipeOf(Phi);
8849     for (const auto &R : ReductionOperations) {
8850       RecipeBuilder.recordRecipeOf(R);
8851       // For min/max reductions, where we have a pair of icmp/select, we also
8852       // need to record the ICmp recipe, so it can be removed later.
8853       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8854              "Only min/max recurrences allowed for inloop reductions");
8855       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8856         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8857     }
8858   }
8859 
8860   // For each interleave group which is relevant for this (possibly trimmed)
8861   // Range, add it to the set of groups to be later applied to the VPlan and add
8862   // placeholders for its members' Recipes which we'll be replacing with a
8863   // single VPInterleaveRecipe.
8864   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8865     auto applyIG = [IG, this](ElementCount VF) -> bool {
8866       return (VF.isVector() && // Query is illegal for VF == 1
8867               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8868                   LoopVectorizationCostModel::CM_Interleave);
8869     };
8870     if (!getDecisionAndClampRange(applyIG, Range))
8871       continue;
8872     InterleaveGroups.insert(IG);
8873     for (unsigned i = 0; i < IG->getFactor(); i++)
8874       if (Instruction *Member = IG->getMember(i))
8875         RecipeBuilder.recordRecipeOf(Member);
8876   };
8877 
8878   // ---------------------------------------------------------------------------
8879   // Build initial VPlan: Scan the body of the loop in a topological order to
8880   // visit each basic block after having visited its predecessor basic blocks.
8881   // ---------------------------------------------------------------------------
8882 
8883   // Create initial VPlan skeleton, starting with a block for the pre-header,
8884   // followed by a region for the vector loop, followed by the middle block. The
8885   // skeleton vector loop region contains a header and latch block.
8886   VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8887   auto Plan = std::make_unique<VPlan>(Preheader);
8888 
8889   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8890   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8891   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8892   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8893   VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8894   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8895   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8896 
8897   Instruction *DLInst =
8898       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8899   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8900                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8901                         !CM.foldTailByMasking(),
8902                         CM.useActiveLaneMaskForControlFlow());
8903 
8904   // Scan the body of the loop in a topological order to visit each basic block
8905   // after having visited its predecessor basic blocks.
8906   LoopBlocksDFS DFS(OrigLoop);
8907   DFS.perform(LI);
8908 
8909   VPBasicBlock *VPBB = HeaderVPBB;
8910   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8911   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8912     // Relevant instructions from basic block BB will be grouped into VPRecipe
8913     // ingredients and fill a new VPBasicBlock.
8914     unsigned VPBBsForBB = 0;
8915     if (VPBB != HeaderVPBB)
8916       VPBB->setName(BB->getName());
8917     Builder.setInsertPoint(VPBB);
8918 
8919     // Introduce each ingredient into VPlan.
8920     // TODO: Model and preserve debug intrinsics in VPlan.
8921     for (Instruction &I : BB->instructionsWithoutDebug()) {
8922       Instruction *Instr = &I;
8923 
8924       // First filter out irrelevant instructions, to ensure no recipes are
8925       // built for them.
8926       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8927         continue;
8928 
8929       SmallVector<VPValue *, 4> Operands;
8930       auto *Phi = dyn_cast<PHINode>(Instr);
8931       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8932         Operands.push_back(Plan->getOrAddVPValue(
8933             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8934       } else {
8935         auto OpRange = Plan->mapToVPValues(Instr->operands());
8936         Operands = {OpRange.begin(), OpRange.end()};
8937       }
8938 
8939       // Invariant stores inside loop will be deleted and a single store
8940       // with the final reduction value will be added to the exit block
8941       StoreInst *SI;
8942       if ((SI = dyn_cast<StoreInst>(&I)) &&
8943           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8944         continue;
8945 
8946       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8947               Instr, Operands, Range, VPBB, Plan)) {
8948         // If Instr can be simplified to an existing VPValue, use it.
8949         if (RecipeOrValue.is<VPValue *>()) {
8950           auto *VPV = RecipeOrValue.get<VPValue *>();
8951           Plan->addVPValue(Instr, VPV);
8952           // If the re-used value is a recipe, register the recipe for the
8953           // instruction, in case the recipe for Instr needs to be recorded.
8954           if (VPRecipeBase *R = VPV->getDefiningRecipe())
8955             RecipeBuilder.setRecipe(Instr, R);
8956           continue;
8957         }
8958         // Otherwise, add the new recipe.
8959         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8960         for (auto *Def : Recipe->definedValues()) {
8961           auto *UV = Def->getUnderlyingValue();
8962           Plan->addVPValue(UV, Def);
8963         }
8964 
8965         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8966             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8967           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8968           // of the header block. That can happen for truncates of induction
8969           // variables. Those recipes are moved to the phi section of the header
8970           // block after applying SinkAfter, which relies on the original
8971           // position of the trunc.
8972           assert(isa<TruncInst>(Instr));
8973           InductionsToMove.push_back(
8974               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8975         }
8976         RecipeBuilder.setRecipe(Instr, Recipe);
8977         VPBB->appendRecipe(Recipe);
8978         continue;
8979       }
8980 
8981       // Otherwise, if all widening options failed, Instruction is to be
8982       // replicated. This may create a successor for VPBB.
8983       VPBasicBlock *NextVPBB =
8984           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8985       if (NextVPBB != VPBB) {
8986         VPBB = NextVPBB;
8987         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8988                                     : "");
8989       }
8990     }
8991 
8992     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8993     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8994   }
8995 
8996   // After here, VPBB should not be used.
8997   VPBB = nullptr;
8998 
8999   addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
9000 
9001   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9002          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9003          "entry block must be set to a VPRegionBlock having a non-empty entry "
9004          "VPBasicBlock");
9005   RecipeBuilder.fixHeaderPhis();
9006 
9007   // ---------------------------------------------------------------------------
9008   // Transform initial VPlan: Apply previously taken decisions, in order, to
9009   // bring the VPlan to its final state.
9010   // ---------------------------------------------------------------------------
9011 
9012   // Apply Sink-After legal constraints.
9013   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9014     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9015     if (Region && Region->isReplicator()) {
9016       assert(Region->getNumSuccessors() == 1 &&
9017              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9018       assert(R->getParent()->size() == 1 &&
9019              "A recipe in an original replicator region must be the only "
9020              "recipe in its block");
9021       return Region;
9022     }
9023     return nullptr;
9024   };
9025   for (const auto &Entry : SinkAfter) {
9026     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9027     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9028 
9029     auto *TargetRegion = GetReplicateRegion(Target);
9030     auto *SinkRegion = GetReplicateRegion(Sink);
9031     if (!SinkRegion) {
9032       // If the sink source is not a replicate region, sink the recipe directly.
9033       if (TargetRegion) {
9034         // The target is in a replication region, make sure to move Sink to
9035         // the block after it, not into the replication region itself.
9036         VPBasicBlock *NextBlock =
9037             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9038         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9039       } else
9040         Sink->moveAfter(Target);
9041       continue;
9042     }
9043 
9044     // The sink source is in a replicate region. Unhook the region from the CFG.
9045     auto *SinkPred = SinkRegion->getSinglePredecessor();
9046     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9047     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9048     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9049     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9050 
9051     if (TargetRegion) {
9052       // The target recipe is also in a replicate region, move the sink region
9053       // after the target region.
9054       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9055       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9056       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9057       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9058     } else {
9059       // The sink source is in a replicate region, we need to move the whole
9060       // replicate region, which should only contain a single recipe in the
9061       // main block.
9062       auto *SplitBlock =
9063           Target->getParent()->splitAt(std::next(Target->getIterator()));
9064 
9065       auto *SplitPred = SplitBlock->getSinglePredecessor();
9066 
9067       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9068       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9069       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9070     }
9071   }
9072 
9073   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9074   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9075 
9076   // Now that sink-after is done, move induction recipes for optimized truncates
9077   // to the phi section of the header block.
9078   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9079     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9080 
9081   // Adjust the recipes for any inloop reductions.
9082   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
9083                              RecipeBuilder, Range.Start);
9084 
9085   // Introduce a recipe to combine the incoming and previous values of a
9086   // fixed-order recurrence.
9087   for (VPRecipeBase &R :
9088        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9089     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9090     if (!RecurPhi)
9091       continue;
9092 
9093     VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe();
9094     // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
9095     // to terminate.
9096     while (auto *PrevPhi =
9097                dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe))
9098       PrevRecipe = &PrevPhi->getBackedgeRecipe();
9099     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9100     auto *Region = GetReplicateRegion(PrevRecipe);
9101     if (Region)
9102       InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
9103     if (!InsertBlock) {
9104       InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
9105       VPBlockUtils::insertBlockAfter(InsertBlock, Region);
9106     }
9107     if (Region || PrevRecipe->isPhi())
9108       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9109     else
9110       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9111 
9112     auto *RecurSplice = cast<VPInstruction>(
9113         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9114                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9115 
9116     RecurPhi->replaceAllUsesWith(RecurSplice);
9117     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9118     // all users.
9119     RecurSplice->setOperand(0, RecurPhi);
9120   }
9121 
9122   // Interleave memory: for each Interleave Group we marked earlier as relevant
9123   // for this VPlan, replace the Recipes widening its memory instructions with a
9124   // single VPInterleaveRecipe at its insertion point.
9125   for (const auto *IG : InterleaveGroups) {
9126     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9127         RecipeBuilder.getRecipe(IG->getInsertPos()));
9128     SmallVector<VPValue *, 4> StoredValues;
9129     for (unsigned i = 0; i < IG->getFactor(); ++i)
9130       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9131         auto *StoreR =
9132             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9133         StoredValues.push_back(StoreR->getStoredValue());
9134       }
9135 
9136     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9137                                         Recipe->getMask());
9138     VPIG->insertBefore(Recipe);
9139     unsigned J = 0;
9140     for (unsigned i = 0; i < IG->getFactor(); ++i)
9141       if (Instruction *Member = IG->getMember(i)) {
9142         if (!Member->getType()->isVoidTy()) {
9143           VPValue *OriginalV = Plan->getVPValue(Member);
9144           Plan->removeVPValueFor(Member);
9145           Plan->addVPValue(Member, VPIG->getVPValue(J));
9146           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9147           J++;
9148         }
9149         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9150       }
9151   }
9152 
9153   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9154        VF *= 2)
9155     Plan->addVF(VF);
9156   Plan->setName("Initial VPlan");
9157 
9158   // From this point onwards, VPlan-to-VPlan transformations may change the plan
9159   // in ways that accessing values using original IR values is incorrect.
9160   Plan->disableValue2VPValue();
9161 
9162   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9163   VPlanTransforms::removeDeadRecipes(*Plan);
9164 
9165   bool ShouldSimplify = true;
9166   while (ShouldSimplify) {
9167     ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
9168     ShouldSimplify |=
9169         VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
9170     ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9171   }
9172 
9173   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9174   VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9175 
9176   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9177   return Plan;
9178 }
9179 
9180 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9181   // Outer loop handling: They may require CFG and instruction level
9182   // transformations before even evaluating whether vectorization is profitable.
9183   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9184   // the vectorization pipeline.
9185   assert(!OrigLoop->isInnermost());
9186   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9187 
9188   // Create new empty VPlan
9189   auto Plan = std::make_unique<VPlan>();
9190 
9191   // Build hierarchical CFG
9192   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9193   HCFGBuilder.buildHierarchicalCFG();
9194 
9195   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9196        VF *= 2)
9197     Plan->addVF(VF);
9198 
9199   SmallPtrSet<Instruction *, 1> DeadInstructions;
9200   VPlanTransforms::VPInstructionsToVPRecipes(
9201       OrigLoop, Plan,
9202       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9203       DeadInstructions, *PSE.getSE(), *TLI);
9204 
9205   // Remove the existing terminator of the exiting block of the top-most region.
9206   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9207   auto *Term =
9208       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9209   Term->eraseFromParent();
9210 
9211   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9212                         true, CM.useActiveLaneMaskForControlFlow());
9213   return Plan;
9214 }
9215 
9216 // Adjust the recipes for reductions. For in-loop reductions the chain of
9217 // instructions leading from the loop exit instr to the phi need to be converted
9218 // to reductions, with one operand being vector and the other being the scalar
9219 // reduction chain. For other reductions, a select is introduced between the phi
9220 // and live-out recipes when folding the tail.
9221 void LoopVectorizationPlanner::adjustRecipesForReductions(
9222     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9223     ElementCount MinVF) {
9224   for (const auto &Reduction : CM.getInLoopReductionChains()) {
9225     PHINode *Phi = Reduction.first;
9226     const RecurrenceDescriptor &RdxDesc =
9227         Legal->getReductionVars().find(Phi)->second;
9228     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9229 
9230     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9231       continue;
9232 
9233     // ReductionOperations are orders top-down from the phi's use to the
9234     // LoopExitValue. We keep a track of the previous item (the Chain) to tell
9235     // which of the two operands will remain scalar and which will be reduced.
9236     // For minmax the chain will be the select instructions.
9237     Instruction *Chain = Phi;
9238     for (Instruction *R : ReductionOperations) {
9239       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9240       RecurKind Kind = RdxDesc.getRecurrenceKind();
9241 
9242       VPValue *ChainOp = Plan->getVPValue(Chain);
9243       unsigned FirstOpId;
9244       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9245              "Only min/max recurrences allowed for inloop reductions");
9246       // Recognize a call to the llvm.fmuladd intrinsic.
9247       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9248       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9249              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9250       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9251         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9252                "Expected to replace a VPWidenSelectSC");
9253         FirstOpId = 1;
9254       } else {
9255         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9256                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9257                "Expected to replace a VPWidenSC");
9258         FirstOpId = 0;
9259       }
9260       unsigned VecOpId =
9261           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9262       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9263 
9264       VPValue *CondOp = nullptr;
9265       if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
9266         VPBuilder::InsertPointGuard Guard(Builder);
9267         Builder.setInsertPoint(WidenRecipe->getParent(),
9268                                WidenRecipe->getIterator());
9269         CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan);
9270       }
9271 
9272       if (IsFMulAdd) {
9273         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9274         // need to create an fmul recipe to use as the vector operand for the
9275         // fadd reduction.
9276         VPInstruction *FMulRecipe = new VPInstruction(
9277             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9278         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9279         WidenRecipe->getParent()->insert(FMulRecipe,
9280                                          WidenRecipe->getIterator());
9281         VecOp = FMulRecipe;
9282       }
9283       VPReductionRecipe *RedRecipe =
9284           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9285       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9286       Plan->removeVPValueFor(R);
9287       Plan->addVPValue(R, RedRecipe);
9288       // Append the recipe to the end of the VPBasicBlock because we need to
9289       // ensure that it comes after all of it's inputs, including CondOp.
9290       WidenRecipe->getParent()->appendRecipe(RedRecipe);
9291       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9292       WidenRecipe->eraseFromParent();
9293 
9294       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9295         VPRecipeBase *CompareRecipe =
9296             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9297         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9298                "Expected to replace a VPWidenSC");
9299         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9300                "Expected no remaining users");
9301         CompareRecipe->eraseFromParent();
9302       }
9303       Chain = R;
9304     }
9305   }
9306 
9307   // If tail is folded by masking, introduce selects between the phi
9308   // and the live-out instruction of each reduction, at the beginning of the
9309   // dedicated latch block.
9310   if (CM.foldTailByMasking()) {
9311     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9312     for (VPRecipeBase &R :
9313          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9314       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9315       if (!PhiR || PhiR->isInLoop())
9316         continue;
9317       VPValue *Cond =
9318           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9319       VPValue *Red = PhiR->getBackedgeValue();
9320       assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
9321              "reduction recipe must be defined before latch");
9322       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9323     }
9324   }
9325 }
9326 
9327 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9328 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9329                                VPSlotTracker &SlotTracker) const {
9330   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9331   IG->getInsertPos()->printAsOperand(O, false);
9332   O << ", ";
9333   getAddr()->printAsOperand(O, SlotTracker);
9334   VPValue *Mask = getMask();
9335   if (Mask) {
9336     O << ", ";
9337     Mask->printAsOperand(O, SlotTracker);
9338   }
9339 
9340   unsigned OpIdx = 0;
9341   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9342     if (!IG->getMember(i))
9343       continue;
9344     if (getNumStoreOperands() > 0) {
9345       O << "\n" << Indent << "  store ";
9346       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9347       O << " to index " << i;
9348     } else {
9349       O << "\n" << Indent << "  ";
9350       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9351       O << " = load from index " << i;
9352     }
9353     ++OpIdx;
9354   }
9355 }
9356 #endif
9357 
9358 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9359   assert(!State.Instance && "Int or FP induction being replicated.");
9360 
9361   Value *Start = getStartValue()->getLiveInIRValue();
9362   const InductionDescriptor &ID = getInductionDescriptor();
9363   TruncInst *Trunc = getTruncInst();
9364   IRBuilderBase &Builder = State.Builder;
9365   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9366   assert(State.VF.isVector() && "must have vector VF");
9367 
9368   // The value from the original loop to which we are mapping the new induction
9369   // variable.
9370   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9371 
9372   // Fast-math-flags propagate from the original induction instruction.
9373   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9374   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9375     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9376 
9377   // Now do the actual transformations, and start with fetching the step value.
9378   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9379 
9380   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9381          "Expected either an induction phi-node or a truncate of it!");
9382 
9383   // Construct the initial value of the vector IV in the vector loop preheader
9384   auto CurrIP = Builder.saveIP();
9385   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9386   Builder.SetInsertPoint(VectorPH->getTerminator());
9387   if (isa<TruncInst>(EntryVal)) {
9388     assert(Start->getType()->isIntegerTy() &&
9389            "Truncation requires an integer type");
9390     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9391     Step = Builder.CreateTrunc(Step, TruncType);
9392     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9393   }
9394 
9395   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9396   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9397   Value *SteppedStart = getStepVector(
9398       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9399 
9400   // We create vector phi nodes for both integer and floating-point induction
9401   // variables. Here, we determine the kind of arithmetic we will perform.
9402   Instruction::BinaryOps AddOp;
9403   Instruction::BinaryOps MulOp;
9404   if (Step->getType()->isIntegerTy()) {
9405     AddOp = Instruction::Add;
9406     MulOp = Instruction::Mul;
9407   } else {
9408     AddOp = ID.getInductionOpcode();
9409     MulOp = Instruction::FMul;
9410   }
9411 
9412   // Multiply the vectorization factor by the step using integer or
9413   // floating-point arithmetic as appropriate.
9414   Type *StepType = Step->getType();
9415   Value *RuntimeVF;
9416   if (Step->getType()->isFloatingPointTy())
9417     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9418   else
9419     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9420   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9421 
9422   // Create a vector splat to use in the induction update.
9423   //
9424   // FIXME: If the step is non-constant, we create the vector splat with
9425   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9426   //        handle a constant vector splat.
9427   Value *SplatVF = isa<Constant>(Mul)
9428                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9429                        : Builder.CreateVectorSplat(State.VF, Mul);
9430   Builder.restoreIP(CurrIP);
9431 
9432   // We may need to add the step a number of times, depending on the unroll
9433   // factor. The last of those goes into the PHI.
9434   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9435                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9436   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9437   Instruction *LastInduction = VecInd;
9438   for (unsigned Part = 0; Part < State.UF; ++Part) {
9439     State.set(this, LastInduction, Part);
9440 
9441     if (isa<TruncInst>(EntryVal))
9442       State.addMetadata(LastInduction, EntryVal);
9443 
9444     LastInduction = cast<Instruction>(
9445         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9446     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9447   }
9448 
9449   LastInduction->setName("vec.ind.next");
9450   VecInd->addIncoming(SteppedStart, VectorPH);
9451   // Add induction update using an incorrect block temporarily. The phi node
9452   // will be fixed after VPlan execution. Note that at this point the latch
9453   // block cannot be used, as it does not exist yet.
9454   // TODO: Model increment value in VPlan, by turning the recipe into a
9455   // multi-def and a subclass of VPHeaderPHIRecipe.
9456   VecInd->addIncoming(LastInduction, VectorPH);
9457 }
9458 
9459 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9460   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9461          "Not a pointer induction according to InductionDescriptor!");
9462   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9463          "Unexpected type.");
9464 
9465   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9466   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9467 
9468   if (onlyScalarsGenerated(State.VF)) {
9469     // This is the normalized GEP that starts counting at zero.
9470     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9471         CanonicalIV, IndDesc.getStep()->getType());
9472     // Determine the number of scalars we need to generate for each unroll
9473     // iteration. If the instruction is uniform, we only need to generate the
9474     // first lane. Otherwise, we generate all VF values.
9475     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9476     assert((IsUniform || !State.VF.isScalable()) &&
9477            "Cannot scalarize a scalable VF");
9478     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9479 
9480     for (unsigned Part = 0; Part < State.UF; ++Part) {
9481       Value *PartStart =
9482           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9483 
9484       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9485         Value *Idx = State.Builder.CreateAdd(
9486             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9487         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9488 
9489         Value *Step = State.get(getOperand(1), VPIteration(0, Part));
9490         Value *SclrGep = emitTransformedIndex(
9491             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9492         SclrGep->setName("next.gep");
9493         State.set(this, SclrGep, VPIteration(Part, Lane));
9494       }
9495     }
9496     return;
9497   }
9498 
9499   assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9500          "Induction step not a SCEV constant!");
9501   Type *PhiType = IndDesc.getStep()->getType();
9502 
9503   // Build a pointer phi
9504   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9505   Type *ScStValueType = ScalarStartValue->getType();
9506   PHINode *NewPointerPhi =
9507       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9508 
9509   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9510   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9511 
9512   // A pointer induction, performed by using a gep
9513   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9514 
9515   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9516   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9517   Value *NumUnrolledElems =
9518       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9519   Value *InductionGEP = GetElementPtrInst::Create(
9520       IndDesc.getElementType(), NewPointerPhi,
9521       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9522       InductionLoc);
9523   // Add induction update using an incorrect block temporarily. The phi node
9524   // will be fixed after VPlan execution. Note that at this point the latch
9525   // block cannot be used, as it does not exist yet.
9526   // TODO: Model increment value in VPlan, by turning the recipe into a
9527   // multi-def and a subclass of VPHeaderPHIRecipe.
9528   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9529 
9530   // Create UF many actual address geps that use the pointer
9531   // phi as base and a vectorized version of the step value
9532   // (<step*0, ..., step*N>) as offset.
9533   for (unsigned Part = 0; Part < State.UF; ++Part) {
9534     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9535     Value *StartOffsetScalar =
9536         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9537     Value *StartOffset =
9538         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9539     // Create a vector of consecutive numbers from zero to VF.
9540     StartOffset = State.Builder.CreateAdd(
9541         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9542 
9543     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) &&
9544            "scalar step must be the same across all parts");
9545     Value *GEP = State.Builder.CreateGEP(
9546         IndDesc.getElementType(), NewPointerPhi,
9547         State.Builder.CreateMul(
9548             StartOffset,
9549             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9550             "vector.gep"));
9551     State.set(this, GEP, Part);
9552   }
9553 }
9554 
9555 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9556   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9557 
9558   // Fast-math-flags propagate from the original induction instruction.
9559   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9560   if (IndDesc.getInductionBinOp() &&
9561       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9562     State.Builder.setFastMathFlags(
9563         IndDesc.getInductionBinOp()->getFastMathFlags());
9564 
9565   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9566   Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9567   Value *DerivedIV =
9568       emitTransformedIndex(State.Builder, CanonicalIV,
9569                            getStartValue()->getLiveInIRValue(), Step, IndDesc);
9570   DerivedIV->setName("offset.idx");
9571   if (ResultTy != DerivedIV->getType()) {
9572     assert(Step->getType()->isIntegerTy() &&
9573            "Truncation requires an integer step");
9574     DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy);
9575   }
9576   assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9577 
9578   State.set(this, DerivedIV, VPIteration(0, 0));
9579 }
9580 
9581 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9582   // Fast-math-flags propagate from the original induction instruction.
9583   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9584   if (IndDesc.getInductionBinOp() &&
9585       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9586     State.Builder.setFastMathFlags(
9587         IndDesc.getInductionBinOp()->getFastMathFlags());
9588 
9589   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
9590   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9591 
9592   buildScalarSteps(BaseIV, Step, IndDesc, this, State);
9593 }
9594 
9595 void VPInterleaveRecipe::execute(VPTransformState &State) {
9596   assert(!State.Instance && "Interleave group being replicated.");
9597   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9598                                       getStoredValues(), getMask());
9599 }
9600 
9601 void VPReductionRecipe::execute(VPTransformState &State) {
9602   assert(!State.Instance && "Reduction being replicated.");
9603   Value *PrevInChain = State.get(getChainOp(), 0);
9604   RecurKind Kind = RdxDesc->getRecurrenceKind();
9605   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9606   // Propagate the fast-math flags carried by the underlying instruction.
9607   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9608   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9609   for (unsigned Part = 0; Part < State.UF; ++Part) {
9610     Value *NewVecOp = State.get(getVecOp(), Part);
9611     if (VPValue *Cond = getCondOp()) {
9612       Value *NewCond = State.get(Cond, Part);
9613       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9614       Value *Iden = RdxDesc->getRecurrenceIdentity(
9615           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9616       Value *IdenVec =
9617           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9618       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9619       NewVecOp = Select;
9620     }
9621     Value *NewRed;
9622     Value *NextInChain;
9623     if (IsOrdered) {
9624       if (State.VF.isVector())
9625         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9626                                         PrevInChain);
9627       else
9628         NewRed = State.Builder.CreateBinOp(
9629             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9630             NewVecOp);
9631       PrevInChain = NewRed;
9632     } else {
9633       PrevInChain = State.get(getChainOp(), Part);
9634       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9635     }
9636     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9637       NextInChain =
9638           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9639                          NewRed, PrevInChain);
9640     } else if (IsOrdered)
9641       NextInChain = NewRed;
9642     else
9643       NextInChain = State.Builder.CreateBinOp(
9644           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9645           PrevInChain);
9646     State.set(this, NextInChain, Part);
9647   }
9648 }
9649 
9650 void VPReplicateRecipe::execute(VPTransformState &State) {
9651   Instruction *UI = getUnderlyingInstr();
9652   if (State.Instance) { // Generate a single instance.
9653     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9654     State.ILV->scalarizeInstruction(UI, this, *State.Instance,
9655                                     IsPredicated, State);
9656     // Insert scalar instance packing it into a vector.
9657     if (AlsoPack && State.VF.isVector()) {
9658       // If we're constructing lane 0, initialize to start from poison.
9659       if (State.Instance->Lane.isFirstLane()) {
9660         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9661         Value *Poison = PoisonValue::get(
9662             VectorType::get(UI->getType(), State.VF));
9663         State.set(this, Poison, State.Instance->Part);
9664       }
9665       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9666     }
9667     return;
9668   }
9669 
9670   if (IsUniform) {
9671     // If the recipe is uniform across all parts (instead of just per VF), only
9672     // generate a single instance.
9673     if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9674         all_of(operands(), [](VPValue *Op) {
9675           return Op->isDefinedOutsideVectorRegions();
9676         })) {
9677       State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
9678                                       State);
9679       if (user_begin() != user_end()) {
9680         for (unsigned Part = 1; Part < State.UF; ++Part)
9681           State.set(this, State.get(this, VPIteration(0, 0)),
9682                     VPIteration(Part, 0));
9683       }
9684       return;
9685     }
9686 
9687     // Uniform within VL means we need to generate lane 0 only for each
9688     // unrolled copy.
9689     for (unsigned Part = 0; Part < State.UF; ++Part)
9690       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
9691                                       IsPredicated, State);
9692     return;
9693   }
9694 
9695   // A store of a loop varying value to a loop invariant address only
9696   // needs only the last copy of the store.
9697   if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) {
9698     auto Lane = VPLane::getLastLaneForVF(State.VF);
9699     State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated,
9700                                     State);
9701     return;
9702   }
9703 
9704   // Generate scalar instances for all VF lanes of all UF parts.
9705   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9706   const unsigned EndLane = State.VF.getKnownMinValue();
9707   for (unsigned Part = 0; Part < State.UF; ++Part)
9708     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9709       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
9710                                       IsPredicated, State);
9711 }
9712 
9713 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9714   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9715 
9716   // Attempt to issue a wide load.
9717   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9718   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9719 
9720   assert((LI || SI) && "Invalid Load/Store instruction");
9721   assert((!SI || StoredValue) && "No stored value provided for widened store");
9722   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9723 
9724   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9725 
9726   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9727   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9728   bool CreateGatherScatter = !Consecutive;
9729 
9730   auto &Builder = State.Builder;
9731   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9732   bool isMaskRequired = getMask();
9733   if (isMaskRequired)
9734     for (unsigned Part = 0; Part < State.UF; ++Part)
9735       BlockInMaskParts[Part] = State.get(getMask(), Part);
9736 
9737   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9738     // Calculate the pointer for the specific unroll-part.
9739     GetElementPtrInst *PartPtr = nullptr;
9740 
9741     bool InBounds = false;
9742     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9743       InBounds = gep->isInBounds();
9744     if (Reverse) {
9745       // If the address is consecutive but reversed, then the
9746       // wide store needs to start at the last vector element.
9747       // RunTimeVF =  VScale * VF.getKnownMinValue()
9748       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
9749       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9750       // NumElt = -Part * RunTimeVF
9751       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9752       // LastLane = 1 - RunTimeVF
9753       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9754       PartPtr =
9755           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9756       PartPtr->setIsInBounds(InBounds);
9757       PartPtr = cast<GetElementPtrInst>(
9758           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9759       PartPtr->setIsInBounds(InBounds);
9760       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9761         BlockInMaskParts[Part] =
9762             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9763     } else {
9764       Value *Increment =
9765           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9766       PartPtr = cast<GetElementPtrInst>(
9767           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9768       PartPtr->setIsInBounds(InBounds);
9769     }
9770 
9771     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9772     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9773   };
9774 
9775   // Handle Stores:
9776   if (SI) {
9777     State.setDebugLocFromInst(SI);
9778 
9779     for (unsigned Part = 0; Part < State.UF; ++Part) {
9780       Instruction *NewSI = nullptr;
9781       Value *StoredVal = State.get(StoredValue, Part);
9782       if (CreateGatherScatter) {
9783         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9784         Value *VectorGep = State.get(getAddr(), Part);
9785         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9786                                             MaskPart);
9787       } else {
9788         if (Reverse) {
9789           // If we store to reverse consecutive memory locations, then we need
9790           // to reverse the order of elements in the stored value.
9791           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9792           // We don't want to update the value in the map as it might be used in
9793           // another expression. So don't call resetVectorValue(StoredVal).
9794         }
9795         auto *VecPtr =
9796             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9797         if (isMaskRequired)
9798           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9799                                             BlockInMaskParts[Part]);
9800         else
9801           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9802       }
9803       State.addMetadata(NewSI, SI);
9804     }
9805     return;
9806   }
9807 
9808   // Handle loads.
9809   assert(LI && "Must have a load instruction");
9810   State.setDebugLocFromInst(LI);
9811   for (unsigned Part = 0; Part < State.UF; ++Part) {
9812     Value *NewLI;
9813     if (CreateGatherScatter) {
9814       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9815       Value *VectorGep = State.get(getAddr(), Part);
9816       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9817                                          nullptr, "wide.masked.gather");
9818       State.addMetadata(NewLI, LI);
9819     } else {
9820       auto *VecPtr =
9821           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9822       if (isMaskRequired)
9823         NewLI = Builder.CreateMaskedLoad(
9824             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9825             PoisonValue::get(DataTy), "wide.masked.load");
9826       else
9827         NewLI =
9828             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9829 
9830       // Add metadata to the load, but setVectorValue to the reverse shuffle.
9831       State.addMetadata(NewLI, LI);
9832       if (Reverse)
9833         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9834     }
9835 
9836     State.set(getVPSingleValue(), NewLI, Part);
9837   }
9838 }
9839 
9840 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9841 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9842 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9843 // for predication.
9844 static ScalarEpilogueLowering getScalarEpilogueLowering(
9845     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9846     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9847     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9848     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9849   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9850   // don't look at hints or options, and don't request a scalar epilogue.
9851   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9852   // LoopAccessInfo (due to code dependency and not being able to reliably get
9853   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9854   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9855   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9856   // back to the old way and vectorize with versioning when forced. See D81345.)
9857   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9858                                                       PGSOQueryType::IRPass) &&
9859                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9860     return CM_ScalarEpilogueNotAllowedOptSize;
9861 
9862   // 2) If set, obey the directives
9863   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9864     switch (PreferPredicateOverEpilogue) {
9865     case PreferPredicateTy::ScalarEpilogue:
9866       return CM_ScalarEpilogueAllowed;
9867     case PreferPredicateTy::PredicateElseScalarEpilogue:
9868       return CM_ScalarEpilogueNotNeededUsePredicate;
9869     case PreferPredicateTy::PredicateOrDontVectorize:
9870       return CM_ScalarEpilogueNotAllowedUsePredicate;
9871     };
9872   }
9873 
9874   // 3) If set, obey the hints
9875   switch (Hints.getPredicate()) {
9876   case LoopVectorizeHints::FK_Enabled:
9877     return CM_ScalarEpilogueNotNeededUsePredicate;
9878   case LoopVectorizeHints::FK_Disabled:
9879     return CM_ScalarEpilogueAllowed;
9880   };
9881 
9882   // 4) if the TTI hook indicates this is profitable, request predication.
9883   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
9884     return CM_ScalarEpilogueNotNeededUsePredicate;
9885 
9886   return CM_ScalarEpilogueAllowed;
9887 }
9888 
9889 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9890   // If Values have been set for this Def return the one relevant for \p Part.
9891   if (hasVectorValue(Def, Part))
9892     return Data.PerPartOutput[Def][Part];
9893 
9894   if (!hasScalarValue(Def, {Part, 0})) {
9895     Value *IRV = Def->getLiveInIRValue();
9896     Value *B = ILV->getBroadcastInstrs(IRV);
9897     set(Def, B, Part);
9898     return B;
9899   }
9900 
9901   Value *ScalarValue = get(Def, {Part, 0});
9902   // If we aren't vectorizing, we can just copy the scalar map values over
9903   // to the vector map.
9904   if (VF.isScalar()) {
9905     set(Def, ScalarValue, Part);
9906     return ScalarValue;
9907   }
9908 
9909   bool IsUniform = vputils::isUniformAfterVectorization(Def);
9910 
9911   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9912   // Check if there is a scalar value for the selected lane.
9913   if (!hasScalarValue(Def, {Part, LastLane})) {
9914     // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform.
9915     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
9916             isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
9917            "unexpected recipe found to be invariant");
9918     IsUniform = true;
9919     LastLane = 0;
9920   }
9921 
9922   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9923   // Set the insert point after the last scalarized instruction or after the
9924   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9925   // will directly follow the scalar definitions.
9926   auto OldIP = Builder.saveIP();
9927   auto NewIP =
9928       isa<PHINode>(LastInst)
9929           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9930           : std::next(BasicBlock::iterator(LastInst));
9931   Builder.SetInsertPoint(&*NewIP);
9932 
9933   // However, if we are vectorizing, we need to construct the vector values.
9934   // If the value is known to be uniform after vectorization, we can just
9935   // broadcast the scalar value corresponding to lane zero for each unroll
9936   // iteration. Otherwise, we construct the vector values using
9937   // insertelement instructions. Since the resulting vectors are stored in
9938   // State, we will only generate the insertelements once.
9939   Value *VectorValue = nullptr;
9940   if (IsUniform) {
9941     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9942     set(Def, VectorValue, Part);
9943   } else {
9944     // Initialize packing with insertelements to start from undef.
9945     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9946     Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9947     set(Def, Undef, Part);
9948     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9949       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9950     VectorValue = get(Def, Part);
9951   }
9952   Builder.restoreIP(OldIP);
9953   return VectorValue;
9954 }
9955 
9956 // Process the loop in the VPlan-native vectorization path. This path builds
9957 // VPlan upfront in the vectorization pipeline, which allows to apply
9958 // VPlan-to-VPlan transformations from the very beginning without modifying the
9959 // input LLVM IR.
9960 static bool processLoopInVPlanNativePath(
9961     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9962     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9963     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9964     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9965     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9966     LoopVectorizationRequirements &Requirements) {
9967 
9968   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9969     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9970     return false;
9971   }
9972   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9973   Function *F = L->getHeader()->getParent();
9974   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9975 
9976   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9977       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
9978 
9979   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9980                                 &Hints, IAI);
9981   // Use the planner for outer loop vectorization.
9982   // TODO: CM is not used at this point inside the planner. Turn CM into an
9983   // optional argument if we don't need it in the future.
9984   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
9985 
9986   // Get user vectorization factor.
9987   ElementCount UserVF = Hints.getWidth();
9988 
9989   CM.collectElementTypesForWidening();
9990 
9991   // Plan how to best vectorize, return the best VF and its cost.
9992   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9993 
9994   // If we are stress testing VPlan builds, do not attempt to generate vector
9995   // code. Masked vector code generation support will follow soon.
9996   // Also, do not attempt to vectorize if no vector code will be produced.
9997   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9998     return false;
9999 
10000   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10001 
10002   {
10003     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10004                              F->getParent()->getDataLayout());
10005     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10006                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
10007     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10008                       << L->getHeader()->getParent()->getName() << "\"\n");
10009     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10010   }
10011 
10012   // Mark the loop as already vectorized to avoid vectorizing again.
10013   Hints.setAlreadyVectorized();
10014   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10015   return true;
10016 }
10017 
10018 // Emit a remark if there are stores to floats that required a floating point
10019 // extension. If the vectorized loop was generated with floating point there
10020 // will be a performance penalty from the conversion overhead and the change in
10021 // the vector width.
10022 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10023   SmallVector<Instruction *, 4> Worklist;
10024   for (BasicBlock *BB : L->getBlocks()) {
10025     for (Instruction &Inst : *BB) {
10026       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10027         if (S->getValueOperand()->getType()->isFloatTy())
10028           Worklist.push_back(S);
10029       }
10030     }
10031   }
10032 
10033   // Traverse the floating point stores upwards searching, for floating point
10034   // conversions.
10035   SmallPtrSet<const Instruction *, 4> Visited;
10036   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10037   while (!Worklist.empty()) {
10038     auto *I = Worklist.pop_back_val();
10039     if (!L->contains(I))
10040       continue;
10041     if (!Visited.insert(I).second)
10042       continue;
10043 
10044     // Emit a remark if the floating point store required a floating
10045     // point conversion.
10046     // TODO: More work could be done to identify the root cause such as a
10047     // constant or a function return type and point the user to it.
10048     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10049       ORE->emit([&]() {
10050         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10051                                           I->getDebugLoc(), L->getHeader())
10052                << "floating point conversion changes vector width. "
10053                << "Mixed floating point precision requires an up/down "
10054                << "cast that will negatively impact performance.";
10055       });
10056 
10057     for (Use &Op : I->operands())
10058       if (auto *OpI = dyn_cast<Instruction>(Op))
10059         Worklist.push_back(OpI);
10060   }
10061 }
10062 
10063 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10064                                        VectorizationFactor &VF,
10065                                        std::optional<unsigned> VScale, Loop *L,
10066                                        ScalarEvolution &SE) {
10067   InstructionCost CheckCost = Checks.getCost();
10068   if (!CheckCost.isValid())
10069     return false;
10070 
10071   // When interleaving only scalar and vector cost will be equal, which in turn
10072   // would lead to a divide by 0. Fall back to hard threshold.
10073   if (VF.Width.isScalar()) {
10074     if (CheckCost > VectorizeMemoryCheckThreshold) {
10075       LLVM_DEBUG(
10076           dbgs()
10077           << "LV: Interleaving only is not profitable due to runtime checks\n");
10078       return false;
10079     }
10080     return true;
10081   }
10082 
10083   // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
10084   double ScalarC = *VF.ScalarCost.getValue();
10085   if (ScalarC == 0)
10086     return true;
10087 
10088   // First, compute the minimum iteration count required so that the vector
10089   // loop outperforms the scalar loop.
10090   //  The total cost of the scalar loop is
10091   //   ScalarC * TC
10092   //  where
10093   //  * TC is the actual trip count of the loop.
10094   //  * ScalarC is the cost of a single scalar iteration.
10095   //
10096   //  The total cost of the vector loop is
10097   //    RtC + VecC * (TC / VF) + EpiC
10098   //  where
10099   //  * RtC is the cost of the generated runtime checks
10100   //  * VecC is the cost of a single vector iteration.
10101   //  * TC is the actual trip count of the loop
10102   //  * VF is the vectorization factor
10103   //  * EpiCost is the cost of the generated epilogue, including the cost
10104   //    of the remaining scalar operations.
10105   //
10106   // Vectorization is profitable once the total vector cost is less than the
10107   // total scalar cost:
10108   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10109   //
10110   // Now we can compute the minimum required trip count TC as
10111   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
10112   //
10113   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10114   // the computations are performed on doubles, not integers and the result
10115   // is rounded up, hence we get an upper estimate of the TC.
10116   unsigned IntVF = VF.Width.getKnownMinValue();
10117   if (VF.Width.isScalable()) {
10118     unsigned AssumedMinimumVscale = 1;
10119     if (VScale)
10120       AssumedMinimumVscale = *VScale;
10121     IntVF *= AssumedMinimumVscale;
10122   }
10123   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
10124   double RtC = *CheckCost.getValue();
10125   double MinTC1 = RtC / (ScalarC - VecCOverVF);
10126 
10127   // Second, compute a minimum iteration count so that the cost of the
10128   // runtime checks is only a fraction of the total scalar loop cost. This
10129   // adds a loop-dependent bound on the overhead incurred if the runtime
10130   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10131   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10132   // cost, compute
10133   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10134   double MinTC2 = RtC * 10 / ScalarC;
10135 
10136   // Now pick the larger minimum. If it is not a multiple of VF, choose the
10137   // next closest multiple of VF. This should partly compensate for ignoring
10138   // the epilogue cost.
10139   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10140   VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
10141 
10142   LLVM_DEBUG(
10143       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10144              << VF.MinProfitableTripCount << "\n");
10145 
10146   // Skip vectorization if the expected trip count is less than the minimum
10147   // required trip count.
10148   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
10149     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10150                                 VF.MinProfitableTripCount)) {
10151       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10152                            "trip count < minimum profitable VF ("
10153                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10154                         << ")\n");
10155 
10156       return false;
10157     }
10158   }
10159   return true;
10160 }
10161 
10162 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10163     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10164                                !EnableLoopInterleaving),
10165       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10166                               !EnableLoopVectorization) {}
10167 
10168 bool LoopVectorizePass::processLoop(Loop *L) {
10169   assert((EnableVPlanNativePath || L->isInnermost()) &&
10170          "VPlan-native path is not enabled. Only process inner loops.");
10171 
10172 #ifndef NDEBUG
10173   const std::string DebugLocStr = getDebugLocString(L);
10174 #endif /* NDEBUG */
10175 
10176   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10177                     << L->getHeader()->getParent()->getName() << "' from "
10178                     << DebugLocStr << "\n");
10179 
10180   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10181 
10182   LLVM_DEBUG(
10183       dbgs() << "LV: Loop hints:"
10184              << " force="
10185              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10186                      ? "disabled"
10187                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10188                             ? "enabled"
10189                             : "?"))
10190              << " width=" << Hints.getWidth()
10191              << " interleave=" << Hints.getInterleave() << "\n");
10192 
10193   // Function containing loop
10194   Function *F = L->getHeader()->getParent();
10195 
10196   // Looking at the diagnostic output is the only way to determine if a loop
10197   // was vectorized (other than looking at the IR or machine code), so it
10198   // is important to generate an optimization remark for each loop. Most of
10199   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10200   // generated as OptimizationRemark and OptimizationRemarkMissed are
10201   // less verbose reporting vectorized loops and unvectorized loops that may
10202   // benefit from vectorization, respectively.
10203 
10204   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10205     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10206     return false;
10207   }
10208 
10209   PredicatedScalarEvolution PSE(*SE, *L);
10210 
10211   // Check if it is legal to vectorize the loop.
10212   LoopVectorizationRequirements Requirements;
10213   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10214                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10215   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10216     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10217     Hints.emitRemarkWithHints();
10218     return false;
10219   }
10220 
10221   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10222   // here. They may require CFG and instruction level transformations before
10223   // even evaluating whether vectorization is profitable. Since we cannot modify
10224   // the incoming IR, we need to build VPlan upfront in the vectorization
10225   // pipeline.
10226   if (!L->isInnermost())
10227     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10228                                         ORE, BFI, PSI, Hints, Requirements);
10229 
10230   assert(L->isInnermost() && "Inner loop expected.");
10231 
10232   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10233   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10234 
10235   // If an override option has been passed in for interleaved accesses, use it.
10236   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10237     UseInterleaved = EnableInterleavedMemAccesses;
10238 
10239   // Analyze interleaved memory accesses.
10240   if (UseInterleaved)
10241     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10242 
10243   // Check the function attributes and profiles to find out if this function
10244   // should be optimized for size.
10245   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10246       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
10247 
10248   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10249   // count by optimizing for size, to minimize overheads.
10250   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10251   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10252     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10253                       << "This loop is worth vectorizing only if no scalar "
10254                       << "iteration overheads are incurred.");
10255     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10256       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10257     else {
10258       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10259         LLVM_DEBUG(dbgs() << "\n");
10260         SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10261       } else {
10262         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10263                              "small to consider vectorizing.\n");
10264         reportVectorizationFailure(
10265             "The trip count is below the minial threshold value.",
10266             "loop trip count is too low, avoiding vectorization",
10267             "LowTripCount", ORE, L);
10268         Hints.emitRemarkWithHints();
10269         return false;
10270       }
10271     }
10272   }
10273 
10274   // Check the function attributes to see if implicit floats or vectors are
10275   // allowed.
10276   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10277     reportVectorizationFailure(
10278         "Can't vectorize when the NoImplicitFloat attribute is used",
10279         "loop not vectorized due to NoImplicitFloat attribute",
10280         "NoImplicitFloat", ORE, L);
10281     Hints.emitRemarkWithHints();
10282     return false;
10283   }
10284 
10285   // Check if the target supports potentially unsafe FP vectorization.
10286   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10287   // for the target we're vectorizing for, to make sure none of the
10288   // additional fp-math flags can help.
10289   if (Hints.isPotentiallyUnsafe() &&
10290       TTI->isFPVectorizationPotentiallyUnsafe()) {
10291     reportVectorizationFailure(
10292         "Potentially unsafe FP op prevents vectorization",
10293         "loop not vectorized due to unsafe FP support.",
10294         "UnsafeFP", ORE, L);
10295     Hints.emitRemarkWithHints();
10296     return false;
10297   }
10298 
10299   bool AllowOrderedReductions;
10300   // If the flag is set, use that instead and override the TTI behaviour.
10301   if (ForceOrderedReductions.getNumOccurrences() > 0)
10302     AllowOrderedReductions = ForceOrderedReductions;
10303   else
10304     AllowOrderedReductions = TTI->enableOrderedReductions();
10305   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10306     ORE->emit([&]() {
10307       auto *ExactFPMathInst = Requirements.getExactFPInst();
10308       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10309                                                  ExactFPMathInst->getDebugLoc(),
10310                                                  ExactFPMathInst->getParent())
10311              << "loop not vectorized: cannot prove it is safe to reorder "
10312                 "floating-point operations";
10313     });
10314     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10315                          "reorder floating-point operations\n");
10316     Hints.emitRemarkWithHints();
10317     return false;
10318   }
10319 
10320   // Use the cost model.
10321   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10322                                 F, &Hints, IAI);
10323   CM.collectValuesToIgnore();
10324   CM.collectElementTypesForWidening();
10325 
10326   // Use the planner for vectorization.
10327   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
10328 
10329   // Get user vectorization factor and interleave count.
10330   ElementCount UserVF = Hints.getWidth();
10331   unsigned UserIC = Hints.getInterleave();
10332 
10333   // Plan how to best vectorize, return the best VF and its cost.
10334   std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10335 
10336   VectorizationFactor VF = VectorizationFactor::Disabled();
10337   unsigned IC = 1;
10338 
10339   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10340                            F->getParent()->getDataLayout());
10341   if (MaybeVF) {
10342     VF = *MaybeVF;
10343     // Select the interleave count.
10344     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10345 
10346     unsigned SelectedIC = std::max(IC, UserIC);
10347     //  Optimistically generate runtime checks if they are needed. Drop them if
10348     //  they turn out to not be profitable.
10349     if (VF.Width.isVector() || SelectedIC > 1)
10350       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10351 
10352     // Check if it is profitable to vectorize with runtime checks.
10353     bool ForceVectorization =
10354         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10355     if (!ForceVectorization &&
10356         !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
10357                                     *PSE.getSE())) {
10358       ORE->emit([&]() {
10359         return OptimizationRemarkAnalysisAliasing(
10360                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10361                    L->getHeader())
10362                << "loop not vectorized: cannot prove it is safe to reorder "
10363                   "memory operations";
10364       });
10365       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10366       Hints.emitRemarkWithHints();
10367       return false;
10368     }
10369   }
10370 
10371   // Identify the diagnostic messages that should be produced.
10372   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10373   bool VectorizeLoop = true, InterleaveLoop = true;
10374   if (VF.Width.isScalar()) {
10375     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10376     VecDiagMsg = std::make_pair(
10377         "VectorizationNotBeneficial",
10378         "the cost-model indicates that vectorization is not beneficial");
10379     VectorizeLoop = false;
10380   }
10381 
10382   if (!MaybeVF && UserIC > 1) {
10383     // Tell the user interleaving was avoided up-front, despite being explicitly
10384     // requested.
10385     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10386                          "interleaving should be avoided up front\n");
10387     IntDiagMsg = std::make_pair(
10388         "InterleavingAvoided",
10389         "Ignoring UserIC, because interleaving was avoided up front");
10390     InterleaveLoop = false;
10391   } else if (IC == 1 && UserIC <= 1) {
10392     // Tell the user interleaving is not beneficial.
10393     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10394     IntDiagMsg = std::make_pair(
10395         "InterleavingNotBeneficial",
10396         "the cost-model indicates that interleaving is not beneficial");
10397     InterleaveLoop = false;
10398     if (UserIC == 1) {
10399       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10400       IntDiagMsg.second +=
10401           " and is explicitly disabled or interleave count is set to 1";
10402     }
10403   } else if (IC > 1 && UserIC == 1) {
10404     // Tell the user interleaving is beneficial, but it explicitly disabled.
10405     LLVM_DEBUG(
10406         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10407     IntDiagMsg = std::make_pair(
10408         "InterleavingBeneficialButDisabled",
10409         "the cost-model indicates that interleaving is beneficial "
10410         "but is explicitly disabled or interleave count is set to 1");
10411     InterleaveLoop = false;
10412   }
10413 
10414   // Override IC if user provided an interleave count.
10415   IC = UserIC > 0 ? UserIC : IC;
10416 
10417   // Emit diagnostic messages, if any.
10418   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10419   if (!VectorizeLoop && !InterleaveLoop) {
10420     // Do not vectorize or interleaving the loop.
10421     ORE->emit([&]() {
10422       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10423                                       L->getStartLoc(), L->getHeader())
10424              << VecDiagMsg.second;
10425     });
10426     ORE->emit([&]() {
10427       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10428                                       L->getStartLoc(), L->getHeader())
10429              << IntDiagMsg.second;
10430     });
10431     return false;
10432   } else if (!VectorizeLoop && InterleaveLoop) {
10433     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10434     ORE->emit([&]() {
10435       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10436                                         L->getStartLoc(), L->getHeader())
10437              << VecDiagMsg.second;
10438     });
10439   } else if (VectorizeLoop && !InterleaveLoop) {
10440     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10441                       << ") in " << DebugLocStr << '\n');
10442     ORE->emit([&]() {
10443       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10444                                         L->getStartLoc(), L->getHeader())
10445              << IntDiagMsg.second;
10446     });
10447   } else if (VectorizeLoop && InterleaveLoop) {
10448     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10449                       << ") in " << DebugLocStr << '\n');
10450     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10451   }
10452 
10453   bool DisableRuntimeUnroll = false;
10454   MDNode *OrigLoopID = L->getLoopID();
10455   {
10456     using namespace ore;
10457     if (!VectorizeLoop) {
10458       assert(IC > 1 && "interleave count should not be 1 or 0");
10459       // If we decided that it is not legal to vectorize the loop, then
10460       // interleave it.
10461       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10462                                  &CM, BFI, PSI, Checks);
10463 
10464       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10465       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10466 
10467       ORE->emit([&]() {
10468         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10469                                   L->getHeader())
10470                << "interleaved loop (interleaved count: "
10471                << NV("InterleaveCount", IC) << ")";
10472       });
10473     } else {
10474       // If we decided that it is *legal* to vectorize the loop, then do it.
10475 
10476       // Consider vectorizing the epilogue too if it's profitable.
10477       VectorizationFactor EpilogueVF =
10478           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10479       if (EpilogueVF.Width.isVector()) {
10480 
10481         // The first pass vectorizes the main loop and creates a scalar epilogue
10482         // to be vectorized by executing the plan (potentially with a different
10483         // factor) again shortly afterwards.
10484         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10485         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10486                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10487 
10488         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10489         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10490                         DT, true);
10491         ++LoopsVectorized;
10492 
10493         // Second pass vectorizes the epilogue and adjusts the control flow
10494         // edges from the first pass.
10495         EPI.MainLoopVF = EPI.EpilogueVF;
10496         EPI.MainLoopUF = EPI.EpilogueUF;
10497         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10498                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10499                                                  Checks);
10500 
10501         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10502         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10503         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10504         Header->setName("vec.epilog.vector.body");
10505 
10506         // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10507         // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10508         // before vectorizing the epilogue loop.
10509         for (VPRecipeBase &R : Header->phis()) {
10510           if (isa<VPCanonicalIVPHIRecipe>(&R))
10511             continue;
10512 
10513           Value *ResumeV = nullptr;
10514           // TODO: Move setting of resume values to prepareToExecute.
10515           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10516             ResumeV = MainILV.getReductionResumeValue(
10517                 ReductionPhi->getRecurrenceDescriptor());
10518           } else {
10519             // Create induction resume values for both widened pointer and
10520             // integer/fp inductions and update the start value of the induction
10521             // recipes to use the resume value.
10522             PHINode *IndPhi = nullptr;
10523             const InductionDescriptor *ID;
10524             if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10525               IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10526               ID = &Ind->getInductionDescriptor();
10527             } else {
10528               auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10529               IndPhi = WidenInd->getPHINode();
10530               ID = &WidenInd->getInductionDescriptor();
10531             }
10532 
10533             ResumeV = MainILV.createInductionResumeValue(
10534                 IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
10535           }
10536           assert(ResumeV && "Must have a resume value");
10537           VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
10538           cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10539         }
10540 
10541         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10542                         DT, true);
10543         ++LoopsEpilogueVectorized;
10544 
10545         if (!MainILV.areSafetyChecksAdded())
10546           DisableRuntimeUnroll = true;
10547       } else {
10548         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10549                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10550                                PSI, Checks);
10551 
10552         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10553         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10554         ++LoopsVectorized;
10555 
10556         // Add metadata to disable runtime unrolling a scalar loop when there
10557         // are no runtime checks about strides and memory. A scalar loop that is
10558         // rarely used is not worth unrolling.
10559         if (!LB.areSafetyChecksAdded())
10560           DisableRuntimeUnroll = true;
10561       }
10562       // Report the vectorization decision.
10563       ORE->emit([&]() {
10564         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10565                                   L->getHeader())
10566                << "vectorized loop (vectorization width: "
10567                << NV("VectorizationFactor", VF.Width)
10568                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10569       });
10570     }
10571 
10572     if (ORE->allowExtraAnalysis(LV_NAME))
10573       checkMixedPrecision(L, ORE);
10574   }
10575 
10576   std::optional<MDNode *> RemainderLoopID =
10577       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10578                                       LLVMLoopVectorizeFollowupEpilogue});
10579   if (RemainderLoopID) {
10580     L->setLoopID(*RemainderLoopID);
10581   } else {
10582     if (DisableRuntimeUnroll)
10583       AddRuntimeUnrollDisableMetaData(L);
10584 
10585     // Mark the loop as already vectorized to avoid vectorizing again.
10586     Hints.setAlreadyVectorized();
10587   }
10588 
10589   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10590   return true;
10591 }
10592 
10593 LoopVectorizeResult LoopVectorizePass::runImpl(
10594     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10595     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10596     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10597     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10598   SE = &SE_;
10599   LI = &LI_;
10600   TTI = &TTI_;
10601   DT = &DT_;
10602   BFI = &BFI_;
10603   TLI = TLI_;
10604   AC = &AC_;
10605   LAIs = &LAIs_;
10606   DB = &DB_;
10607   ORE = &ORE_;
10608   PSI = PSI_;
10609 
10610   // Don't attempt if
10611   // 1. the target claims to have no vector registers, and
10612   // 2. interleaving won't help ILP.
10613   //
10614   // The second condition is necessary because, even if the target has no
10615   // vector registers, loop vectorization may still enable scalar
10616   // interleaving.
10617   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10618       TTI->getMaxInterleaveFactor(1) < 2)
10619     return LoopVectorizeResult(false, false);
10620 
10621   bool Changed = false, CFGChanged = false;
10622 
10623   // The vectorizer requires loops to be in simplified form.
10624   // Since simplification may add new inner loops, it has to run before the
10625   // legality and profitability checks. This means running the loop vectorizer
10626   // will simplify all loops, regardless of whether anything end up being
10627   // vectorized.
10628   for (const auto &L : *LI)
10629     Changed |= CFGChanged |=
10630         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10631 
10632   // Build up a worklist of inner-loops to vectorize. This is necessary as
10633   // the act of vectorizing or partially unrolling a loop creates new loops
10634   // and can invalidate iterators across the loops.
10635   SmallVector<Loop *, 8> Worklist;
10636 
10637   for (Loop *L : *LI)
10638     collectSupportedLoops(*L, LI, ORE, Worklist);
10639 
10640   LoopsAnalyzed += Worklist.size();
10641 
10642   // Now walk the identified inner loops.
10643   while (!Worklist.empty()) {
10644     Loop *L = Worklist.pop_back_val();
10645 
10646     // For the inner loops we actually process, form LCSSA to simplify the
10647     // transform.
10648     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10649 
10650     Changed |= CFGChanged |= processLoop(L);
10651 
10652     if (Changed)
10653       LAIs->clear();
10654   }
10655 
10656   // Process each loop nest in the function.
10657   return LoopVectorizeResult(Changed, CFGChanged);
10658 }
10659 
10660 PreservedAnalyses LoopVectorizePass::run(Function &F,
10661                                          FunctionAnalysisManager &AM) {
10662     auto &LI = AM.getResult<LoopAnalysis>(F);
10663     // There are no loops in the function. Return before computing other expensive
10664     // analyses.
10665     if (LI.empty())
10666       return PreservedAnalyses::all();
10667     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10668     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10669     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10670     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10671     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10672     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10673     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10674     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10675 
10676     LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10677     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10678     ProfileSummaryInfo *PSI =
10679         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10680     LoopVectorizeResult Result =
10681         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10682     if (!Result.MadeAnyChange)
10683       return PreservedAnalyses::all();
10684     PreservedAnalyses PA;
10685 
10686     // We currently do not preserve loopinfo/dominator analyses with outer loop
10687     // vectorization. Until this is addressed, mark these analyses as preserved
10688     // only for non-VPlan-native path.
10689     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10690     if (!EnableVPlanNativePath) {
10691       PA.preserve<LoopAnalysis>();
10692       PA.preserve<DominatorTreeAnalysis>();
10693     }
10694 
10695     if (Result.MadeCFGChange) {
10696       // Making CFG changes likely means a loop got vectorized. Indicate that
10697       // extra simplification passes should be run.
10698       // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10699       // be run if runtime checks have been added.
10700       AM.getResult<ShouldRunExtraVectorPasses>(F);
10701       PA.preserve<ShouldRunExtraVectorPasses>();
10702     } else {
10703       PA.preserveSet<CFGAnalyses>();
10704     }
10705     return PA;
10706 }
10707 
10708 void LoopVectorizePass::printPipeline(
10709     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10710   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10711       OS, MapClassName2PassName);
10712 
10713   OS << "<";
10714   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10715   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10716   OS << ">";
10717 }
10718