xref: /freebsd-src/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (revision dbbaf77801a8f30e49731395e85757f339f345bf)
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SmallPtrSet.h"
72 #include "llvm/ADT/SmallSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
77 #include "llvm/ADT/iterator_range.h"
78 #include "llvm/Analysis/AssumptionCache.h"
79 #include "llvm/Analysis/BasicAliasAnalysis.h"
80 #include "llvm/Analysis/BlockFrequencyInfo.h"
81 #include "llvm/Analysis/CFG.h"
82 #include "llvm/Analysis/CodeMetrics.h"
83 #include "llvm/Analysis/DemandedBits.h"
84 #include "llvm/Analysis/GlobalsModRef.h"
85 #include "llvm/Analysis/LoopAccessAnalysis.h"
86 #include "llvm/Analysis/LoopAnalysisManager.h"
87 #include "llvm/Analysis/LoopInfo.h"
88 #include "llvm/Analysis/LoopIterator.h"
89 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
90 #include "llvm/Analysis/ProfileSummaryInfo.h"
91 #include "llvm/Analysis/ScalarEvolution.h"
92 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
93 #include "llvm/Analysis/TargetLibraryInfo.h"
94 #include "llvm/Analysis/TargetTransformInfo.h"
95 #include "llvm/Analysis/ValueTracking.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/PatternMatch.h"
119 #include "llvm/IR/Type.h"
120 #include "llvm/IR/Use.h"
121 #include "llvm/IR/User.h"
122 #include "llvm/IR/Value.h"
123 #include "llvm/IR/ValueHandle.h"
124 #include "llvm/IR/Verifier.h"
125 #include "llvm/InitializePasses.h"
126 #include "llvm/Pass.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <map>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
200 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
201     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
202     cl::desc("The maximum allowed number of runtime memory checks"));
203 
204 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
205 // that predication is preferred, and this lists all options. I.e., the
206 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
207 // and predicate the instructions accordingly. If tail-folding fails, there are
208 // different fallback strategies depending on these values:
209 namespace PreferPredicateTy {
210   enum Option {
211     ScalarEpilogue = 0,
212     PredicateElseScalarEpilogue,
213     PredicateOrDontVectorize
214   };
215 } // namespace PreferPredicateTy
216 
217 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
218     "prefer-predicate-over-epilogue",
219     cl::init(PreferPredicateTy::ScalarEpilogue),
220     cl::Hidden,
221     cl::desc("Tail-folding and predication preferences over creating a scalar "
222              "epilogue loop."),
223     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
224                          "scalar-epilogue",
225                          "Don't tail-predicate loops, create scalar epilogue"),
226               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
227                          "predicate-else-scalar-epilogue",
228                          "prefer tail-folding, create scalar epilogue if tail "
229                          "folding fails."),
230               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
231                          "predicate-dont-vectorize",
232                          "prefers tail-folding, don't attempt vectorization if "
233                          "tail-folding fails.")));
234 
235 static cl::opt<bool> MaximizeBandwidth(
236     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
237     cl::desc("Maximize bandwidth when selecting vectorization factor which "
238              "will be determined by the smallest type in loop."));
239 
240 static cl::opt<bool> EnableInterleavedMemAccesses(
241     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
242     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
243 
244 /// An interleave-group may need masking if it resides in a block that needs
245 /// predication, or in order to mask away gaps.
246 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
247     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
248     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
249 
250 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
251     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
252     cl::desc("We don't interleave loops with a estimated constant trip count "
253              "below this number"));
254 
255 static cl::opt<unsigned> ForceTargetNumScalarRegs(
256     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of scalar registers."));
258 
259 static cl::opt<unsigned> ForceTargetNumVectorRegs(
260     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's number of vector registers."));
262 
263 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
264     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
265     cl::desc("A flag that overrides the target's max interleave factor for "
266              "scalar loops."));
267 
268 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
269     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
270     cl::desc("A flag that overrides the target's max interleave factor for "
271              "vectorized loops."));
272 
273 static cl::opt<unsigned> ForceTargetInstructionCost(
274     "force-target-instruction-cost", cl::init(0), cl::Hidden,
275     cl::desc("A flag that overrides the target's expected cost for "
276              "an instruction to a single constant value. Mostly "
277              "useful for getting consistent testing."));
278 
279 static cl::opt<bool> ForceTargetSupportsScalableVectors(
280     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
281     cl::desc(
282         "Pretend that scalable vectors are supported, even if the target does "
283         "not support them. This flag should only be used for testing."));
284 
285 static cl::opt<unsigned> SmallLoopCost(
286     "small-loop-cost", cl::init(20), cl::Hidden,
287     cl::desc(
288         "The cost of a loop that is considered 'small' by the interleaver."));
289 
290 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
291     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
292     cl::desc("Enable the use of the block frequency analysis to access PGO "
293              "heuristics minimizing code growth in cold regions and being more "
294              "aggressive in hot regions."));
295 
296 // Runtime interleave loops for load/store throughput.
297 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
298     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
299     cl::desc(
300         "Enable runtime interleaving until load/store ports are saturated"));
301 
302 /// Interleave small loops with scalar reductions.
303 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
304     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
305     cl::desc("Enable interleaving for loops with small iteration counts that "
306              "contain scalar reductions to expose ILP."));
307 
308 /// The number of stores in a loop that are allowed to need predication.
309 static cl::opt<unsigned> NumberOfStoresToPredicate(
310     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
311     cl::desc("Max number of stores to be predicated behind an if."));
312 
313 static cl::opt<bool> EnableIndVarRegisterHeur(
314     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
315     cl::desc("Count the induction variable only once when interleaving"));
316 
317 static cl::opt<bool> EnableCondStoresVectorization(
318     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
319     cl::desc("Enable if predication of stores during vectorization."));
320 
321 static cl::opt<unsigned> MaxNestedScalarReductionIC(
322     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
323     cl::desc("The maximum interleave count to use when interleaving a scalar "
324              "reduction in a nested loop."));
325 
326 static cl::opt<bool>
327     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
328                            cl::Hidden,
329                            cl::desc("Prefer in-loop vector reductions, "
330                                     "overriding the targets preference."));
331 
332 static cl::opt<bool> ForceOrderedReductions(
333     "force-ordered-reductions", cl::init(false), cl::Hidden,
334     cl::desc("Enable the vectorisation of loops with in-order (strict) "
335              "FP reductions"));
336 
337 static cl::opt<bool> PreferPredicatedReductionSelect(
338     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
339     cl::desc(
340         "Prefer predicating a reduction operation over an after loop select."));
341 
342 cl::opt<bool> EnableVPlanNativePath(
343     "enable-vplan-native-path", cl::init(false), cl::Hidden,
344     cl::desc("Enable VPlan-native vectorization path with "
345              "support for outer loop vectorization."));
346 
347 // This flag enables the stress testing of the VPlan H-CFG construction in the
348 // VPlan-native vectorization path. It must be used in conjuction with
349 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
350 // verification of the H-CFGs built.
351 static cl::opt<bool> VPlanBuildStressTest(
352     "vplan-build-stress-test", cl::init(false), cl::Hidden,
353     cl::desc(
354         "Build VPlan for every supported loop nest in the function and bail "
355         "out right after the build (stress test the VPlan H-CFG construction "
356         "in the VPlan-native vectorization path)."));
357 
358 cl::opt<bool> llvm::EnableLoopInterleaving(
359     "interleave-loops", cl::init(true), cl::Hidden,
360     cl::desc("Enable loop interleaving in Loop vectorization passes"));
361 cl::opt<bool> llvm::EnableLoopVectorization(
362     "vectorize-loops", cl::init(true), cl::Hidden,
363     cl::desc("Run the Loop vectorization passes"));
364 
365 cl::opt<bool> PrintVPlansInDotFormat(
366     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
367     cl::desc("Use dot format instead of plain text when dumping VPlans"));
368 
369 /// A helper function that returns true if the given type is irregular. The
370 /// type is irregular if its allocated size doesn't equal the store size of an
371 /// element of the corresponding vector type.
372 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
373   // Determine if an array of N elements of type Ty is "bitcast compatible"
374   // with a <N x Ty> vector.
375   // This is only true if there is no padding between the array elements.
376   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
377 }
378 
379 /// A helper function that returns the reciprocal of the block probability of
380 /// predicated blocks. If we return X, we are assuming the predicated block
381 /// will execute once for every X iterations of the loop header.
382 ///
383 /// TODO: We should use actual block probability here, if available. Currently,
384 ///       we always assume predicated blocks have a 50% chance of executing.
385 static unsigned getReciprocalPredBlockProb() { return 2; }
386 
387 /// A helper function that returns an integer or floating-point constant with
388 /// value C.
389 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
390   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
391                            : ConstantFP::get(Ty, C);
392 }
393 
394 /// Returns "best known" trip count for the specified loop \p L as defined by
395 /// the following procedure:
396 ///   1) Returns exact trip count if it is known.
397 ///   2) Returns expected trip count according to profile data if any.
398 ///   3) Returns upper bound estimate if it is known.
399 ///   4) Returns None if all of the above failed.
400 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
401   // Check if exact trip count is known.
402   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
403     return ExpectedTC;
404 
405   // Check if there is an expected trip count available from profile data.
406   if (LoopVectorizeWithBlockFrequency)
407     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
408       return EstimatedTC;
409 
410   // Check if upper bound estimate is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
412     return ExpectedTC;
413 
414   return None;
415 }
416 
417 // Forward declare GeneratedRTChecks.
418 class GeneratedRTChecks;
419 
420 namespace llvm {
421 
422 AnalysisKey ShouldRunExtraVectorPasses::Key;
423 
424 /// InnerLoopVectorizer vectorizes loops which contain only one basic
425 /// block to a specified vectorization factor (VF).
426 /// This class performs the widening of scalars into vectors, or multiple
427 /// scalars. This class also implements the following features:
428 /// * It inserts an epilogue loop for handling loops that don't have iteration
429 ///   counts that are known to be a multiple of the vectorization factor.
430 /// * It handles the code generation for reduction variables.
431 /// * Scalarization (implementation using scalars) of un-vectorizable
432 ///   instructions.
433 /// InnerLoopVectorizer does not perform any vectorization-legality
434 /// checks, and relies on the caller to check for the different legality
435 /// aspects. The InnerLoopVectorizer relies on the
436 /// LoopVectorizationLegality class to provide information about the induction
437 /// and reduction variables that were found to a given vectorization factor.
438 class InnerLoopVectorizer {
439 public:
440   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
441                       LoopInfo *LI, DominatorTree *DT,
442                       const TargetLibraryInfo *TLI,
443                       const TargetTransformInfo *TTI, AssumptionCache *AC,
444                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
445                       ElementCount MinProfitableTripCount,
446                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
447                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
448                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
449       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
450         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
451         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
452         PSI(PSI), RTChecks(RTChecks) {
453     // Query this against the original loop and save it here because the profile
454     // of the original loop header may change as the transformation happens.
455     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
456         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
457 
458     if (MinProfitableTripCount.isZero())
459       this->MinProfitableTripCount = VecWidth;
460     else
461       this->MinProfitableTripCount = MinProfitableTripCount;
462   }
463 
464   virtual ~InnerLoopVectorizer() = default;
465 
466   /// Create a new empty loop that will contain vectorized instructions later
467   /// on, while the old loop will be used as the scalar remainder. Control flow
468   /// is generated around the vectorized (and scalar epilogue) loops consisting
469   /// of various checks and bypasses. Return the pre-header block of the new
470   /// loop and the start value for the canonical induction, if it is != 0. The
471   /// latter is the case when vectorizing the epilogue loop. In the case of
472   /// epilogue vectorization, this function is overriden to handle the more
473   /// complex control flow around the loops.
474   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
475 
476   /// Widen a single call instruction within the innermost loop.
477   void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
478                             VPTransformState &State);
479 
480   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
481   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
482 
483   // Return true if any runtime check is added.
484   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
485 
486   /// A type for vectorized values in the new loop. Each value from the
487   /// original loop, when vectorized, is represented by UF vector values in the
488   /// new unrolled loop, where UF is the unroll factor.
489   using VectorParts = SmallVector<Value *, 2>;
490 
491   /// A helper function to scalarize a single Instruction in the innermost loop.
492   /// Generates a sequence of scalar instances for each lane between \p MinLane
493   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
494   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
495   /// Instr's operands.
496   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
497                             const VPIteration &Instance, bool IfPredicateInstr,
498                             VPTransformState &State);
499 
500   /// Construct the vector value of a scalarized value \p V one lane at a time.
501   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
502                                  VPTransformState &State);
503 
504   /// Try to vectorize interleaved access group \p Group with the base address
505   /// given in \p Addr, optionally masking the vector operations if \p
506   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
507   /// values in the vectorized loop.
508   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
509                                 ArrayRef<VPValue *> VPDefs,
510                                 VPTransformState &State, VPValue *Addr,
511                                 ArrayRef<VPValue *> StoredValues,
512                                 VPValue *BlockInMask = nullptr);
513 
514   /// Fix the non-induction PHIs in \p Plan.
515   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
516 
517   /// Returns true if the reordering of FP operations is not allowed, but we are
518   /// able to vectorize with strict in-order reductions for the given RdxDesc.
519   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
520 
521   /// Create a broadcast instruction. This method generates a broadcast
522   /// instruction (shuffle) for loop invariant values and for the induction
523   /// value. If this is the induction variable then we extend it to N, N+1, ...
524   /// this is needed because each iteration in the loop corresponds to a SIMD
525   /// element.
526   virtual Value *getBroadcastInstrs(Value *V);
527 
528   // Returns the resume value (bc.merge.rdx) for a reduction as
529   // generated by fixReduction.
530   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
531 
532 protected:
533   friend class LoopVectorizationPlanner;
534 
535   /// A small list of PHINodes.
536   using PhiVector = SmallVector<PHINode *, 4>;
537 
538   /// A type for scalarized values in the new loop. Each value from the
539   /// original loop, when scalarized, is represented by UF x VF scalar values
540   /// in the new unrolled loop, where UF is the unroll factor and VF is the
541   /// vectorization factor.
542   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
543 
544   /// Set up the values of the IVs correctly when exiting the vector loop.
545   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
546                     Value *VectorTripCount, Value *EndValue,
547                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
548                     VPlan &Plan);
549 
550   /// Handle all cross-iteration phis in the header.
551   void fixCrossIterationPHIs(VPTransformState &State);
552 
553   /// Create the exit value of first order recurrences in the middle block and
554   /// update their users.
555   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
556                                VPTransformState &State);
557 
558   /// Create code for the loop exit value of the reduction.
559   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
560 
561   /// Clear NSW/NUW flags from reduction instructions if necessary.
562   void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
563                                VPTransformState &State);
564 
565   /// Iteratively sink the scalarized operands of a predicated instruction into
566   /// the block that was created for it.
567   void sinkScalarOperands(Instruction *PredInst);
568 
569   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
570   /// represented as.
571   void truncateToMinimalBitwidths(VPTransformState &State);
572 
573   /// Returns (and creates if needed) the original loop trip count.
574   Value *getOrCreateTripCount(BasicBlock *InsertBlock);
575 
576   /// Returns (and creates if needed) the trip count of the widened loop.
577   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
578 
579   /// Returns a bitcasted value to the requested vector type.
580   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
581   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
582                                 const DataLayout &DL);
583 
584   /// Emit a bypass check to see if the vector trip count is zero, including if
585   /// it overflows.
586   void emitIterationCountCheck(BasicBlock *Bypass);
587 
588   /// Emit a bypass check to see if all of the SCEV assumptions we've
589   /// had to make are correct. Returns the block containing the checks or
590   /// nullptr if no checks have been added.
591   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
592 
593   /// Emit bypass checks to check any memory assumptions we may have made.
594   /// Returns the block containing the checks or nullptr if no checks have been
595   /// added.
596   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
597 
598   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
599   /// vector loop preheader, middle block and scalar preheader.
600   void createVectorLoopSkeleton(StringRef Prefix);
601 
602   /// Create new phi nodes for the induction variables to resume iteration count
603   /// in the scalar epilogue, from where the vectorized loop left off.
604   /// In cases where the loop skeleton is more complicated (eg. epilogue
605   /// vectorization) and the resume values can come from an additional bypass
606   /// block, the \p AdditionalBypass pair provides information about the bypass
607   /// block and the end value on the edge from bypass to this loop.
608   void createInductionResumeValues(
609       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
610 
611   /// Complete the loop skeleton by adding debug MDs, creating appropriate
612   /// conditional branches in the middle block, preparing the builder and
613   /// running the verifier. Return the preheader of the completed vector loop.
614   BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
615 
616   /// Collect poison-generating recipes that may generate a poison value that is
617   /// used after vectorization, even when their operands are not poison. Those
618   /// recipes meet the following conditions:
619   ///  * Contribute to the address computation of a recipe generating a widen
620   ///    memory load/store (VPWidenMemoryInstructionRecipe or
621   ///    VPInterleaveRecipe).
622   ///  * Such a widen memory load/store has at least one underlying Instruction
623   ///    that is in a basic block that needs predication and after vectorization
624   ///    the generated instruction won't be predicated.
625   void collectPoisonGeneratingRecipes(VPTransformState &State);
626 
627   /// Allow subclasses to override and print debug traces before/after vplan
628   /// execution, when trace information is requested.
629   virtual void printDebugTracesAtStart(){};
630   virtual void printDebugTracesAtEnd(){};
631 
632   /// The original loop.
633   Loop *OrigLoop;
634 
635   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
636   /// dynamic knowledge to simplify SCEV expressions and converts them to a
637   /// more usable form.
638   PredicatedScalarEvolution &PSE;
639 
640   /// Loop Info.
641   LoopInfo *LI;
642 
643   /// Dominator Tree.
644   DominatorTree *DT;
645 
646   /// Alias Analysis.
647   AAResults *AA;
648 
649   /// Target Library Info.
650   const TargetLibraryInfo *TLI;
651 
652   /// Target Transform Info.
653   const TargetTransformInfo *TTI;
654 
655   /// Assumption Cache.
656   AssumptionCache *AC;
657 
658   /// Interface to emit optimization remarks.
659   OptimizationRemarkEmitter *ORE;
660 
661   /// The vectorization SIMD factor to use. Each vector will have this many
662   /// vector elements.
663   ElementCount VF;
664 
665   ElementCount MinProfitableTripCount;
666 
667   /// The vectorization unroll factor to use. Each scalar is vectorized to this
668   /// many different vector instructions.
669   unsigned UF;
670 
671   /// The builder that we use
672   IRBuilder<> Builder;
673 
674   // --- Vectorization state ---
675 
676   /// The vector-loop preheader.
677   BasicBlock *LoopVectorPreHeader;
678 
679   /// The scalar-loop preheader.
680   BasicBlock *LoopScalarPreHeader;
681 
682   /// Middle Block between the vector and the scalar.
683   BasicBlock *LoopMiddleBlock;
684 
685   /// The unique ExitBlock of the scalar loop if one exists.  Note that
686   /// there can be multiple exiting edges reaching this block.
687   BasicBlock *LoopExitBlock;
688 
689   /// The scalar loop body.
690   BasicBlock *LoopScalarBody;
691 
692   /// A list of all bypass blocks. The first block is the entry of the loop.
693   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
694 
695   /// Store instructions that were predicated.
696   SmallVector<Instruction *, 4> PredicatedInstructions;
697 
698   /// Trip count of the original loop.
699   Value *TripCount = nullptr;
700 
701   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
702   Value *VectorTripCount = nullptr;
703 
704   /// The legality analysis.
705   LoopVectorizationLegality *Legal;
706 
707   /// The profitablity analysis.
708   LoopVectorizationCostModel *Cost;
709 
710   // Record whether runtime checks are added.
711   bool AddedSafetyChecks = false;
712 
713   // Holds the end values for each induction variable. We save the end values
714   // so we can later fix-up the external users of the induction variables.
715   DenseMap<PHINode *, Value *> IVEndValues;
716 
717   /// BFI and PSI are used to check for profile guided size optimizations.
718   BlockFrequencyInfo *BFI;
719   ProfileSummaryInfo *PSI;
720 
721   // Whether this loop should be optimized for size based on profile guided size
722   // optimizatios.
723   bool OptForSizeBasedOnProfile;
724 
725   /// Structure to hold information about generated runtime checks, responsible
726   /// for cleaning the checks, if vectorization turns out unprofitable.
727   GeneratedRTChecks &RTChecks;
728 
729   // Holds the resume values for reductions in the loops, used to set the
730   // correct start value of reduction PHIs when vectorizing the epilogue.
731   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
732       ReductionResumeValues;
733 };
734 
735 class InnerLoopUnroller : public InnerLoopVectorizer {
736 public:
737   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
738                     LoopInfo *LI, DominatorTree *DT,
739                     const TargetLibraryInfo *TLI,
740                     const TargetTransformInfo *TTI, AssumptionCache *AC,
741                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
742                     LoopVectorizationLegality *LVL,
743                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
744                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
745       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
746                             ElementCount::getFixed(1),
747                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
748                             BFI, PSI, Check) {}
749 
750 private:
751   Value *getBroadcastInstrs(Value *V) override;
752 };
753 
754 /// Encapsulate information regarding vectorization of a loop and its epilogue.
755 /// This information is meant to be updated and used across two stages of
756 /// epilogue vectorization.
757 struct EpilogueLoopVectorizationInfo {
758   ElementCount MainLoopVF = ElementCount::getFixed(0);
759   unsigned MainLoopUF = 0;
760   ElementCount EpilogueVF = ElementCount::getFixed(0);
761   unsigned EpilogueUF = 0;
762   BasicBlock *MainLoopIterationCountCheck = nullptr;
763   BasicBlock *EpilogueIterationCountCheck = nullptr;
764   BasicBlock *SCEVSafetyCheck = nullptr;
765   BasicBlock *MemSafetyCheck = nullptr;
766   Value *TripCount = nullptr;
767   Value *VectorTripCount = nullptr;
768 
769   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
770                                 ElementCount EVF, unsigned EUF)
771       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
772     assert(EUF == 1 &&
773            "A high UF for the epilogue loop is likely not beneficial.");
774   }
775 };
776 
777 /// An extension of the inner loop vectorizer that creates a skeleton for a
778 /// vectorized loop that has its epilogue (residual) also vectorized.
779 /// The idea is to run the vplan on a given loop twice, firstly to setup the
780 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
781 /// from the first step and vectorize the epilogue.  This is achieved by
782 /// deriving two concrete strategy classes from this base class and invoking
783 /// them in succession from the loop vectorizer planner.
784 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
785 public:
786   InnerLoopAndEpilogueVectorizer(
787       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
788       DominatorTree *DT, const TargetLibraryInfo *TLI,
789       const TargetTransformInfo *TTI, AssumptionCache *AC,
790       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
791       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
792       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
793       GeneratedRTChecks &Checks)
794       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
795                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
796                             CM, BFI, PSI, Checks),
797         EPI(EPI) {}
798 
799   // Override this function to handle the more complex control flow around the
800   // three loops.
801   std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
802     return createEpilogueVectorizedLoopSkeleton();
803   }
804 
805   /// The interface for creating a vectorized skeleton using one of two
806   /// different strategies, each corresponding to one execution of the vplan
807   /// as described above.
808   virtual std::pair<BasicBlock *, Value *>
809   createEpilogueVectorizedLoopSkeleton() = 0;
810 
811   /// Holds and updates state information required to vectorize the main loop
812   /// and its epilogue in two separate passes. This setup helps us avoid
813   /// regenerating and recomputing runtime safety checks. It also helps us to
814   /// shorten the iteration-count-check path length for the cases where the
815   /// iteration count of the loop is so small that the main vector loop is
816   /// completely skipped.
817   EpilogueLoopVectorizationInfo &EPI;
818 };
819 
820 /// A specialized derived class of inner loop vectorizer that performs
821 /// vectorization of *main* loops in the process of vectorizing loops and their
822 /// epilogues.
823 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
824 public:
825   EpilogueVectorizerMainLoop(
826       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
827       DominatorTree *DT, const TargetLibraryInfo *TLI,
828       const TargetTransformInfo *TTI, AssumptionCache *AC,
829       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
830       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
831       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
832       GeneratedRTChecks &Check)
833       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
834                                        EPI, LVL, CM, BFI, PSI, Check) {}
835   /// Implements the interface for creating a vectorized skeleton using the
836   /// *main loop* strategy (ie the first pass of vplan execution).
837   std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
838 
839 protected:
840   /// Emits an iteration count bypass check once for the main loop (when \p
841   /// ForEpilogue is false) and once for the epilogue loop (when \p
842   /// ForEpilogue is true).
843   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
844   void printDebugTracesAtStart() override;
845   void printDebugTracesAtEnd() override;
846 };
847 
848 // A specialized derived class of inner loop vectorizer that performs
849 // vectorization of *epilogue* loops in the process of vectorizing loops and
850 // their epilogues.
851 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
852 public:
853   EpilogueVectorizerEpilogueLoop(
854       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
855       DominatorTree *DT, const TargetLibraryInfo *TLI,
856       const TargetTransformInfo *TTI, AssumptionCache *AC,
857       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
858       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
859       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
860       GeneratedRTChecks &Checks)
861       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
862                                        EPI, LVL, CM, BFI, PSI, Checks) {
863     TripCount = EPI.TripCount;
864   }
865   /// Implements the interface for creating a vectorized skeleton using the
866   /// *epilogue loop* strategy (ie the second pass of vplan execution).
867   std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
868 
869 protected:
870   /// Emits an iteration count bypass check after the main vector loop has
871   /// finished to see if there are any iterations left to execute by either
872   /// the vector epilogue or the scalar epilogue.
873   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
874                                                       BasicBlock *Bypass,
875                                                       BasicBlock *Insert);
876   void printDebugTracesAtStart() override;
877   void printDebugTracesAtEnd() override;
878 };
879 } // end namespace llvm
880 
881 /// Look for a meaningful debug location on the instruction or it's
882 /// operands.
883 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
884   if (!I)
885     return I;
886 
887   DebugLoc Empty;
888   if (I->getDebugLoc() != Empty)
889     return I;
890 
891   for (Use &Op : I->operands()) {
892     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
893       if (OpInst->getDebugLoc() != Empty)
894         return OpInst;
895   }
896 
897   return I;
898 }
899 
900 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
901 /// is passed, the message relates to that particular instruction.
902 #ifndef NDEBUG
903 static void debugVectorizationMessage(const StringRef Prefix,
904                                       const StringRef DebugMsg,
905                                       Instruction *I) {
906   dbgs() << "LV: " << Prefix << DebugMsg;
907   if (I != nullptr)
908     dbgs() << " " << *I;
909   else
910     dbgs() << '.';
911   dbgs() << '\n';
912 }
913 #endif
914 
915 /// Create an analysis remark that explains why vectorization failed
916 ///
917 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
918 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
919 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
920 /// the location of the remark.  \return the remark object that can be
921 /// streamed to.
922 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
923     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
924   Value *CodeRegion = TheLoop->getHeader();
925   DebugLoc DL = TheLoop->getStartLoc();
926 
927   if (I) {
928     CodeRegion = I->getParent();
929     // If there is no debug location attached to the instruction, revert back to
930     // using the loop's.
931     if (I->getDebugLoc())
932       DL = I->getDebugLoc();
933   }
934 
935   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
936 }
937 
938 namespace llvm {
939 
940 /// Return a value for Step multiplied by VF.
941 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
942                        int64_t Step) {
943   assert(Ty->isIntegerTy() && "Expected an integer step");
944   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
945   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
946 }
947 
948 /// Return the runtime value for VF.
949 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
950   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
951   return VF.isScalable() ? B.CreateVScale(EC) : EC;
952 }
953 
954 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
955                                   ElementCount VF) {
956   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
957   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
958   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
959   return B.CreateUIToFP(RuntimeVF, FTy);
960 }
961 
962 void reportVectorizationFailure(const StringRef DebugMsg,
963                                 const StringRef OREMsg, const StringRef ORETag,
964                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
965                                 Instruction *I) {
966   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
967   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
968   ORE->emit(
969       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
970       << "loop not vectorized: " << OREMsg);
971 }
972 
973 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
974                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
975                              Instruction *I) {
976   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
977   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
978   ORE->emit(
979       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
980       << Msg);
981 }
982 
983 } // end namespace llvm
984 
985 #ifndef NDEBUG
986 /// \return string containing a file name and a line # for the given loop.
987 static std::string getDebugLocString(const Loop *L) {
988   std::string Result;
989   if (L) {
990     raw_string_ostream OS(Result);
991     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
992       LoopDbgLoc.print(OS);
993     else
994       // Just print the module name.
995       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
996     OS.flush();
997   }
998   return Result;
999 }
1000 #endif
1001 
1002 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1003     VPTransformState &State) {
1004 
1005   // Collect recipes in the backward slice of `Root` that may generate a poison
1006   // value that is used after vectorization.
1007   SmallPtrSet<VPRecipeBase *, 16> Visited;
1008   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1009     SmallVector<VPRecipeBase *, 16> Worklist;
1010     Worklist.push_back(Root);
1011 
1012     // Traverse the backward slice of Root through its use-def chain.
1013     while (!Worklist.empty()) {
1014       VPRecipeBase *CurRec = Worklist.back();
1015       Worklist.pop_back();
1016 
1017       if (!Visited.insert(CurRec).second)
1018         continue;
1019 
1020       // Prune search if we find another recipe generating a widen memory
1021       // instruction. Widen memory instructions involved in address computation
1022       // will lead to gather/scatter instructions, which don't need to be
1023       // handled.
1024       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1025           isa<VPInterleaveRecipe>(CurRec) ||
1026           isa<VPScalarIVStepsRecipe>(CurRec) ||
1027           isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1028           isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1029         continue;
1030 
1031       // This recipe contributes to the address computation of a widen
1032       // load/store. Collect recipe if its underlying instruction has
1033       // poison-generating flags.
1034       Instruction *Instr = CurRec->getUnderlyingInstr();
1035       if (Instr && Instr->hasPoisonGeneratingFlags())
1036         State.MayGeneratePoisonRecipes.insert(CurRec);
1037 
1038       // Add new definitions to the worklist.
1039       for (VPValue *operand : CurRec->operands())
1040         if (VPDef *OpDef = operand->getDef())
1041           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1042     }
1043   });
1044 
1045   // Traverse all the recipes in the VPlan and collect the poison-generating
1046   // recipes in the backward slice starting at the address of a VPWidenRecipe or
1047   // VPInterleaveRecipe.
1048   auto Iter = depth_first(
1049       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1050   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1051     for (VPRecipeBase &Recipe : *VPBB) {
1052       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1053         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1054         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1055         if (AddrDef && WidenRec->isConsecutive() &&
1056             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1057           collectPoisonGeneratingInstrsInBackwardSlice(
1058               cast<VPRecipeBase>(AddrDef));
1059       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1060         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1061         if (AddrDef) {
1062           // Check if any member of the interleave group needs predication.
1063           const InterleaveGroup<Instruction> *InterGroup =
1064               InterleaveRec->getInterleaveGroup();
1065           bool NeedPredication = false;
1066           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1067                I < NumMembers; ++I) {
1068             Instruction *Member = InterGroup->getMember(I);
1069             if (Member)
1070               NeedPredication |=
1071                   Legal->blockNeedsPredication(Member->getParent());
1072           }
1073 
1074           if (NeedPredication)
1075             collectPoisonGeneratingInstrsInBackwardSlice(
1076                 cast<VPRecipeBase>(AddrDef));
1077         }
1078       }
1079     }
1080   }
1081 }
1082 
1083 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1084     const RecurrenceDescriptor &RdxDesc) {
1085   auto It = ReductionResumeValues.find(&RdxDesc);
1086   assert(It != ReductionResumeValues.end() &&
1087          "Expected to find a resume value for the reduction.");
1088   return It->second;
1089 }
1090 
1091 namespace llvm {
1092 
1093 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1094 // lowered.
1095 enum ScalarEpilogueLowering {
1096 
1097   // The default: allowing scalar epilogues.
1098   CM_ScalarEpilogueAllowed,
1099 
1100   // Vectorization with OptForSize: don't allow epilogues.
1101   CM_ScalarEpilogueNotAllowedOptSize,
1102 
1103   // A special case of vectorisation with OptForSize: loops with a very small
1104   // trip count are considered for vectorization under OptForSize, thereby
1105   // making sure the cost of their loop body is dominant, free of runtime
1106   // guards and scalar iteration overheads.
1107   CM_ScalarEpilogueNotAllowedLowTripLoop,
1108 
1109   // Loop hint predicate indicating an epilogue is undesired.
1110   CM_ScalarEpilogueNotNeededUsePredicate,
1111 
1112   // Directive indicating we must either tail fold or not vectorize
1113   CM_ScalarEpilogueNotAllowedUsePredicate
1114 };
1115 
1116 /// ElementCountComparator creates a total ordering for ElementCount
1117 /// for the purposes of using it in a set structure.
1118 struct ElementCountComparator {
1119   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1120     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1121            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1122   }
1123 };
1124 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1125 
1126 /// LoopVectorizationCostModel - estimates the expected speedups due to
1127 /// vectorization.
1128 /// In many cases vectorization is not profitable. This can happen because of
1129 /// a number of reasons. In this class we mainly attempt to predict the
1130 /// expected speedup/slowdowns due to the supported instruction set. We use the
1131 /// TargetTransformInfo to query the different backends for the cost of
1132 /// different operations.
1133 class LoopVectorizationCostModel {
1134 public:
1135   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1136                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1137                              LoopVectorizationLegality *Legal,
1138                              const TargetTransformInfo &TTI,
1139                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1140                              AssumptionCache *AC,
1141                              OptimizationRemarkEmitter *ORE, const Function *F,
1142                              const LoopVectorizeHints *Hints,
1143                              InterleavedAccessInfo &IAI)
1144       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1145         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1146         Hints(Hints), InterleaveInfo(IAI) {}
1147 
1148   /// \return An upper bound for the vectorization factors (both fixed and
1149   /// scalable). If the factors are 0, vectorization and interleaving should be
1150   /// avoided up front.
1151   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1152 
1153   /// \return True if runtime checks are required for vectorization, and false
1154   /// otherwise.
1155   bool runtimeChecksRequired();
1156 
1157   /// \return The most profitable vectorization factor and the cost of that VF.
1158   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1159   /// then this vectorization factor will be selected if vectorization is
1160   /// possible.
1161   VectorizationFactor
1162   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1163 
1164   VectorizationFactor
1165   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1166                                     const LoopVectorizationPlanner &LVP);
1167 
1168   /// Setup cost-based decisions for user vectorization factor.
1169   /// \return true if the UserVF is a feasible VF to be chosen.
1170   bool selectUserVectorizationFactor(ElementCount UserVF) {
1171     collectUniformsAndScalars(UserVF);
1172     collectInstsToScalarize(UserVF);
1173     return expectedCost(UserVF).first.isValid();
1174   }
1175 
1176   /// \return The size (in bits) of the smallest and widest types in the code
1177   /// that needs to be vectorized. We ignore values that remain scalar such as
1178   /// 64 bit loop indices.
1179   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1180 
1181   /// \return The desired interleave count.
1182   /// If interleave count has been specified by metadata it will be returned.
1183   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1184   /// are the selected vectorization factor and the cost of the selected VF.
1185   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1186 
1187   /// Memory access instruction may be vectorized in more than one way.
1188   /// Form of instruction after vectorization depends on cost.
1189   /// This function takes cost-based decisions for Load/Store instructions
1190   /// and collects them in a map. This decisions map is used for building
1191   /// the lists of loop-uniform and loop-scalar instructions.
1192   /// The calculated cost is saved with widening decision in order to
1193   /// avoid redundant calculations.
1194   void setCostBasedWideningDecision(ElementCount VF);
1195 
1196   /// A struct that represents some properties of the register usage
1197   /// of a loop.
1198   struct RegisterUsage {
1199     /// Holds the number of loop invariant values that are used in the loop.
1200     /// The key is ClassID of target-provided register class.
1201     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1202     /// Holds the maximum number of concurrent live intervals in the loop.
1203     /// The key is ClassID of target-provided register class.
1204     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1205   };
1206 
1207   /// \return Returns information about the register usages of the loop for the
1208   /// given vectorization factors.
1209   SmallVector<RegisterUsage, 8>
1210   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1211 
1212   /// Collect values we want to ignore in the cost model.
1213   void collectValuesToIgnore();
1214 
1215   /// Collect all element types in the loop for which widening is needed.
1216   void collectElementTypesForWidening();
1217 
1218   /// Split reductions into those that happen in the loop, and those that happen
1219   /// outside. In loop reductions are collected into InLoopReductionChains.
1220   void collectInLoopReductions();
1221 
1222   /// Returns true if we should use strict in-order reductions for the given
1223   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1224   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1225   /// of FP operations.
1226   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1227     return !Hints->allowReordering() && RdxDesc.isOrdered();
1228   }
1229 
1230   /// \returns The smallest bitwidth each instruction can be represented with.
1231   /// The vector equivalents of these instructions should be truncated to this
1232   /// type.
1233   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1234     return MinBWs;
1235   }
1236 
1237   /// \returns True if it is more profitable to scalarize instruction \p I for
1238   /// vectorization factor \p VF.
1239   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1240     assert(VF.isVector() &&
1241            "Profitable to scalarize relevant only for VF > 1.");
1242 
1243     // Cost model is not run in the VPlan-native path - return conservative
1244     // result until this changes.
1245     if (EnableVPlanNativePath)
1246       return false;
1247 
1248     auto Scalars = InstsToScalarize.find(VF);
1249     assert(Scalars != InstsToScalarize.end() &&
1250            "VF not yet analyzed for scalarization profitability");
1251     return Scalars->second.find(I) != Scalars->second.end();
1252   }
1253 
1254   /// Returns true if \p I is known to be uniform after vectorization.
1255   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1256     if (VF.isScalar())
1257       return true;
1258 
1259     // Cost model is not run in the VPlan-native path - return conservative
1260     // result until this changes.
1261     if (EnableVPlanNativePath)
1262       return false;
1263 
1264     auto UniformsPerVF = Uniforms.find(VF);
1265     assert(UniformsPerVF != Uniforms.end() &&
1266            "VF not yet analyzed for uniformity");
1267     return UniformsPerVF->second.count(I);
1268   }
1269 
1270   /// Returns true if \p I is known to be scalar after vectorization.
1271   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1272     if (VF.isScalar())
1273       return true;
1274 
1275     // Cost model is not run in the VPlan-native path - return conservative
1276     // result until this changes.
1277     if (EnableVPlanNativePath)
1278       return false;
1279 
1280     auto ScalarsPerVF = Scalars.find(VF);
1281     assert(ScalarsPerVF != Scalars.end() &&
1282            "Scalar values are not calculated for VF");
1283     return ScalarsPerVF->second.count(I);
1284   }
1285 
1286   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1287   /// for vectorization factor \p VF.
1288   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1289     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1290            !isProfitableToScalarize(I, VF) &&
1291            !isScalarAfterVectorization(I, VF);
1292   }
1293 
1294   /// Decision that was taken during cost calculation for memory instruction.
1295   enum InstWidening {
1296     CM_Unknown,
1297     CM_Widen,         // For consecutive accesses with stride +1.
1298     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1299     CM_Interleave,
1300     CM_GatherScatter,
1301     CM_Scalarize
1302   };
1303 
1304   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1305   /// instruction \p I and vector width \p VF.
1306   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1307                            InstructionCost Cost) {
1308     assert(VF.isVector() && "Expected VF >=2");
1309     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1310   }
1311 
1312   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1313   /// interleaving group \p Grp and vector width \p VF.
1314   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1315                            ElementCount VF, InstWidening W,
1316                            InstructionCost Cost) {
1317     assert(VF.isVector() && "Expected VF >=2");
1318     /// Broadcast this decicion to all instructions inside the group.
1319     /// But the cost will be assigned to one instruction only.
1320     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1321       if (auto *I = Grp->getMember(i)) {
1322         if (Grp->getInsertPos() == I)
1323           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1324         else
1325           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1326       }
1327     }
1328   }
1329 
1330   /// Return the cost model decision for the given instruction \p I and vector
1331   /// width \p VF. Return CM_Unknown if this instruction did not pass
1332   /// through the cost modeling.
1333   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1334     assert(VF.isVector() && "Expected VF to be a vector VF");
1335     // Cost model is not run in the VPlan-native path - return conservative
1336     // result until this changes.
1337     if (EnableVPlanNativePath)
1338       return CM_GatherScatter;
1339 
1340     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1341     auto Itr = WideningDecisions.find(InstOnVF);
1342     if (Itr == WideningDecisions.end())
1343       return CM_Unknown;
1344     return Itr->second.first;
1345   }
1346 
1347   /// Return the vectorization cost for the given instruction \p I and vector
1348   /// width \p VF.
1349   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1350     assert(VF.isVector() && "Expected VF >=2");
1351     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1352     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1353            "The cost is not calculated");
1354     return WideningDecisions[InstOnVF].second;
1355   }
1356 
1357   /// Return True if instruction \p I is an optimizable truncate whose operand
1358   /// is an induction variable. Such a truncate will be removed by adding a new
1359   /// induction variable with the destination type.
1360   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1361     // If the instruction is not a truncate, return false.
1362     auto *Trunc = dyn_cast<TruncInst>(I);
1363     if (!Trunc)
1364       return false;
1365 
1366     // Get the source and destination types of the truncate.
1367     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1368     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1369 
1370     // If the truncate is free for the given types, return false. Replacing a
1371     // free truncate with an induction variable would add an induction variable
1372     // update instruction to each iteration of the loop. We exclude from this
1373     // check the primary induction variable since it will need an update
1374     // instruction regardless.
1375     Value *Op = Trunc->getOperand(0);
1376     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1377       return false;
1378 
1379     // If the truncated value is not an induction variable, return false.
1380     return Legal->isInductionPhi(Op);
1381   }
1382 
1383   /// Collects the instructions to scalarize for each predicated instruction in
1384   /// the loop.
1385   void collectInstsToScalarize(ElementCount VF);
1386 
1387   /// Collect Uniform and Scalar values for the given \p VF.
1388   /// The sets depend on CM decision for Load/Store instructions
1389   /// that may be vectorized as interleave, gather-scatter or scalarized.
1390   void collectUniformsAndScalars(ElementCount VF) {
1391     // Do the analysis once.
1392     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1393       return;
1394     setCostBasedWideningDecision(VF);
1395     collectLoopUniforms(VF);
1396     collectLoopScalars(VF);
1397   }
1398 
1399   /// Returns true if the target machine supports masked store operation
1400   /// for the given \p DataType and kind of access to \p Ptr.
1401   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1402     return Legal->isConsecutivePtr(DataType, Ptr) &&
1403            TTI.isLegalMaskedStore(DataType, Alignment);
1404   }
1405 
1406   /// Returns true if the target machine supports masked load operation
1407   /// for the given \p DataType and kind of access to \p Ptr.
1408   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1409     return Legal->isConsecutivePtr(DataType, Ptr) &&
1410            TTI.isLegalMaskedLoad(DataType, Alignment);
1411   }
1412 
1413   /// Returns true if the target machine can represent \p V as a masked gather
1414   /// or scatter operation.
1415   bool isLegalGatherOrScatter(Value *V,
1416                               ElementCount VF = ElementCount::getFixed(1)) {
1417     bool LI = isa<LoadInst>(V);
1418     bool SI = isa<StoreInst>(V);
1419     if (!LI && !SI)
1420       return false;
1421     auto *Ty = getLoadStoreType(V);
1422     Align Align = getLoadStoreAlignment(V);
1423     if (VF.isVector())
1424       Ty = VectorType::get(Ty, VF);
1425     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1426            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1427   }
1428 
1429   /// Returns true if the target machine supports all of the reduction
1430   /// variables found for the given VF.
1431   bool canVectorizeReductions(ElementCount VF) const {
1432     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1433       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1434       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1435     }));
1436   }
1437 
1438   /// Returns true if \p I is an instruction that will be scalarized with
1439   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1440   /// instructions include conditional stores and instructions that may divide
1441   /// by zero.
1442   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1443 
1444   // Returns true if \p I is an instruction that will be predicated either
1445   // through scalar predication or masked load/store or masked gather/scatter.
1446   // \p VF is the vectorization factor that will be used to vectorize \p I.
1447   // Superset of instructions that return true for isScalarWithPredication.
1448   bool isPredicatedInst(Instruction *I, ElementCount VF) {
1449     // When we know the load's address is loop invariant and the instruction
1450     // in the original scalar loop was unconditionally executed then we
1451     // don't need to mark it as a predicated instruction. Tail folding may
1452     // introduce additional predication, but we're guaranteed to always have
1453     // at least one active lane.  We call Legal->blockNeedsPredication here
1454     // because it doesn't query tail-folding.
1455     if (Legal->isUniformMemOp(*I) && isa<LoadInst>(I) &&
1456         !Legal->blockNeedsPredication(I->getParent()))
1457       return false;
1458     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1459       return false;
1460     // Loads and stores that need some form of masked operation are predicated
1461     // instructions.
1462     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1463       return Legal->isMaskRequired(I);
1464     return isScalarWithPredication(I, VF);
1465   }
1466 
1467   /// Returns true if \p I is a memory instruction with consecutive memory
1468   /// access that can be widened.
1469   bool
1470   memoryInstructionCanBeWidened(Instruction *I,
1471                                 ElementCount VF = ElementCount::getFixed(1));
1472 
1473   /// Returns true if \p I is a memory instruction in an interleaved-group
1474   /// of memory accesses that can be vectorized with wide vector loads/stores
1475   /// and shuffles.
1476   bool
1477   interleavedAccessCanBeWidened(Instruction *I,
1478                                 ElementCount VF = ElementCount::getFixed(1));
1479 
1480   /// Check if \p Instr belongs to any interleaved access group.
1481   bool isAccessInterleaved(Instruction *Instr) {
1482     return InterleaveInfo.isInterleaved(Instr);
1483   }
1484 
1485   /// Get the interleaved access group that \p Instr belongs to.
1486   const InterleaveGroup<Instruction> *
1487   getInterleavedAccessGroup(Instruction *Instr) {
1488     return InterleaveInfo.getInterleaveGroup(Instr);
1489   }
1490 
1491   /// Returns true if we're required to use a scalar epilogue for at least
1492   /// the final iteration of the original loop.
1493   bool requiresScalarEpilogue(ElementCount VF) const {
1494     if (!isScalarEpilogueAllowed())
1495       return false;
1496     // If we might exit from anywhere but the latch, must run the exiting
1497     // iteration in scalar form.
1498     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1499       return true;
1500     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1501   }
1502 
1503   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1504   /// loop hint annotation.
1505   bool isScalarEpilogueAllowed() const {
1506     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1507   }
1508 
1509   /// Returns true if all loop blocks should be masked to fold tail loop.
1510   bool foldTailByMasking() const { return FoldTailByMasking; }
1511 
1512   /// Returns true if were tail-folding and want to use the active lane mask
1513   /// for vector loop control flow.
1514   bool useActiveLaneMaskForControlFlow() const {
1515     return FoldTailByMasking &&
1516            TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1517   }
1518 
1519   /// Returns true if the instructions in this block requires predication
1520   /// for any reason, e.g. because tail folding now requires a predicate
1521   /// or because the block in the original loop was predicated.
1522   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1523     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1524   }
1525 
1526   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1527   /// nodes to the chain of instructions representing the reductions. Uses a
1528   /// MapVector to ensure deterministic iteration order.
1529   using ReductionChainMap =
1530       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1531 
1532   /// Return the chain of instructions representing an inloop reduction.
1533   const ReductionChainMap &getInLoopReductionChains() const {
1534     return InLoopReductionChains;
1535   }
1536 
1537   /// Returns true if the Phi is part of an inloop reduction.
1538   bool isInLoopReduction(PHINode *Phi) const {
1539     return InLoopReductionChains.count(Phi);
1540   }
1541 
1542   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1543   /// with factor VF.  Return the cost of the instruction, including
1544   /// scalarization overhead if it's needed.
1545   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1546 
1547   /// Estimate cost of a call instruction CI if it were vectorized with factor
1548   /// VF. Return the cost of the instruction, including scalarization overhead
1549   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1550   /// scalarized -
1551   /// i.e. either vector version isn't available, or is too expensive.
1552   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1553                                     bool &NeedToScalarize) const;
1554 
1555   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1556   /// that of B.
1557   bool isMoreProfitable(const VectorizationFactor &A,
1558                         const VectorizationFactor &B) const;
1559 
1560   /// Invalidates decisions already taken by the cost model.
1561   void invalidateCostModelingDecisions() {
1562     WideningDecisions.clear();
1563     Uniforms.clear();
1564     Scalars.clear();
1565   }
1566 
1567   /// Convenience function that returns the value of vscale_range iff
1568   /// vscale_range.min == vscale_range.max or otherwise returns the value
1569   /// returned by the corresponding TLI method.
1570   Optional<unsigned> getVScaleForTuning() const;
1571 
1572 private:
1573   unsigned NumPredStores = 0;
1574 
1575   /// \return An upper bound for the vectorization factors for both
1576   /// fixed and scalable vectorization, where the minimum-known number of
1577   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1578   /// disabled or unsupported, then the scalable part will be equal to
1579   /// ElementCount::getScalable(0).
1580   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1581                                            ElementCount UserVF,
1582                                            bool FoldTailByMasking);
1583 
1584   /// \return the maximized element count based on the targets vector
1585   /// registers and the loop trip-count, but limited to a maximum safe VF.
1586   /// This is a helper function of computeFeasibleMaxVF.
1587   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1588                                        unsigned SmallestType,
1589                                        unsigned WidestType,
1590                                        ElementCount MaxSafeVF,
1591                                        bool FoldTailByMasking);
1592 
1593   /// \return the maximum legal scalable VF, based on the safe max number
1594   /// of elements.
1595   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1596 
1597   /// The vectorization cost is a combination of the cost itself and a boolean
1598   /// indicating whether any of the contributing operations will actually
1599   /// operate on vector values after type legalization in the backend. If this
1600   /// latter value is false, then all operations will be scalarized (i.e. no
1601   /// vectorization has actually taken place).
1602   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1603 
1604   /// Returns the expected execution cost. The unit of the cost does
1605   /// not matter because we use the 'cost' units to compare different
1606   /// vector widths. The cost that is returned is *not* normalized by
1607   /// the factor width. If \p Invalid is not nullptr, this function
1608   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1609   /// each instruction that has an Invalid cost for the given VF.
1610   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1611   VectorizationCostTy
1612   expectedCost(ElementCount VF,
1613                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1614 
1615   /// Returns the execution time cost of an instruction for a given vector
1616   /// width. Vector width of one means scalar.
1617   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1618 
1619   /// The cost-computation logic from getInstructionCost which provides
1620   /// the vector type as an output parameter.
1621   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1622                                      Type *&VectorTy);
1623 
1624   /// Return the cost of instructions in an inloop reduction pattern, if I is
1625   /// part of that pattern.
1626   Optional<InstructionCost>
1627   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1628                           TTI::TargetCostKind CostKind);
1629 
1630   /// Calculate vectorization cost of memory instruction \p I.
1631   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1632 
1633   /// The cost computation for scalarized memory instruction.
1634   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1635 
1636   /// The cost computation for interleaving group of memory instructions.
1637   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1638 
1639   /// The cost computation for Gather/Scatter instruction.
1640   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1641 
1642   /// The cost computation for widening instruction \p I with consecutive
1643   /// memory access.
1644   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1645 
1646   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1647   /// Load: scalar load + broadcast.
1648   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1649   /// element)
1650   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1651 
1652   /// Estimate the overhead of scalarizing an instruction. This is a
1653   /// convenience wrapper for the type-based getScalarizationOverhead API.
1654   InstructionCost getScalarizationOverhead(Instruction *I,
1655                                            ElementCount VF) const;
1656 
1657   /// Returns true if an artificially high cost for emulated masked memrefs
1658   /// should be used.
1659   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1660 
1661   /// Map of scalar integer values to the smallest bitwidth they can be legally
1662   /// represented as. The vector equivalents of these values should be truncated
1663   /// to this type.
1664   MapVector<Instruction *, uint64_t> MinBWs;
1665 
1666   /// A type representing the costs for instructions if they were to be
1667   /// scalarized rather than vectorized. The entries are Instruction-Cost
1668   /// pairs.
1669   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1670 
1671   /// A set containing all BasicBlocks that are known to present after
1672   /// vectorization as a predicated block.
1673   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1674       PredicatedBBsAfterVectorization;
1675 
1676   /// Records whether it is allowed to have the original scalar loop execute at
1677   /// least once. This may be needed as a fallback loop in case runtime
1678   /// aliasing/dependence checks fail, or to handle the tail/remainder
1679   /// iterations when the trip count is unknown or doesn't divide by the VF,
1680   /// or as a peel-loop to handle gaps in interleave-groups.
1681   /// Under optsize and when the trip count is very small we don't allow any
1682   /// iterations to execute in the scalar loop.
1683   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1684 
1685   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1686   bool FoldTailByMasking = false;
1687 
1688   /// A map holding scalar costs for different vectorization factors. The
1689   /// presence of a cost for an instruction in the mapping indicates that the
1690   /// instruction will be scalarized when vectorizing with the associated
1691   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1692   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1693 
1694   /// Holds the instructions known to be uniform after vectorization.
1695   /// The data is collected per VF.
1696   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1697 
1698   /// Holds the instructions known to be scalar after vectorization.
1699   /// The data is collected per VF.
1700   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1701 
1702   /// Holds the instructions (address computations) that are forced to be
1703   /// scalarized.
1704   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1705 
1706   /// PHINodes of the reductions that should be expanded in-loop along with
1707   /// their associated chains of reduction operations, in program order from top
1708   /// (PHI) to bottom
1709   ReductionChainMap InLoopReductionChains;
1710 
1711   /// A Map of inloop reduction operations and their immediate chain operand.
1712   /// FIXME: This can be removed once reductions can be costed correctly in
1713   /// vplan. This was added to allow quick lookup to the inloop operations,
1714   /// without having to loop through InLoopReductionChains.
1715   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1716 
1717   /// Returns the expected difference in cost from scalarizing the expression
1718   /// feeding a predicated instruction \p PredInst. The instructions to
1719   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1720   /// non-negative return value implies the expression will be scalarized.
1721   /// Currently, only single-use chains are considered for scalarization.
1722   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1723                               ElementCount VF);
1724 
1725   /// Collect the instructions that are uniform after vectorization. An
1726   /// instruction is uniform if we represent it with a single scalar value in
1727   /// the vectorized loop corresponding to each vector iteration. Examples of
1728   /// uniform instructions include pointer operands of consecutive or
1729   /// interleaved memory accesses. Note that although uniformity implies an
1730   /// instruction will be scalar, the reverse is not true. In general, a
1731   /// scalarized instruction will be represented by VF scalar values in the
1732   /// vectorized loop, each corresponding to an iteration of the original
1733   /// scalar loop.
1734   void collectLoopUniforms(ElementCount VF);
1735 
1736   /// Collect the instructions that are scalar after vectorization. An
1737   /// instruction is scalar if it is known to be uniform or will be scalarized
1738   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1739   /// to the list if they are used by a load/store instruction that is marked as
1740   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1741   /// VF values in the vectorized loop, each corresponding to an iteration of
1742   /// the original scalar loop.
1743   void collectLoopScalars(ElementCount VF);
1744 
1745   /// Keeps cost model vectorization decision and cost for instructions.
1746   /// Right now it is used for memory instructions only.
1747   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1748                                 std::pair<InstWidening, InstructionCost>>;
1749 
1750   DecisionList WideningDecisions;
1751 
1752   /// Returns true if \p V is expected to be vectorized and it needs to be
1753   /// extracted.
1754   bool needsExtract(Value *V, ElementCount VF) const {
1755     Instruction *I = dyn_cast<Instruction>(V);
1756     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1757         TheLoop->isLoopInvariant(I))
1758       return false;
1759 
1760     // Assume we can vectorize V (and hence we need extraction) if the
1761     // scalars are not computed yet. This can happen, because it is called
1762     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1763     // the scalars are collected. That should be a safe assumption in most
1764     // cases, because we check if the operands have vectorizable types
1765     // beforehand in LoopVectorizationLegality.
1766     return Scalars.find(VF) == Scalars.end() ||
1767            !isScalarAfterVectorization(I, VF);
1768   };
1769 
1770   /// Returns a range containing only operands needing to be extracted.
1771   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1772                                                    ElementCount VF) const {
1773     return SmallVector<Value *, 4>(make_filter_range(
1774         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1775   }
1776 
1777   /// Determines if we have the infrastructure to vectorize loop \p L and its
1778   /// epilogue, assuming the main loop is vectorized by \p VF.
1779   bool isCandidateForEpilogueVectorization(const Loop &L,
1780                                            const ElementCount VF) const;
1781 
1782   /// Returns true if epilogue vectorization is considered profitable, and
1783   /// false otherwise.
1784   /// \p VF is the vectorization factor chosen for the original loop.
1785   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1786 
1787 public:
1788   /// The loop that we evaluate.
1789   Loop *TheLoop;
1790 
1791   /// Predicated scalar evolution analysis.
1792   PredicatedScalarEvolution &PSE;
1793 
1794   /// Loop Info analysis.
1795   LoopInfo *LI;
1796 
1797   /// Vectorization legality.
1798   LoopVectorizationLegality *Legal;
1799 
1800   /// Vector target information.
1801   const TargetTransformInfo &TTI;
1802 
1803   /// Target Library Info.
1804   const TargetLibraryInfo *TLI;
1805 
1806   /// Demanded bits analysis.
1807   DemandedBits *DB;
1808 
1809   /// Assumption cache.
1810   AssumptionCache *AC;
1811 
1812   /// Interface to emit optimization remarks.
1813   OptimizationRemarkEmitter *ORE;
1814 
1815   const Function *TheFunction;
1816 
1817   /// Loop Vectorize Hint.
1818   const LoopVectorizeHints *Hints;
1819 
1820   /// The interleave access information contains groups of interleaved accesses
1821   /// with the same stride and close to each other.
1822   InterleavedAccessInfo &InterleaveInfo;
1823 
1824   /// Values to ignore in the cost model.
1825   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1826 
1827   /// Values to ignore in the cost model when VF > 1.
1828   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1829 
1830   /// All element types found in the loop.
1831   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1832 
1833   /// Profitable vector factors.
1834   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1835 };
1836 } // end namespace llvm
1837 
1838 /// Helper struct to manage generating runtime checks for vectorization.
1839 ///
1840 /// The runtime checks are created up-front in temporary blocks to allow better
1841 /// estimating the cost and un-linked from the existing IR. After deciding to
1842 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1843 /// temporary blocks are completely removed.
1844 class GeneratedRTChecks {
1845   /// Basic block which contains the generated SCEV checks, if any.
1846   BasicBlock *SCEVCheckBlock = nullptr;
1847 
1848   /// The value representing the result of the generated SCEV checks. If it is
1849   /// nullptr, either no SCEV checks have been generated or they have been used.
1850   Value *SCEVCheckCond = nullptr;
1851 
1852   /// Basic block which contains the generated memory runtime checks, if any.
1853   BasicBlock *MemCheckBlock = nullptr;
1854 
1855   /// The value representing the result of the generated memory runtime checks.
1856   /// If it is nullptr, either no memory runtime checks have been generated or
1857   /// they have been used.
1858   Value *MemRuntimeCheckCond = nullptr;
1859 
1860   DominatorTree *DT;
1861   LoopInfo *LI;
1862   TargetTransformInfo *TTI;
1863 
1864   SCEVExpander SCEVExp;
1865   SCEVExpander MemCheckExp;
1866 
1867   bool CostTooHigh = false;
1868 
1869 public:
1870   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1871                     TargetTransformInfo *TTI, const DataLayout &DL)
1872       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1873         MemCheckExp(SE, DL, "scev.check") {}
1874 
1875   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1876   /// accurately estimate the cost of the runtime checks. The blocks are
1877   /// un-linked from the IR and is added back during vector code generation. If
1878   /// there is no vector code generation, the check blocks are removed
1879   /// completely.
1880   void Create(Loop *L, const LoopAccessInfo &LAI,
1881               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1882 
1883     // Hard cutoff to limit compile-time increase in case a very large number of
1884     // runtime checks needs to be generated.
1885     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1886     // profile info.
1887     CostTooHigh =
1888         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1889     if (CostTooHigh)
1890       return;
1891 
1892     BasicBlock *LoopHeader = L->getHeader();
1893     BasicBlock *Preheader = L->getLoopPreheader();
1894 
1895     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1896     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1897     // may be used by SCEVExpander. The blocks will be un-linked from their
1898     // predecessors and removed from LI & DT at the end of the function.
1899     if (!UnionPred.isAlwaysTrue()) {
1900       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1901                                   nullptr, "vector.scevcheck");
1902 
1903       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1904           &UnionPred, SCEVCheckBlock->getTerminator());
1905     }
1906 
1907     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1908     if (RtPtrChecking.Need) {
1909       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1910       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1911                                  "vector.memcheck");
1912 
1913       auto DiffChecks = RtPtrChecking.getDiffChecks();
1914       if (DiffChecks) {
1915         Value *RuntimeVF = nullptr;
1916         MemRuntimeCheckCond = addDiffRuntimeChecks(
1917             MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
1918             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1919               if (!RuntimeVF)
1920                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1921               return RuntimeVF;
1922             },
1923             IC);
1924       } else {
1925         MemRuntimeCheckCond =
1926             addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1927                              RtPtrChecking.getChecks(), MemCheckExp);
1928       }
1929       assert(MemRuntimeCheckCond &&
1930              "no RT checks generated although RtPtrChecking "
1931              "claimed checks are required");
1932     }
1933 
1934     if (!MemCheckBlock && !SCEVCheckBlock)
1935       return;
1936 
1937     // Unhook the temporary block with the checks, update various places
1938     // accordingly.
1939     if (SCEVCheckBlock)
1940       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1941     if (MemCheckBlock)
1942       MemCheckBlock->replaceAllUsesWith(Preheader);
1943 
1944     if (SCEVCheckBlock) {
1945       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1946       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1947       Preheader->getTerminator()->eraseFromParent();
1948     }
1949     if (MemCheckBlock) {
1950       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1951       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1952       Preheader->getTerminator()->eraseFromParent();
1953     }
1954 
1955     DT->changeImmediateDominator(LoopHeader, Preheader);
1956     if (MemCheckBlock) {
1957       DT->eraseNode(MemCheckBlock);
1958       LI->removeBlock(MemCheckBlock);
1959     }
1960     if (SCEVCheckBlock) {
1961       DT->eraseNode(SCEVCheckBlock);
1962       LI->removeBlock(SCEVCheckBlock);
1963     }
1964   }
1965 
1966   InstructionCost getCost() {
1967     if (SCEVCheckBlock || MemCheckBlock)
1968       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1969 
1970     if (CostTooHigh) {
1971       InstructionCost Cost;
1972       Cost.setInvalid();
1973       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1974       return Cost;
1975     }
1976 
1977     InstructionCost RTCheckCost = 0;
1978     if (SCEVCheckBlock)
1979       for (Instruction &I : *SCEVCheckBlock) {
1980         if (SCEVCheckBlock->getTerminator() == &I)
1981           continue;
1982         InstructionCost C =
1983             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1984         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1985         RTCheckCost += C;
1986       }
1987     if (MemCheckBlock)
1988       for (Instruction &I : *MemCheckBlock) {
1989         if (MemCheckBlock->getTerminator() == &I)
1990           continue;
1991         InstructionCost C =
1992             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1993         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1994         RTCheckCost += C;
1995       }
1996 
1997     if (SCEVCheckBlock || MemCheckBlock)
1998       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1999                         << "\n");
2000 
2001     return RTCheckCost;
2002   }
2003 
2004   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2005   /// unused.
2006   ~GeneratedRTChecks() {
2007     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2008     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2009     if (!SCEVCheckCond)
2010       SCEVCleaner.markResultUsed();
2011 
2012     if (!MemRuntimeCheckCond)
2013       MemCheckCleaner.markResultUsed();
2014 
2015     if (MemRuntimeCheckCond) {
2016       auto &SE = *MemCheckExp.getSE();
2017       // Memory runtime check generation creates compares that use expanded
2018       // values. Remove them before running the SCEVExpanderCleaners.
2019       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2020         if (MemCheckExp.isInsertedInstruction(&I))
2021           continue;
2022         SE.forgetValue(&I);
2023         I.eraseFromParent();
2024       }
2025     }
2026     MemCheckCleaner.cleanup();
2027     SCEVCleaner.cleanup();
2028 
2029     if (SCEVCheckCond)
2030       SCEVCheckBlock->eraseFromParent();
2031     if (MemRuntimeCheckCond)
2032       MemCheckBlock->eraseFromParent();
2033   }
2034 
2035   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2036   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2037   /// depending on the generated condition.
2038   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2039                              BasicBlock *LoopVectorPreHeader,
2040                              BasicBlock *LoopExitBlock) {
2041     if (!SCEVCheckCond)
2042       return nullptr;
2043 
2044     Value *Cond = SCEVCheckCond;
2045     // Mark the check as used, to prevent it from being removed during cleanup.
2046     SCEVCheckCond = nullptr;
2047     if (auto *C = dyn_cast<ConstantInt>(Cond))
2048       if (C->isZero())
2049         return nullptr;
2050 
2051     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2052 
2053     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2054     // Create new preheader for vector loop.
2055     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2056       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2057 
2058     SCEVCheckBlock->getTerminator()->eraseFromParent();
2059     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2060     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2061                                                 SCEVCheckBlock);
2062 
2063     DT->addNewBlock(SCEVCheckBlock, Pred);
2064     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2065 
2066     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2067                         BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2068     return SCEVCheckBlock;
2069   }
2070 
2071   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2072   /// the branches to branch to the vector preheader or \p Bypass, depending on
2073   /// the generated condition.
2074   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2075                                    BasicBlock *LoopVectorPreHeader) {
2076     // Check if we generated code that checks in runtime if arrays overlap.
2077     if (!MemRuntimeCheckCond)
2078       return nullptr;
2079 
2080     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2081     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2082                                                 MemCheckBlock);
2083 
2084     DT->addNewBlock(MemCheckBlock, Pred);
2085     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2086     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2087 
2088     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2089       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2090 
2091     ReplaceInstWithInst(
2092         MemCheckBlock->getTerminator(),
2093         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2094     MemCheckBlock->getTerminator()->setDebugLoc(
2095         Pred->getTerminator()->getDebugLoc());
2096 
2097     // Mark the check as used, to prevent it from being removed during cleanup.
2098     MemRuntimeCheckCond = nullptr;
2099     return MemCheckBlock;
2100   }
2101 };
2102 
2103 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2104 // vectorization. The loop needs to be annotated with #pragma omp simd
2105 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2106 // vector length information is not provided, vectorization is not considered
2107 // explicit. Interleave hints are not allowed either. These limitations will be
2108 // relaxed in the future.
2109 // Please, note that we are currently forced to abuse the pragma 'clang
2110 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2111 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2112 // provides *explicit vectorization hints* (LV can bypass legal checks and
2113 // assume that vectorization is legal). However, both hints are implemented
2114 // using the same metadata (llvm.loop.vectorize, processed by
2115 // LoopVectorizeHints). This will be fixed in the future when the native IR
2116 // representation for pragma 'omp simd' is introduced.
2117 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2118                                    OptimizationRemarkEmitter *ORE) {
2119   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2120   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2121 
2122   // Only outer loops with an explicit vectorization hint are supported.
2123   // Unannotated outer loops are ignored.
2124   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2125     return false;
2126 
2127   Function *Fn = OuterLp->getHeader()->getParent();
2128   if (!Hints.allowVectorization(Fn, OuterLp,
2129                                 true /*VectorizeOnlyWhenForced*/)) {
2130     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2131     return false;
2132   }
2133 
2134   if (Hints.getInterleave() > 1) {
2135     // TODO: Interleave support is future work.
2136     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2137                          "outer loops.\n");
2138     Hints.emitRemarkWithHints();
2139     return false;
2140   }
2141 
2142   return true;
2143 }
2144 
2145 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2146                                   OptimizationRemarkEmitter *ORE,
2147                                   SmallVectorImpl<Loop *> &V) {
2148   // Collect inner loops and outer loops without irreducible control flow. For
2149   // now, only collect outer loops that have explicit vectorization hints. If we
2150   // are stress testing the VPlan H-CFG construction, we collect the outermost
2151   // loop of every loop nest.
2152   if (L.isInnermost() || VPlanBuildStressTest ||
2153       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2154     LoopBlocksRPO RPOT(&L);
2155     RPOT.perform(LI);
2156     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2157       V.push_back(&L);
2158       // TODO: Collect inner loops inside marked outer loops in case
2159       // vectorization fails for the outer loop. Do not invoke
2160       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2161       // already known to be reducible. We can use an inherited attribute for
2162       // that.
2163       return;
2164     }
2165   }
2166   for (Loop *InnerL : L)
2167     collectSupportedLoops(*InnerL, LI, ORE, V);
2168 }
2169 
2170 namespace {
2171 
2172 /// The LoopVectorize Pass.
2173 struct LoopVectorize : public FunctionPass {
2174   /// Pass identification, replacement for typeid
2175   static char ID;
2176 
2177   LoopVectorizePass Impl;
2178 
2179   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2180                          bool VectorizeOnlyWhenForced = false)
2181       : FunctionPass(ID),
2182         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2183     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2184   }
2185 
2186   bool runOnFunction(Function &F) override {
2187     if (skipFunction(F))
2188       return false;
2189 
2190     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2191     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2192     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2193     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2194     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2195     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2196     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2197     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2198     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2199     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2200     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2201     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2202     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2203 
2204     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2205         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2206 
2207     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2208                         GetLAA, *ORE, PSI).MadeAnyChange;
2209   }
2210 
2211   void getAnalysisUsage(AnalysisUsage &AU) const override {
2212     AU.addRequired<AssumptionCacheTracker>();
2213     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2214     AU.addRequired<DominatorTreeWrapperPass>();
2215     AU.addRequired<LoopInfoWrapperPass>();
2216     AU.addRequired<ScalarEvolutionWrapperPass>();
2217     AU.addRequired<TargetTransformInfoWrapperPass>();
2218     AU.addRequired<AAResultsWrapperPass>();
2219     AU.addRequired<LoopAccessLegacyAnalysis>();
2220     AU.addRequired<DemandedBitsWrapperPass>();
2221     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2222     AU.addRequired<InjectTLIMappingsLegacy>();
2223 
2224     // We currently do not preserve loopinfo/dominator analyses with outer loop
2225     // vectorization. Until this is addressed, mark these analyses as preserved
2226     // only for non-VPlan-native path.
2227     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2228     if (!EnableVPlanNativePath) {
2229       AU.addPreserved<LoopInfoWrapperPass>();
2230       AU.addPreserved<DominatorTreeWrapperPass>();
2231     }
2232 
2233     AU.addPreserved<BasicAAWrapperPass>();
2234     AU.addPreserved<GlobalsAAWrapperPass>();
2235     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2236   }
2237 };
2238 
2239 } // end anonymous namespace
2240 
2241 //===----------------------------------------------------------------------===//
2242 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2243 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2244 //===----------------------------------------------------------------------===//
2245 
2246 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2247   // We need to place the broadcast of invariant variables outside the loop,
2248   // but only if it's proven safe to do so. Else, broadcast will be inside
2249   // vector loop body.
2250   Instruction *Instr = dyn_cast<Instruction>(V);
2251   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2252                      (!Instr ||
2253                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2254   // Place the code for broadcasting invariant variables in the new preheader.
2255   IRBuilder<>::InsertPointGuard Guard(Builder);
2256   if (SafeToHoist)
2257     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2258 
2259   // Broadcast the scalar into all locations in the vector.
2260   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2261 
2262   return Shuf;
2263 }
2264 
2265 /// This function adds
2266 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2267 /// to each vector element of Val. The sequence starts at StartIndex.
2268 /// \p Opcode is relevant for FP induction variable.
2269 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2270                             Instruction::BinaryOps BinOp, ElementCount VF,
2271                             IRBuilderBase &Builder) {
2272   assert(VF.isVector() && "only vector VFs are supported");
2273 
2274   // Create and check the types.
2275   auto *ValVTy = cast<VectorType>(Val->getType());
2276   ElementCount VLen = ValVTy->getElementCount();
2277 
2278   Type *STy = Val->getType()->getScalarType();
2279   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2280          "Induction Step must be an integer or FP");
2281   assert(Step->getType() == STy && "Step has wrong type");
2282 
2283   SmallVector<Constant *, 8> Indices;
2284 
2285   // Create a vector of consecutive numbers from zero to VF.
2286   VectorType *InitVecValVTy = ValVTy;
2287   if (STy->isFloatingPointTy()) {
2288     Type *InitVecValSTy =
2289         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2290     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2291   }
2292   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2293 
2294   // Splat the StartIdx
2295   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2296 
2297   if (STy->isIntegerTy()) {
2298     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2299     Step = Builder.CreateVectorSplat(VLen, Step);
2300     assert(Step->getType() == Val->getType() && "Invalid step vec");
2301     // FIXME: The newly created binary instructions should contain nsw/nuw
2302     // flags, which can be found from the original scalar operations.
2303     Step = Builder.CreateMul(InitVec, Step);
2304     return Builder.CreateAdd(Val, Step, "induction");
2305   }
2306 
2307   // Floating point induction.
2308   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2309          "Binary Opcode should be specified for FP induction");
2310   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2311   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2312 
2313   Step = Builder.CreateVectorSplat(VLen, Step);
2314   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2315   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2316 }
2317 
2318 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2319 /// variable on which to base the steps, \p Step is the size of the step.
2320 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2321                              const InductionDescriptor &ID, VPValue *Def,
2322                              VPTransformState &State) {
2323   IRBuilderBase &Builder = State.Builder;
2324   // We shouldn't have to build scalar steps if we aren't vectorizing.
2325   assert(State.VF.isVector() && "VF should be greater than one");
2326   // Get the value type and ensure it and the step have the same integer type.
2327   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2328   assert(ScalarIVTy == Step->getType() &&
2329          "Val and Step should have the same type");
2330 
2331   // We build scalar steps for both integer and floating-point induction
2332   // variables. Here, we determine the kind of arithmetic we will perform.
2333   Instruction::BinaryOps AddOp;
2334   Instruction::BinaryOps MulOp;
2335   if (ScalarIVTy->isIntegerTy()) {
2336     AddOp = Instruction::Add;
2337     MulOp = Instruction::Mul;
2338   } else {
2339     AddOp = ID.getInductionOpcode();
2340     MulOp = Instruction::FMul;
2341   }
2342 
2343   // Determine the number of scalars we need to generate for each unroll
2344   // iteration.
2345   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2346   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2347   // Compute the scalar steps and save the results in State.
2348   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2349                                      ScalarIVTy->getScalarSizeInBits());
2350   Type *VecIVTy = nullptr;
2351   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2352   if (!FirstLaneOnly && State.VF.isScalable()) {
2353     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2354     UnitStepVec =
2355         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2356     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2357     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2358   }
2359 
2360   for (unsigned Part = 0; Part < State.UF; ++Part) {
2361     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2362 
2363     if (!FirstLaneOnly && State.VF.isScalable()) {
2364       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2365       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2366       if (ScalarIVTy->isFloatingPointTy())
2367         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2368       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2369       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2370       State.set(Def, Add, Part);
2371       // It's useful to record the lane values too for the known minimum number
2372       // of elements so we do those below. This improves the code quality when
2373       // trying to extract the first element, for example.
2374     }
2375 
2376     if (ScalarIVTy->isFloatingPointTy())
2377       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2378 
2379     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2380       Value *StartIdx = Builder.CreateBinOp(
2381           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2382       // The step returned by `createStepForVF` is a runtime-evaluated value
2383       // when VF is scalable. Otherwise, it should be folded into a Constant.
2384       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2385              "Expected StartIdx to be folded to a constant when VF is not "
2386              "scalable");
2387       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2388       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2389       State.set(Def, Add, VPIteration(Part, Lane));
2390     }
2391   }
2392 }
2393 
2394 // Generate code for the induction step. Note that induction steps are
2395 // required to be loop-invariant
2396 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2397                               Instruction *InsertBefore,
2398                               Loop *OrigLoop = nullptr) {
2399   const DataLayout &DL = SE.getDataLayout();
2400   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2401          "Induction step should be loop invariant");
2402   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2403     return E->getValue();
2404 
2405   SCEVExpander Exp(SE, DL, "induction");
2406   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2407 }
2408 
2409 /// Compute the transformed value of Index at offset StartValue using step
2410 /// StepValue.
2411 /// For integer induction, returns StartValue + Index * StepValue.
2412 /// For pointer induction, returns StartValue[Index * StepValue].
2413 /// FIXME: The newly created binary instructions should contain nsw/nuw
2414 /// flags, which can be found from the original scalar operations.
2415 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2416                                    Value *StartValue, Value *Step,
2417                                    const InductionDescriptor &ID) {
2418   assert(Index->getType()->getScalarType() == Step->getType() &&
2419          "Index scalar type does not match StepValue type");
2420 
2421   // Note: the IR at this point is broken. We cannot use SE to create any new
2422   // SCEV and then expand it, hoping that SCEV's simplification will give us
2423   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2424   // lead to various SCEV crashes. So all we can do is to use builder and rely
2425   // on InstCombine for future simplifications. Here we handle some trivial
2426   // cases only.
2427   auto CreateAdd = [&B](Value *X, Value *Y) {
2428     assert(X->getType() == Y->getType() && "Types don't match!");
2429     if (auto *CX = dyn_cast<ConstantInt>(X))
2430       if (CX->isZero())
2431         return Y;
2432     if (auto *CY = dyn_cast<ConstantInt>(Y))
2433       if (CY->isZero())
2434         return X;
2435     return B.CreateAdd(X, Y);
2436   };
2437 
2438   // We allow X to be a vector type, in which case Y will potentially be
2439   // splatted into a vector with the same element count.
2440   auto CreateMul = [&B](Value *X, Value *Y) {
2441     assert(X->getType()->getScalarType() == Y->getType() &&
2442            "Types don't match!");
2443     if (auto *CX = dyn_cast<ConstantInt>(X))
2444       if (CX->isOne())
2445         return Y;
2446     if (auto *CY = dyn_cast<ConstantInt>(Y))
2447       if (CY->isOne())
2448         return X;
2449     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2450     if (XVTy && !isa<VectorType>(Y->getType()))
2451       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2452     return B.CreateMul(X, Y);
2453   };
2454 
2455   switch (ID.getKind()) {
2456   case InductionDescriptor::IK_IntInduction: {
2457     assert(!isa<VectorType>(Index->getType()) &&
2458            "Vector indices not supported for integer inductions yet");
2459     assert(Index->getType() == StartValue->getType() &&
2460            "Index type does not match StartValue type");
2461     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2462       return B.CreateSub(StartValue, Index);
2463     auto *Offset = CreateMul(Index, Step);
2464     return CreateAdd(StartValue, Offset);
2465   }
2466   case InductionDescriptor::IK_PtrInduction: {
2467     assert(isa<Constant>(Step) &&
2468            "Expected constant step for pointer induction");
2469     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2470   }
2471   case InductionDescriptor::IK_FpInduction: {
2472     assert(!isa<VectorType>(Index->getType()) &&
2473            "Vector indices not supported for FP inductions yet");
2474     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2475     auto InductionBinOp = ID.getInductionBinOp();
2476     assert(InductionBinOp &&
2477            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2478             InductionBinOp->getOpcode() == Instruction::FSub) &&
2479            "Original bin op should be defined for FP induction");
2480 
2481     Value *MulExp = B.CreateFMul(Step, Index);
2482     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2483                          "induction");
2484   }
2485   case InductionDescriptor::IK_NoInduction:
2486     return nullptr;
2487   }
2488   llvm_unreachable("invalid enum");
2489 }
2490 
2491 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2492                                                     const VPIteration &Instance,
2493                                                     VPTransformState &State) {
2494   Value *ScalarInst = State.get(Def, Instance);
2495   Value *VectorValue = State.get(Def, Instance.Part);
2496   VectorValue = Builder.CreateInsertElement(
2497       VectorValue, ScalarInst,
2498       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2499   State.set(Def, VectorValue, Instance.Part);
2500 }
2501 
2502 // Return whether we allow using masked interleave-groups (for dealing with
2503 // strided loads/stores that reside in predicated blocks, or for dealing
2504 // with gaps).
2505 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2506   // If an override option has been passed in for interleaved accesses, use it.
2507   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2508     return EnableMaskedInterleavedMemAccesses;
2509 
2510   return TTI.enableMaskedInterleavedAccessVectorization();
2511 }
2512 
2513 // Try to vectorize the interleave group that \p Instr belongs to.
2514 //
2515 // E.g. Translate following interleaved load group (factor = 3):
2516 //   for (i = 0; i < N; i+=3) {
2517 //     R = Pic[i];             // Member of index 0
2518 //     G = Pic[i+1];           // Member of index 1
2519 //     B = Pic[i+2];           // Member of index 2
2520 //     ... // do something to R, G, B
2521 //   }
2522 // To:
2523 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2524 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2525 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2526 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2527 //
2528 // Or translate following interleaved store group (factor = 3):
2529 //   for (i = 0; i < N; i+=3) {
2530 //     ... do something to R, G, B
2531 //     Pic[i]   = R;           // Member of index 0
2532 //     Pic[i+1] = G;           // Member of index 1
2533 //     Pic[i+2] = B;           // Member of index 2
2534 //   }
2535 // To:
2536 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2537 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2538 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2539 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2540 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2541 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2542     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2543     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2544     VPValue *BlockInMask) {
2545   Instruction *Instr = Group->getInsertPos();
2546   const DataLayout &DL = Instr->getModule()->getDataLayout();
2547 
2548   // Prepare for the vector type of the interleaved load/store.
2549   Type *ScalarTy = getLoadStoreType(Instr);
2550   unsigned InterleaveFactor = Group->getFactor();
2551   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2552   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2553 
2554   // Prepare for the new pointers.
2555   SmallVector<Value *, 2> AddrParts;
2556   unsigned Index = Group->getIndex(Instr);
2557 
2558   // TODO: extend the masked interleaved-group support to reversed access.
2559   assert((!BlockInMask || !Group->isReverse()) &&
2560          "Reversed masked interleave-group not supported.");
2561 
2562   // If the group is reverse, adjust the index to refer to the last vector lane
2563   // instead of the first. We adjust the index from the first vector lane,
2564   // rather than directly getting the pointer for lane VF - 1, because the
2565   // pointer operand of the interleaved access is supposed to be uniform. For
2566   // uniform instructions, we're only required to generate a value for the
2567   // first vector lane in each unroll iteration.
2568   if (Group->isReverse())
2569     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2570 
2571   for (unsigned Part = 0; Part < UF; Part++) {
2572     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2573     State.setDebugLocFromInst(AddrPart);
2574 
2575     // Notice current instruction could be any index. Need to adjust the address
2576     // to the member of index 0.
2577     //
2578     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2579     //       b = A[i];       // Member of index 0
2580     // Current pointer is pointed to A[i+1], adjust it to A[i].
2581     //
2582     // E.g.  A[i+1] = a;     // Member of index 1
2583     //       A[i]   = b;     // Member of index 0
2584     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2585     // Current pointer is pointed to A[i+2], adjust it to A[i].
2586 
2587     bool InBounds = false;
2588     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2589       InBounds = gep->isInBounds();
2590     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2591     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2592 
2593     // Cast to the vector pointer type.
2594     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2595     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2596     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2597   }
2598 
2599   State.setDebugLocFromInst(Instr);
2600   Value *PoisonVec = PoisonValue::get(VecTy);
2601 
2602   Value *MaskForGaps = nullptr;
2603   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2604     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2605     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2606   }
2607 
2608   // Vectorize the interleaved load group.
2609   if (isa<LoadInst>(Instr)) {
2610     // For each unroll part, create a wide load for the group.
2611     SmallVector<Value *, 2> NewLoads;
2612     for (unsigned Part = 0; Part < UF; Part++) {
2613       Instruction *NewLoad;
2614       if (BlockInMask || MaskForGaps) {
2615         assert(useMaskedInterleavedAccesses(*TTI) &&
2616                "masked interleaved groups are not allowed.");
2617         Value *GroupMask = MaskForGaps;
2618         if (BlockInMask) {
2619           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2620           Value *ShuffledMask = Builder.CreateShuffleVector(
2621               BlockInMaskPart,
2622               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2623               "interleaved.mask");
2624           GroupMask = MaskForGaps
2625                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2626                                                 MaskForGaps)
2627                           : ShuffledMask;
2628         }
2629         NewLoad =
2630             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2631                                      GroupMask, PoisonVec, "wide.masked.vec");
2632       }
2633       else
2634         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2635                                             Group->getAlign(), "wide.vec");
2636       Group->addMetadata(NewLoad);
2637       NewLoads.push_back(NewLoad);
2638     }
2639 
2640     // For each member in the group, shuffle out the appropriate data from the
2641     // wide loads.
2642     unsigned J = 0;
2643     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2644       Instruction *Member = Group->getMember(I);
2645 
2646       // Skip the gaps in the group.
2647       if (!Member)
2648         continue;
2649 
2650       auto StrideMask =
2651           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2652       for (unsigned Part = 0; Part < UF; Part++) {
2653         Value *StridedVec = Builder.CreateShuffleVector(
2654             NewLoads[Part], StrideMask, "strided.vec");
2655 
2656         // If this member has different type, cast the result type.
2657         if (Member->getType() != ScalarTy) {
2658           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2659           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2660           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2661         }
2662 
2663         if (Group->isReverse())
2664           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2665 
2666         State.set(VPDefs[J], StridedVec, Part);
2667       }
2668       ++J;
2669     }
2670     return;
2671   }
2672 
2673   // The sub vector type for current instruction.
2674   auto *SubVT = VectorType::get(ScalarTy, VF);
2675 
2676   // Vectorize the interleaved store group.
2677   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2678   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2679          "masked interleaved groups are not allowed.");
2680   assert((!MaskForGaps || !VF.isScalable()) &&
2681          "masking gaps for scalable vectors is not yet supported.");
2682   for (unsigned Part = 0; Part < UF; Part++) {
2683     // Collect the stored vector from each member.
2684     SmallVector<Value *, 4> StoredVecs;
2685     for (unsigned i = 0; i < InterleaveFactor; i++) {
2686       assert((Group->getMember(i) || MaskForGaps) &&
2687              "Fail to get a member from an interleaved store group");
2688       Instruction *Member = Group->getMember(i);
2689 
2690       // Skip the gaps in the group.
2691       if (!Member) {
2692         Value *Undef = PoisonValue::get(SubVT);
2693         StoredVecs.push_back(Undef);
2694         continue;
2695       }
2696 
2697       Value *StoredVec = State.get(StoredValues[i], Part);
2698 
2699       if (Group->isReverse())
2700         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2701 
2702       // If this member has different type, cast it to a unified type.
2703 
2704       if (StoredVec->getType() != SubVT)
2705         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2706 
2707       StoredVecs.push_back(StoredVec);
2708     }
2709 
2710     // Concatenate all vectors into a wide vector.
2711     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2712 
2713     // Interleave the elements in the wide vector.
2714     Value *IVec = Builder.CreateShuffleVector(
2715         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2716         "interleaved.vec");
2717 
2718     Instruction *NewStoreInstr;
2719     if (BlockInMask || MaskForGaps) {
2720       Value *GroupMask = MaskForGaps;
2721       if (BlockInMask) {
2722         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2723         Value *ShuffledMask = Builder.CreateShuffleVector(
2724             BlockInMaskPart,
2725             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2726             "interleaved.mask");
2727         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2728                                                       ShuffledMask, MaskForGaps)
2729                                 : ShuffledMask;
2730       }
2731       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2732                                                 Group->getAlign(), GroupMask);
2733     } else
2734       NewStoreInstr =
2735           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2736 
2737     Group->addMetadata(NewStoreInstr);
2738   }
2739 }
2740 
2741 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2742                                                VPReplicateRecipe *RepRecipe,
2743                                                const VPIteration &Instance,
2744                                                bool IfPredicateInstr,
2745                                                VPTransformState &State) {
2746   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2747 
2748   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2749   // the first lane and part.
2750   if (isa<NoAliasScopeDeclInst>(Instr))
2751     if (!Instance.isFirstIteration())
2752       return;
2753 
2754   // Does this instruction return a value ?
2755   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2756 
2757   Instruction *Cloned = Instr->clone();
2758   if (!IsVoidRetTy)
2759     Cloned->setName(Instr->getName() + ".cloned");
2760 
2761   // If the scalarized instruction contributes to the address computation of a
2762   // widen masked load/store which was in a basic block that needed predication
2763   // and is not predicated after vectorization, we can't propagate
2764   // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2765   // instruction could feed a poison value to the base address of the widen
2766   // load/store.
2767   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2768     Cloned->dropPoisonGeneratingFlags();
2769 
2770   if (Instr->getDebugLoc())
2771     State.setDebugLocFromInst(Instr);
2772 
2773   // Replace the operands of the cloned instructions with their scalar
2774   // equivalents in the new loop.
2775   for (auto &I : enumerate(RepRecipe->operands())) {
2776     auto InputInstance = Instance;
2777     VPValue *Operand = I.value();
2778     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2779     if (OperandR && OperandR->isUniform())
2780       InputInstance.Lane = VPLane::getFirstLane();
2781     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2782   }
2783   State.addNewMetadata(Cloned, Instr);
2784 
2785   // Place the cloned scalar in the new loop.
2786   State.Builder.Insert(Cloned);
2787 
2788   State.set(RepRecipe, Cloned, Instance);
2789 
2790   // If we just cloned a new assumption, add it the assumption cache.
2791   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2792     AC->registerAssumption(II);
2793 
2794   // End if-block.
2795   if (IfPredicateInstr)
2796     PredicatedInstructions.push_back(Cloned);
2797 }
2798 
2799 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2800   if (TripCount)
2801     return TripCount;
2802 
2803   assert(InsertBlock);
2804   IRBuilder<> Builder(InsertBlock->getTerminator());
2805   // Find the loop boundaries.
2806   ScalarEvolution *SE = PSE.getSE();
2807   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2808   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2809          "Invalid loop count");
2810 
2811   Type *IdxTy = Legal->getWidestInductionType();
2812   assert(IdxTy && "No type for induction");
2813 
2814   // The exit count might have the type of i64 while the phi is i32. This can
2815   // happen if we have an induction variable that is sign extended before the
2816   // compare. The only way that we get a backedge taken count is that the
2817   // induction variable was signed and as such will not overflow. In such a case
2818   // truncation is legal.
2819   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2820       IdxTy->getPrimitiveSizeInBits())
2821     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2822   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2823 
2824   // Get the total trip count from the count by adding 1.
2825   const SCEV *ExitCount = SE->getAddExpr(
2826       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2827 
2828   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2829 
2830   // Expand the trip count and place the new instructions in the preheader.
2831   // Notice that the pre-header does not change, only the loop body.
2832   SCEVExpander Exp(*SE, DL, "induction");
2833 
2834   // Count holds the overall loop count (N).
2835   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2836                                 InsertBlock->getTerminator());
2837 
2838   if (TripCount->getType()->isPointerTy())
2839     TripCount =
2840         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2841                                     InsertBlock->getTerminator());
2842 
2843   return TripCount;
2844 }
2845 
2846 Value *
2847 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2848   if (VectorTripCount)
2849     return VectorTripCount;
2850 
2851   Value *TC = getOrCreateTripCount(InsertBlock);
2852   IRBuilder<> Builder(InsertBlock->getTerminator());
2853 
2854   Type *Ty = TC->getType();
2855   // This is where we can make the step a runtime constant.
2856   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2857 
2858   // If the tail is to be folded by masking, round the number of iterations N
2859   // up to a multiple of Step instead of rounding down. This is done by first
2860   // adding Step-1 and then rounding down. Note that it's ok if this addition
2861   // overflows: the vector induction variable will eventually wrap to zero given
2862   // that it starts at zero and its Step is a power of two; the loop will then
2863   // exit, with the last early-exit vector comparison also producing all-true.
2864   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2865   // is accounted for in emitIterationCountCheck that adds an overflow check.
2866   if (Cost->foldTailByMasking()) {
2867     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2868            "VF*UF must be a power of 2 when folding tail by masking");
2869     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2870     TC = Builder.CreateAdd(
2871         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2872   }
2873 
2874   // Now we need to generate the expression for the part of the loop that the
2875   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2876   // iterations are not required for correctness, or N - Step, otherwise. Step
2877   // is equal to the vectorization factor (number of SIMD elements) times the
2878   // unroll factor (number of SIMD instructions).
2879   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2880 
2881   // There are cases where we *must* run at least one iteration in the remainder
2882   // loop.  See the cost model for when this can happen.  If the step evenly
2883   // divides the trip count, we set the remainder to be equal to the step. If
2884   // the step does not evenly divide the trip count, no adjustment is necessary
2885   // since there will already be scalar iterations. Note that the minimum
2886   // iterations check ensures that N >= Step.
2887   if (Cost->requiresScalarEpilogue(VF)) {
2888     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2889     R = Builder.CreateSelect(IsZero, Step, R);
2890   }
2891 
2892   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2893 
2894   return VectorTripCount;
2895 }
2896 
2897 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2898                                                    const DataLayout &DL) {
2899   // Verify that V is a vector type with same number of elements as DstVTy.
2900   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2901   unsigned VF = DstFVTy->getNumElements();
2902   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2903   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2904   Type *SrcElemTy = SrcVecTy->getElementType();
2905   Type *DstElemTy = DstFVTy->getElementType();
2906   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2907          "Vector elements must have same size");
2908 
2909   // Do a direct cast if element types are castable.
2910   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2911     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2912   }
2913   // V cannot be directly casted to desired vector type.
2914   // May happen when V is a floating point vector but DstVTy is a vector of
2915   // pointers or vice-versa. Handle this using a two-step bitcast using an
2916   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2917   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2918          "Only one type should be a pointer type");
2919   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2920          "Only one type should be a floating point type");
2921   Type *IntTy =
2922       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2923   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2924   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2925   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2926 }
2927 
2928 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2929   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2930   // Reuse existing vector loop preheader for TC checks.
2931   // Note that new preheader block is generated for vector loop.
2932   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2933   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2934 
2935   // Generate code to check if the loop's trip count is less than VF * UF, or
2936   // equal to it in case a scalar epilogue is required; this implies that the
2937   // vector trip count is zero. This check also covers the case where adding one
2938   // to the backedge-taken count overflowed leading to an incorrect trip count
2939   // of zero. In this case we will also jump to the scalar loop.
2940   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2941                                             : ICmpInst::ICMP_ULT;
2942 
2943   // If tail is to be folded, vector loop takes care of all iterations.
2944   Type *CountTy = Count->getType();
2945   Value *CheckMinIters = Builder.getFalse();
2946   auto CreateStep = [&]() -> Value * {
2947     // Create step with max(MinProTripCount, UF * VF).
2948     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2949       return createStepForVF(Builder, CountTy, VF, UF);
2950 
2951     Value *MinProfTC =
2952         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2953     if (!VF.isScalable())
2954       return MinProfTC;
2955     return Builder.CreateBinaryIntrinsic(
2956         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2957   };
2958 
2959   if (!Cost->foldTailByMasking())
2960     CheckMinIters =
2961         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2962   else if (VF.isScalable()) {
2963     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2964     // an overflow to zero when updating induction variables and so an
2965     // additional overflow check is required before entering the vector loop.
2966 
2967     // Get the maximum unsigned value for the type.
2968     Value *MaxUIntTripCount =
2969         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2970     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2971 
2972     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2973     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2974   }
2975 
2976   // Create new preheader for vector loop.
2977   LoopVectorPreHeader =
2978       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2979                  "vector.ph");
2980 
2981   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2982                                DT->getNode(Bypass)->getIDom()) &&
2983          "TC check is expected to dominate Bypass");
2984 
2985   // Update dominator for Bypass & LoopExit (if needed).
2986   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2987   if (!Cost->requiresScalarEpilogue(VF))
2988     // If there is an epilogue which must run, there's no edge from the
2989     // middle block to exit blocks  and thus no need to update the immediate
2990     // dominator of the exit blocks.
2991     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2992 
2993   ReplaceInstWithInst(
2994       TCCheckBlock->getTerminator(),
2995       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2996   LoopBypassBlocks.push_back(TCCheckBlock);
2997 }
2998 
2999 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3000   BasicBlock *const SCEVCheckBlock =
3001       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3002   if (!SCEVCheckBlock)
3003     return nullptr;
3004 
3005   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3006            (OptForSizeBasedOnProfile &&
3007             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3008          "Cannot SCEV check stride or overflow when optimizing for size");
3009 
3010 
3011   // Update dominator only if this is first RT check.
3012   if (LoopBypassBlocks.empty()) {
3013     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3014     if (!Cost->requiresScalarEpilogue(VF))
3015       // If there is an epilogue which must run, there's no edge from the
3016       // middle block to exit blocks  and thus no need to update the immediate
3017       // dominator of the exit blocks.
3018       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3019   }
3020 
3021   LoopBypassBlocks.push_back(SCEVCheckBlock);
3022   AddedSafetyChecks = true;
3023   return SCEVCheckBlock;
3024 }
3025 
3026 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3027   // VPlan-native path does not do any analysis for runtime checks currently.
3028   if (EnableVPlanNativePath)
3029     return nullptr;
3030 
3031   BasicBlock *const MemCheckBlock =
3032       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3033 
3034   // Check if we generated code that checks in runtime if arrays overlap. We put
3035   // the checks into a separate block to make the more common case of few
3036   // elements faster.
3037   if (!MemCheckBlock)
3038     return nullptr;
3039 
3040   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3041     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3042            "Cannot emit memory checks when optimizing for size, unless forced "
3043            "to vectorize.");
3044     ORE->emit([&]() {
3045       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3046                                         OrigLoop->getStartLoc(),
3047                                         OrigLoop->getHeader())
3048              << "Code-size may be reduced by not forcing "
3049                 "vectorization, or by source-code modifications "
3050                 "eliminating the need for runtime checks "
3051                 "(e.g., adding 'restrict').";
3052     });
3053   }
3054 
3055   LoopBypassBlocks.push_back(MemCheckBlock);
3056 
3057   AddedSafetyChecks = true;
3058 
3059   return MemCheckBlock;
3060 }
3061 
3062 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3063   LoopScalarBody = OrigLoop->getHeader();
3064   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3065   assert(LoopVectorPreHeader && "Invalid loop structure");
3066   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3067   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3068          "multiple exit loop without required epilogue?");
3069 
3070   LoopMiddleBlock =
3071       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3072                  LI, nullptr, Twine(Prefix) + "middle.block");
3073   LoopScalarPreHeader =
3074       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3075                  nullptr, Twine(Prefix) + "scalar.ph");
3076 
3077   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3078 
3079   // Set up the middle block terminator.  Two cases:
3080   // 1) If we know that we must execute the scalar epilogue, emit an
3081   //    unconditional branch.
3082   // 2) Otherwise, we must have a single unique exit block (due to how we
3083   //    implement the multiple exit case).  In this case, set up a conditonal
3084   //    branch from the middle block to the loop scalar preheader, and the
3085   //    exit block.  completeLoopSkeleton will update the condition to use an
3086   //    iteration check, if required to decide whether to execute the remainder.
3087   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3088     BranchInst::Create(LoopScalarPreHeader) :
3089     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3090                        Builder.getTrue());
3091   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3092   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3093 
3094   // Update dominator for loop exit. During skeleton creation, only the vector
3095   // pre-header and the middle block are created. The vector loop is entirely
3096   // created during VPlan exection.
3097   if (!Cost->requiresScalarEpilogue(VF))
3098     // If there is an epilogue which must run, there's no edge from the
3099     // middle block to exit blocks  and thus no need to update the immediate
3100     // dominator of the exit blocks.
3101     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3102 }
3103 
3104 void InnerLoopVectorizer::createInductionResumeValues(
3105     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3106   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3107           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3108          "Inconsistent information about additional bypass.");
3109 
3110   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3111   assert(VectorTripCount && "Expected valid arguments");
3112   // We are going to resume the execution of the scalar loop.
3113   // Go over all of the induction variables that we found and fix the
3114   // PHIs that are left in the scalar version of the loop.
3115   // The starting values of PHI nodes depend on the counter of the last
3116   // iteration in the vectorized loop.
3117   // If we come from a bypass edge then we need to start from the original
3118   // start value.
3119   Instruction *OldInduction = Legal->getPrimaryInduction();
3120   for (auto &InductionEntry : Legal->getInductionVars()) {
3121     PHINode *OrigPhi = InductionEntry.first;
3122     InductionDescriptor II = InductionEntry.second;
3123 
3124     Value *&EndValue = IVEndValues[OrigPhi];
3125     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3126     if (OrigPhi == OldInduction) {
3127       // We know what the end value is.
3128       EndValue = VectorTripCount;
3129     } else {
3130       IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3131 
3132       // Fast-math-flags propagate from the original induction instruction.
3133       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3134         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3135 
3136       Type *StepType = II.getStep()->getType();
3137       Instruction::CastOps CastOp =
3138           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3139       Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
3140       Value *Step =
3141           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3142       EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3143       EndValue->setName("ind.end");
3144 
3145       // Compute the end value for the additional bypass (if applicable).
3146       if (AdditionalBypass.first) {
3147         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3148         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3149                                          StepType, true);
3150         Value *Step =
3151             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3152         VTC =
3153             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
3154         EndValueFromAdditionalBypass =
3155             emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3156         EndValueFromAdditionalBypass->setName("ind.end");
3157       }
3158     }
3159 
3160     // Create phi nodes to merge from the  backedge-taken check block.
3161     PHINode *BCResumeVal =
3162         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3163                         LoopScalarPreHeader->getTerminator());
3164     // Copy original phi DL over to the new one.
3165     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3166 
3167     // The new PHI merges the original incoming value, in case of a bypass,
3168     // or the value at the end of the vectorized loop.
3169     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3170 
3171     // Fix the scalar body counter (PHI node).
3172     // The old induction's phi node in the scalar body needs the truncated
3173     // value.
3174     for (BasicBlock *BB : LoopBypassBlocks)
3175       BCResumeVal->addIncoming(II.getStartValue(), BB);
3176 
3177     if (AdditionalBypass.first)
3178       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3179                                             EndValueFromAdditionalBypass);
3180 
3181     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3182   }
3183 }
3184 
3185 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3186   // The trip counts should be cached by now.
3187   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3188   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3189 
3190   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3191 
3192   // Add a check in the middle block to see if we have completed
3193   // all of the iterations in the first vector loop.  Three cases:
3194   // 1) If we require a scalar epilogue, there is no conditional branch as
3195   //    we unconditionally branch to the scalar preheader.  Do nothing.
3196   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3197   //    Thus if tail is to be folded, we know we don't need to run the
3198   //    remainder and we can use the previous value for the condition (true).
3199   // 3) Otherwise, construct a runtime check.
3200   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3201     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3202                                         Count, VectorTripCount, "cmp.n",
3203                                         LoopMiddleBlock->getTerminator());
3204 
3205     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3206     // of the corresponding compare because they may have ended up with
3207     // different line numbers and we want to avoid awkward line stepping while
3208     // debugging. Eg. if the compare has got a line number inside the loop.
3209     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3210     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3211   }
3212 
3213 #ifdef EXPENSIVE_CHECKS
3214   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3215 #endif
3216 
3217   return LoopVectorPreHeader;
3218 }
3219 
3220 std::pair<BasicBlock *, Value *>
3221 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3222   /*
3223    In this function we generate a new loop. The new loop will contain
3224    the vectorized instructions while the old loop will continue to run the
3225    scalar remainder.
3226 
3227        [ ] <-- loop iteration number check.
3228     /   |
3229    /    v
3230   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3231   |  /  |
3232   | /   v
3233   ||   [ ]     <-- vector pre header.
3234   |/    |
3235   |     v
3236   |    [  ] \
3237   |    [  ]_|   <-- vector loop (created during VPlan execution).
3238   |     |
3239   |     v
3240   \   -[ ]   <--- middle-block.
3241    \/   |
3242    /\   v
3243    | ->[ ]     <--- new preheader.
3244    |    |
3245  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3246    |   [ ] \
3247    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3248     \   |
3249      \  v
3250       >[ ]     <-- exit block(s).
3251    ...
3252    */
3253 
3254   // Get the metadata of the original loop before it gets modified.
3255   MDNode *OrigLoopID = OrigLoop->getLoopID();
3256 
3257   // Workaround!  Compute the trip count of the original loop and cache it
3258   // before we start modifying the CFG.  This code has a systemic problem
3259   // wherein it tries to run analysis over partially constructed IR; this is
3260   // wrong, and not simply for SCEV.  The trip count of the original loop
3261   // simply happens to be prone to hitting this in practice.  In theory, we
3262   // can hit the same issue for any SCEV, or ValueTracking query done during
3263   // mutation.  See PR49900.
3264   getOrCreateTripCount(OrigLoop->getLoopPreheader());
3265 
3266   // Create an empty vector loop, and prepare basic blocks for the runtime
3267   // checks.
3268   createVectorLoopSkeleton("");
3269 
3270   // Now, compare the new count to zero. If it is zero skip the vector loop and
3271   // jump to the scalar loop. This check also covers the case where the
3272   // backedge-taken count is uint##_max: adding one to it will overflow leading
3273   // to an incorrect trip count of zero. In this (rare) case we will also jump
3274   // to the scalar loop.
3275   emitIterationCountCheck(LoopScalarPreHeader);
3276 
3277   // Generate the code to check any assumptions that we've made for SCEV
3278   // expressions.
3279   emitSCEVChecks(LoopScalarPreHeader);
3280 
3281   // Generate the code that checks in runtime if arrays overlap. We put the
3282   // checks into a separate block to make the more common case of few elements
3283   // faster.
3284   emitMemRuntimeChecks(LoopScalarPreHeader);
3285 
3286   // Emit phis for the new starting index of the scalar loop.
3287   createInductionResumeValues();
3288 
3289   return {completeLoopSkeleton(OrigLoopID), nullptr};
3290 }
3291 
3292 // Fix up external users of the induction variable. At this point, we are
3293 // in LCSSA form, with all external PHIs that use the IV having one input value,
3294 // coming from the remainder loop. We need those PHIs to also have a correct
3295 // value for the IV when arriving directly from the middle block.
3296 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3297                                        const InductionDescriptor &II,
3298                                        Value *VectorTripCount, Value *EndValue,
3299                                        BasicBlock *MiddleBlock,
3300                                        BasicBlock *VectorHeader, VPlan &Plan) {
3301   // There are two kinds of external IV usages - those that use the value
3302   // computed in the last iteration (the PHI) and those that use the penultimate
3303   // value (the value that feeds into the phi from the loop latch).
3304   // We allow both, but they, obviously, have different values.
3305 
3306   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3307 
3308   DenseMap<Value *, Value *> MissingVals;
3309 
3310   // An external user of the last iteration's value should see the value that
3311   // the remainder loop uses to initialize its own IV.
3312   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3313   for (User *U : PostInc->users()) {
3314     Instruction *UI = cast<Instruction>(U);
3315     if (!OrigLoop->contains(UI)) {
3316       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3317       MissingVals[UI] = EndValue;
3318     }
3319   }
3320 
3321   // An external user of the penultimate value need to see EndValue - Step.
3322   // The simplest way to get this is to recompute it from the constituent SCEVs,
3323   // that is Start + (Step * (CRD - 1)).
3324   for (User *U : OrigPhi->users()) {
3325     auto *UI = cast<Instruction>(U);
3326     if (!OrigLoop->contains(UI)) {
3327       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3328 
3329       IRBuilder<> B(MiddleBlock->getTerminator());
3330 
3331       // Fast-math-flags propagate from the original induction instruction.
3332       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3333         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3334 
3335       Value *CountMinusOne = B.CreateSub(
3336           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3337       Value *CMO =
3338           !II.getStep()->getType()->isIntegerTy()
3339               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3340                              II.getStep()->getType())
3341               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3342       CMO->setName("cast.cmo");
3343 
3344       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3345                                     VectorHeader->getTerminator());
3346       Value *Escape =
3347           emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3348       Escape->setName("ind.escape");
3349       MissingVals[UI] = Escape;
3350     }
3351   }
3352 
3353   for (auto &I : MissingVals) {
3354     PHINode *PHI = cast<PHINode>(I.first);
3355     // One corner case we have to handle is two IVs "chasing" each-other,
3356     // that is %IV2 = phi [...], [ %IV1, %latch ]
3357     // In this case, if IV1 has an external use, we need to avoid adding both
3358     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3359     // don't already have an incoming value for the middle block.
3360     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3361       PHI->addIncoming(I.second, MiddleBlock);
3362       Plan.removeLiveOut(PHI);
3363     }
3364   }
3365 }
3366 
3367 namespace {
3368 
3369 struct CSEDenseMapInfo {
3370   static bool canHandle(const Instruction *I) {
3371     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3372            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3373   }
3374 
3375   static inline Instruction *getEmptyKey() {
3376     return DenseMapInfo<Instruction *>::getEmptyKey();
3377   }
3378 
3379   static inline Instruction *getTombstoneKey() {
3380     return DenseMapInfo<Instruction *>::getTombstoneKey();
3381   }
3382 
3383   static unsigned getHashValue(const Instruction *I) {
3384     assert(canHandle(I) && "Unknown instruction!");
3385     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3386                                                            I->value_op_end()));
3387   }
3388 
3389   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3390     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3391         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3392       return LHS == RHS;
3393     return LHS->isIdenticalTo(RHS);
3394   }
3395 };
3396 
3397 } // end anonymous namespace
3398 
3399 ///Perform cse of induction variable instructions.
3400 static void cse(BasicBlock *BB) {
3401   // Perform simple cse.
3402   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3403   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3404     if (!CSEDenseMapInfo::canHandle(&In))
3405       continue;
3406 
3407     // Check if we can replace this instruction with any of the
3408     // visited instructions.
3409     if (Instruction *V = CSEMap.lookup(&In)) {
3410       In.replaceAllUsesWith(V);
3411       In.eraseFromParent();
3412       continue;
3413     }
3414 
3415     CSEMap[&In] = &In;
3416   }
3417 }
3418 
3419 InstructionCost
3420 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3421                                               bool &NeedToScalarize) const {
3422   Function *F = CI->getCalledFunction();
3423   Type *ScalarRetTy = CI->getType();
3424   SmallVector<Type *, 4> Tys, ScalarTys;
3425   for (auto &ArgOp : CI->args())
3426     ScalarTys.push_back(ArgOp->getType());
3427 
3428   // Estimate cost of scalarized vector call. The source operands are assumed
3429   // to be vectors, so we need to extract individual elements from there,
3430   // execute VF scalar calls, and then gather the result into the vector return
3431   // value.
3432   InstructionCost ScalarCallCost =
3433       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3434   if (VF.isScalar())
3435     return ScalarCallCost;
3436 
3437   // Compute corresponding vector type for return value and arguments.
3438   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3439   for (Type *ScalarTy : ScalarTys)
3440     Tys.push_back(ToVectorTy(ScalarTy, VF));
3441 
3442   // Compute costs of unpacking argument values for the scalar calls and
3443   // packing the return values to a vector.
3444   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3445 
3446   InstructionCost Cost =
3447       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3448 
3449   // If we can't emit a vector call for this function, then the currently found
3450   // cost is the cost we need to return.
3451   NeedToScalarize = true;
3452   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3453   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3454 
3455   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3456     return Cost;
3457 
3458   // If the corresponding vector cost is cheaper, return its cost.
3459   InstructionCost VectorCallCost =
3460       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3461   if (VectorCallCost < Cost) {
3462     NeedToScalarize = false;
3463     Cost = VectorCallCost;
3464   }
3465   return Cost;
3466 }
3467 
3468 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3469   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3470     return Elt;
3471   return VectorType::get(Elt, VF);
3472 }
3473 
3474 InstructionCost
3475 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3476                                                    ElementCount VF) const {
3477   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3478   assert(ID && "Expected intrinsic call!");
3479   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3480   FastMathFlags FMF;
3481   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3482     FMF = FPMO->getFastMathFlags();
3483 
3484   SmallVector<const Value *> Arguments(CI->args());
3485   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3486   SmallVector<Type *> ParamTys;
3487   std::transform(FTy->param_begin(), FTy->param_end(),
3488                  std::back_inserter(ParamTys),
3489                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3490 
3491   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3492                                     dyn_cast<IntrinsicInst>(CI));
3493   return TTI.getIntrinsicInstrCost(CostAttrs,
3494                                    TargetTransformInfo::TCK_RecipThroughput);
3495 }
3496 
3497 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3498   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3499   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3500   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3501 }
3502 
3503 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3504   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3505   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3506   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3507 }
3508 
3509 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3510   // For every instruction `I` in MinBWs, truncate the operands, create a
3511   // truncated version of `I` and reextend its result. InstCombine runs
3512   // later and will remove any ext/trunc pairs.
3513   SmallPtrSet<Value *, 4> Erased;
3514   for (const auto &KV : Cost->getMinimalBitwidths()) {
3515     // If the value wasn't vectorized, we must maintain the original scalar
3516     // type. The absence of the value from State indicates that it
3517     // wasn't vectorized.
3518     // FIXME: Should not rely on getVPValue at this point.
3519     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3520     if (!State.hasAnyVectorValue(Def))
3521       continue;
3522     for (unsigned Part = 0; Part < UF; ++Part) {
3523       Value *I = State.get(Def, Part);
3524       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3525         continue;
3526       Type *OriginalTy = I->getType();
3527       Type *ScalarTruncatedTy =
3528           IntegerType::get(OriginalTy->getContext(), KV.second);
3529       auto *TruncatedTy = VectorType::get(
3530           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3531       if (TruncatedTy == OriginalTy)
3532         continue;
3533 
3534       IRBuilder<> B(cast<Instruction>(I));
3535       auto ShrinkOperand = [&](Value *V) -> Value * {
3536         if (auto *ZI = dyn_cast<ZExtInst>(V))
3537           if (ZI->getSrcTy() == TruncatedTy)
3538             return ZI->getOperand(0);
3539         return B.CreateZExtOrTrunc(V, TruncatedTy);
3540       };
3541 
3542       // The actual instruction modification depends on the instruction type,
3543       // unfortunately.
3544       Value *NewI = nullptr;
3545       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3546         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3547                              ShrinkOperand(BO->getOperand(1)));
3548 
3549         // Any wrapping introduced by shrinking this operation shouldn't be
3550         // considered undefined behavior. So, we can't unconditionally copy
3551         // arithmetic wrapping flags to NewI.
3552         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3553       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3554         NewI =
3555             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3556                          ShrinkOperand(CI->getOperand(1)));
3557       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3558         NewI = B.CreateSelect(SI->getCondition(),
3559                               ShrinkOperand(SI->getTrueValue()),
3560                               ShrinkOperand(SI->getFalseValue()));
3561       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3562         switch (CI->getOpcode()) {
3563         default:
3564           llvm_unreachable("Unhandled cast!");
3565         case Instruction::Trunc:
3566           NewI = ShrinkOperand(CI->getOperand(0));
3567           break;
3568         case Instruction::SExt:
3569           NewI = B.CreateSExtOrTrunc(
3570               CI->getOperand(0),
3571               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3572           break;
3573         case Instruction::ZExt:
3574           NewI = B.CreateZExtOrTrunc(
3575               CI->getOperand(0),
3576               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3577           break;
3578         }
3579       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3580         auto Elements0 =
3581             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3582         auto *O0 = B.CreateZExtOrTrunc(
3583             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3584         auto Elements1 =
3585             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3586         auto *O1 = B.CreateZExtOrTrunc(
3587             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3588 
3589         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3590       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3591         // Don't do anything with the operands, just extend the result.
3592         continue;
3593       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3594         auto Elements =
3595             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3596         auto *O0 = B.CreateZExtOrTrunc(
3597             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3598         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3599         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3600       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3601         auto Elements =
3602             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3603         auto *O0 = B.CreateZExtOrTrunc(
3604             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3605         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3606       } else {
3607         // If we don't know what to do, be conservative and don't do anything.
3608         continue;
3609       }
3610 
3611       // Lastly, extend the result.
3612       NewI->takeName(cast<Instruction>(I));
3613       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3614       I->replaceAllUsesWith(Res);
3615       cast<Instruction>(I)->eraseFromParent();
3616       Erased.insert(I);
3617       State.reset(Def, Res, Part);
3618     }
3619   }
3620 
3621   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3622   for (const auto &KV : Cost->getMinimalBitwidths()) {
3623     // If the value wasn't vectorized, we must maintain the original scalar
3624     // type. The absence of the value from State indicates that it
3625     // wasn't vectorized.
3626     // FIXME: Should not rely on getVPValue at this point.
3627     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3628     if (!State.hasAnyVectorValue(Def))
3629       continue;
3630     for (unsigned Part = 0; Part < UF; ++Part) {
3631       Value *I = State.get(Def, Part);
3632       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3633       if (Inst && Inst->use_empty()) {
3634         Value *NewI = Inst->getOperand(0);
3635         Inst->eraseFromParent();
3636         State.reset(Def, NewI, Part);
3637       }
3638     }
3639   }
3640 }
3641 
3642 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3643                                             VPlan &Plan) {
3644   // Insert truncates and extends for any truncated instructions as hints to
3645   // InstCombine.
3646   if (VF.isVector())
3647     truncateToMinimalBitwidths(State);
3648 
3649   // Fix widened non-induction PHIs by setting up the PHI operands.
3650   if (EnableVPlanNativePath)
3651     fixNonInductionPHIs(Plan, State);
3652 
3653   // At this point every instruction in the original loop is widened to a
3654   // vector form. Now we need to fix the recurrences in the loop. These PHI
3655   // nodes are currently empty because we did not want to introduce cycles.
3656   // This is the second stage of vectorizing recurrences.
3657   fixCrossIterationPHIs(State);
3658 
3659   // Forget the original basic block.
3660   PSE.getSE()->forgetLoop(OrigLoop);
3661 
3662   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3663   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3664   if (Cost->requiresScalarEpilogue(VF)) {
3665     // No edge from the middle block to the unique exit block has been inserted
3666     // and there is nothing to fix from vector loop; phis should have incoming
3667     // from scalar loop only.
3668     Plan.clearLiveOuts();
3669   } else {
3670     // If we inserted an edge from the middle block to the unique exit block,
3671     // update uses outside the loop (phis) to account for the newly inserted
3672     // edge.
3673 
3674     // Fix-up external users of the induction variables.
3675     for (auto &Entry : Legal->getInductionVars())
3676       fixupIVUsers(Entry.first, Entry.second,
3677                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3678                    IVEndValues[Entry.first], LoopMiddleBlock,
3679                    VectorLoop->getHeader(), Plan);
3680   }
3681 
3682   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3683   // in the exit block, so update the builder.
3684   State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3685   for (auto &KV : Plan.getLiveOuts())
3686     KV.second->fixPhi(Plan, State);
3687 
3688   for (Instruction *PI : PredicatedInstructions)
3689     sinkScalarOperands(&*PI);
3690 
3691   // Remove redundant induction instructions.
3692   cse(VectorLoop->getHeader());
3693 
3694   // Set/update profile weights for the vector and remainder loops as original
3695   // loop iterations are now distributed among them. Note that original loop
3696   // represented by LoopScalarBody becomes remainder loop after vectorization.
3697   //
3698   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
3699   // end up getting slightly roughened result but that should be OK since
3700   // profile is not inherently precise anyway. Note also possible bypass of
3701   // vector code caused by legality checks is ignored, assigning all the weight
3702   // to the vector loop, optimistically.
3703   //
3704   // For scalable vectorization we can't know at compile time how many iterations
3705   // of the loop are handled in one vector iteration, so instead assume a pessimistic
3706   // vscale of '1'.
3707   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3708                                LI->getLoopFor(LoopScalarBody),
3709                                VF.getKnownMinValue() * UF);
3710 }
3711 
3712 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3713   // In order to support recurrences we need to be able to vectorize Phi nodes.
3714   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3715   // stage #2: We now need to fix the recurrences by adding incoming edges to
3716   // the currently empty PHI nodes. At this point every instruction in the
3717   // original loop is widened to a vector form so we can use them to construct
3718   // the incoming edges.
3719   VPBasicBlock *Header =
3720       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3721   for (VPRecipeBase &R : Header->phis()) {
3722     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3723       fixReduction(ReductionPhi, State);
3724     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3725       fixFirstOrderRecurrence(FOR, State);
3726   }
3727 }
3728 
3729 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3730     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3731   // This is the second phase of vectorizing first-order recurrences. An
3732   // overview of the transformation is described below. Suppose we have the
3733   // following loop.
3734   //
3735   //   for (int i = 0; i < n; ++i)
3736   //     b[i] = a[i] - a[i - 1];
3737   //
3738   // There is a first-order recurrence on "a". For this loop, the shorthand
3739   // scalar IR looks like:
3740   //
3741   //   scalar.ph:
3742   //     s_init = a[-1]
3743   //     br scalar.body
3744   //
3745   //   scalar.body:
3746   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3747   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3748   //     s2 = a[i]
3749   //     b[i] = s2 - s1
3750   //     br cond, scalar.body, ...
3751   //
3752   // In this example, s1 is a recurrence because it's value depends on the
3753   // previous iteration. In the first phase of vectorization, we created a
3754   // vector phi v1 for s1. We now complete the vectorization and produce the
3755   // shorthand vector IR shown below (for VF = 4, UF = 1).
3756   //
3757   //   vector.ph:
3758   //     v_init = vector(..., ..., ..., a[-1])
3759   //     br vector.body
3760   //
3761   //   vector.body
3762   //     i = phi [0, vector.ph], [i+4, vector.body]
3763   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3764   //     v2 = a[i, i+1, i+2, i+3];
3765   //     v3 = vector(v1(3), v2(0, 1, 2))
3766   //     b[i, i+1, i+2, i+3] = v2 - v3
3767   //     br cond, vector.body, middle.block
3768   //
3769   //   middle.block:
3770   //     x = v2(3)
3771   //     br scalar.ph
3772   //
3773   //   scalar.ph:
3774   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3775   //     br scalar.body
3776   //
3777   // After execution completes the vector loop, we extract the next value of
3778   // the recurrence (x) to use as the initial value in the scalar loop.
3779 
3780   // Extract the last vector element in the middle block. This will be the
3781   // initial value for the recurrence when jumping to the scalar loop.
3782   VPValue *PreviousDef = PhiR->getBackedgeValue();
3783   Value *Incoming = State.get(PreviousDef, UF - 1);
3784   auto *ExtractForScalar = Incoming;
3785   auto *IdxTy = Builder.getInt32Ty();
3786   if (VF.isVector()) {
3787     auto *One = ConstantInt::get(IdxTy, 1);
3788     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3789     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3790     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3791     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3792                                                     "vector.recur.extract");
3793   }
3794   // Extract the second last element in the middle block if the
3795   // Phi is used outside the loop. We need to extract the phi itself
3796   // and not the last element (the phi update in the current iteration). This
3797   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3798   // when the scalar loop is not run at all.
3799   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3800   if (VF.isVector()) {
3801     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3802     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3803     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3804         Incoming, Idx, "vector.recur.extract.for.phi");
3805   } else if (UF > 1)
3806     // When loop is unrolled without vectorizing, initialize
3807     // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
3808     // of `Incoming`. This is analogous to the vectorized case above: extracting
3809     // the second last element when VF > 1.
3810     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3811 
3812   // Fix the initial value of the original recurrence in the scalar loop.
3813   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3814   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3815   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3816   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3817   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3818     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3819     Start->addIncoming(Incoming, BB);
3820   }
3821 
3822   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3823   Phi->setName("scalar.recur");
3824 
3825   // Finally, fix users of the recurrence outside the loop. The users will need
3826   // either the last value of the scalar recurrence or the last value of the
3827   // vector recurrence we extracted in the middle block. Since the loop is in
3828   // LCSSA form, we just need to find all the phi nodes for the original scalar
3829   // recurrence in the exit block, and then add an edge for the middle block.
3830   // Note that LCSSA does not imply single entry when the original scalar loop
3831   // had multiple exiting edges (as we always run the last iteration in the
3832   // scalar epilogue); in that case, there is no edge from middle to exit and
3833   // and thus no phis which needed updated.
3834   if (!Cost->requiresScalarEpilogue(VF))
3835     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3836       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3837         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3838         State.Plan->removeLiveOut(&LCSSAPhi);
3839       }
3840 }
3841 
3842 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3843                                        VPTransformState &State) {
3844   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3845   // Get it's reduction variable descriptor.
3846   assert(Legal->isReductionVariable(OrigPhi) &&
3847          "Unable to find the reduction variable");
3848   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3849 
3850   RecurKind RK = RdxDesc.getRecurrenceKind();
3851   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3852   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3853   State.setDebugLocFromInst(ReductionStartValue);
3854 
3855   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3856   // This is the vector-clone of the value that leaves the loop.
3857   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3858 
3859   // Wrap flags are in general invalid after vectorization, clear them.
3860   clearReductionWrapFlags(PhiR, State);
3861 
3862   // Before each round, move the insertion point right between
3863   // the PHIs and the values we are going to write.
3864   // This allows us to write both PHINodes and the extractelement
3865   // instructions.
3866   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3867 
3868   State.setDebugLocFromInst(LoopExitInst);
3869 
3870   Type *PhiTy = OrigPhi->getType();
3871 
3872   VPBasicBlock *LatchVPBB =
3873       PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3874   BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3875   // If tail is folded by masking, the vector value to leave the loop should be
3876   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3877   // instead of the former. For an inloop reduction the reduction will already
3878   // be predicated, and does not need to be handled here.
3879   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3880     for (unsigned Part = 0; Part < UF; ++Part) {
3881       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3882       SelectInst *Sel = nullptr;
3883       for (User *U : VecLoopExitInst->users()) {
3884         if (isa<SelectInst>(U)) {
3885           assert(!Sel && "Reduction exit feeding two selects");
3886           Sel = cast<SelectInst>(U);
3887         } else
3888           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3889       }
3890       assert(Sel && "Reduction exit feeds no select");
3891       State.reset(LoopExitInstDef, Sel, Part);
3892 
3893       if (isa<FPMathOperator>(Sel))
3894         Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3895 
3896       // If the target can create a predicated operator for the reduction at no
3897       // extra cost in the loop (for example a predicated vadd), it can be
3898       // cheaper for the select to remain in the loop than be sunk out of it,
3899       // and so use the select value for the phi instead of the old
3900       // LoopExitValue.
3901       if (PreferPredicatedReductionSelect ||
3902           TTI->preferPredicatedReductionSelect(
3903               RdxDesc.getOpcode(), PhiTy,
3904               TargetTransformInfo::ReductionFlags())) {
3905         auto *VecRdxPhi =
3906             cast<PHINode>(State.get(PhiR, Part));
3907         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3908       }
3909     }
3910   }
3911 
3912   // If the vector reduction can be performed in a smaller type, we truncate
3913   // then extend the loop exit value to enable InstCombine to evaluate the
3914   // entire expression in the smaller type.
3915   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3916     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3917     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3918     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3919     VectorParts RdxParts(UF);
3920     for (unsigned Part = 0; Part < UF; ++Part) {
3921       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3922       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3923       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3924                                         : Builder.CreateZExt(Trunc, VecTy);
3925       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3926         if (U != Trunc) {
3927           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3928           RdxParts[Part] = Extnd;
3929         }
3930     }
3931     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3932     for (unsigned Part = 0; Part < UF; ++Part) {
3933       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3934       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3935     }
3936   }
3937 
3938   // Reduce all of the unrolled parts into a single vector.
3939   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3940   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3941 
3942   // The middle block terminator has already been assigned a DebugLoc here (the
3943   // OrigLoop's single latch terminator). We want the whole middle block to
3944   // appear to execute on this line because: (a) it is all compiler generated,
3945   // (b) these instructions are always executed after evaluating the latch
3946   // conditional branch, and (c) other passes may add new predecessors which
3947   // terminate on this line. This is the easiest way to ensure we don't
3948   // accidentally cause an extra step back into the loop while debugging.
3949   State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3950   if (PhiR->isOrdered())
3951     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3952   else {
3953     // Floating-point operations should have some FMF to enable the reduction.
3954     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3955     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3956     for (unsigned Part = 1; Part < UF; ++Part) {
3957       Value *RdxPart = State.get(LoopExitInstDef, Part);
3958       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3959         ReducedPartRdx = Builder.CreateBinOp(
3960             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3961       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3962         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3963                                            ReducedPartRdx, RdxPart);
3964       else
3965         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3966     }
3967   }
3968 
3969   // Create the reduction after the loop. Note that inloop reductions create the
3970   // target reduction in the loop using a Reduction recipe.
3971   if (VF.isVector() && !PhiR->isInLoop()) {
3972     ReducedPartRdx =
3973         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3974     // If the reduction can be performed in a smaller type, we need to extend
3975     // the reduction to the wider type before we branch to the original loop.
3976     if (PhiTy != RdxDesc.getRecurrenceType())
3977       ReducedPartRdx = RdxDesc.isSigned()
3978                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3979                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3980   }
3981 
3982   PHINode *ResumePhi =
3983       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3984 
3985   // Create a phi node that merges control-flow from the backedge-taken check
3986   // block and the middle block.
3987   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3988                                         LoopScalarPreHeader->getTerminator());
3989 
3990   // If we are fixing reductions in the epilogue loop then we should already
3991   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
3992   // we carry over the incoming values correctly.
3993   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
3994     if (Incoming == LoopMiddleBlock)
3995       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
3996     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
3997       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
3998                               Incoming);
3999     else
4000       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4001   }
4002 
4003   // Set the resume value for this reduction
4004   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4005 
4006   // If there were stores of the reduction value to a uniform memory address
4007   // inside the loop, create the final store here.
4008   if (StoreInst *SI = RdxDesc.IntermediateStore) {
4009     StoreInst *NewSI =
4010         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4011     propagateMetadata(NewSI, SI);
4012 
4013     // If the reduction value is used in other places,
4014     // then let the code below create PHI's for that.
4015   }
4016 
4017   // Now, we need to fix the users of the reduction variable
4018   // inside and outside of the scalar remainder loop.
4019 
4020   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4021   // in the exit blocks.  See comment on analogous loop in
4022   // fixFirstOrderRecurrence for a more complete explaination of the logic.
4023   if (!Cost->requiresScalarEpilogue(VF))
4024     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4025       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4026         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4027         State.Plan->removeLiveOut(&LCSSAPhi);
4028       }
4029 
4030   // Fix the scalar loop reduction variable with the incoming reduction sum
4031   // from the vector body and from the backedge value.
4032   int IncomingEdgeBlockIdx =
4033       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4034   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4035   // Pick the other block.
4036   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4037   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4038   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4039 }
4040 
4041 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4042                                                   VPTransformState &State) {
4043   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4044   RecurKind RK = RdxDesc.getRecurrenceKind();
4045   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4046     return;
4047 
4048   SmallVector<VPValue *, 8> Worklist;
4049   SmallPtrSet<VPValue *, 8> Visited;
4050   Worklist.push_back(PhiR);
4051   Visited.insert(PhiR);
4052 
4053   while (!Worklist.empty()) {
4054     VPValue *Cur = Worklist.pop_back_val();
4055     for (unsigned Part = 0; Part < UF; ++Part) {
4056       Value *V = State.get(Cur, Part);
4057       if (!isa<OverflowingBinaryOperator>(V))
4058         break;
4059       cast<Instruction>(V)->dropPoisonGeneratingFlags();
4060       }
4061 
4062       for (VPUser *U : Cur->users()) {
4063         auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4064         if (!UserRecipe)
4065           continue;
4066         for (VPValue *V : UserRecipe->definedValues())
4067           if (Visited.insert(V).second)
4068             Worklist.push_back(V);
4069       }
4070   }
4071 }
4072 
4073 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4074   // The basic block and loop containing the predicated instruction.
4075   auto *PredBB = PredInst->getParent();
4076   auto *VectorLoop = LI->getLoopFor(PredBB);
4077 
4078   // Initialize a worklist with the operands of the predicated instruction.
4079   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4080 
4081   // Holds instructions that we need to analyze again. An instruction may be
4082   // reanalyzed if we don't yet know if we can sink it or not.
4083   SmallVector<Instruction *, 8> InstsToReanalyze;
4084 
4085   // Returns true if a given use occurs in the predicated block. Phi nodes use
4086   // their operands in their corresponding predecessor blocks.
4087   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4088     auto *I = cast<Instruction>(U.getUser());
4089     BasicBlock *BB = I->getParent();
4090     if (auto *Phi = dyn_cast<PHINode>(I))
4091       BB = Phi->getIncomingBlock(
4092           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4093     return BB == PredBB;
4094   };
4095 
4096   // Iteratively sink the scalarized operands of the predicated instruction
4097   // into the block we created for it. When an instruction is sunk, it's
4098   // operands are then added to the worklist. The algorithm ends after one pass
4099   // through the worklist doesn't sink a single instruction.
4100   bool Changed;
4101   do {
4102     // Add the instructions that need to be reanalyzed to the worklist, and
4103     // reset the changed indicator.
4104     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4105     InstsToReanalyze.clear();
4106     Changed = false;
4107 
4108     while (!Worklist.empty()) {
4109       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4110 
4111       // We can't sink an instruction if it is a phi node, is not in the loop,
4112       // or may have side effects.
4113       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4114           I->mayHaveSideEffects())
4115         continue;
4116 
4117       // If the instruction is already in PredBB, check if we can sink its
4118       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4119       // sinking the scalar instruction I, hence it appears in PredBB; but it
4120       // may have failed to sink I's operands (recursively), which we try
4121       // (again) here.
4122       if (I->getParent() == PredBB) {
4123         Worklist.insert(I->op_begin(), I->op_end());
4124         continue;
4125       }
4126 
4127       // It's legal to sink the instruction if all its uses occur in the
4128       // predicated block. Otherwise, there's nothing to do yet, and we may
4129       // need to reanalyze the instruction.
4130       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4131         InstsToReanalyze.push_back(I);
4132         continue;
4133       }
4134 
4135       // Move the instruction to the beginning of the predicated block, and add
4136       // it's operands to the worklist.
4137       I->moveBefore(&*PredBB->getFirstInsertionPt());
4138       Worklist.insert(I->op_begin(), I->op_end());
4139 
4140       // The sinking may have enabled other instructions to be sunk, so we will
4141       // need to iterate.
4142       Changed = true;
4143     }
4144   } while (Changed);
4145 }
4146 
4147 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4148                                               VPTransformState &State) {
4149   auto Iter = depth_first(
4150       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
4151   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4152     for (VPRecipeBase &P : VPBB->phis()) {
4153       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4154       if (!VPPhi)
4155         continue;
4156       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4157       // Make sure the builder has a valid insert point.
4158       Builder.SetInsertPoint(NewPhi);
4159       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4160         VPValue *Inc = VPPhi->getIncomingValue(i);
4161         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4162         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4163       }
4164     }
4165   }
4166 }
4167 
4168 bool InnerLoopVectorizer::useOrderedReductions(
4169     const RecurrenceDescriptor &RdxDesc) {
4170   return Cost->useOrderedReductions(RdxDesc);
4171 }
4172 
4173 void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
4174                                                VPUser &ArgOperands,
4175                                                VPTransformState &State) {
4176   assert(!isa<DbgInfoIntrinsic>(CI) &&
4177          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4178   State.setDebugLocFromInst(&CI);
4179 
4180   SmallVector<Type *, 4> Tys;
4181   for (Value *ArgOperand : CI.args())
4182     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4183 
4184   Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
4185 
4186   // The flag shows whether we use Intrinsic or a usual Call for vectorized
4187   // version of the instruction.
4188   // Is it beneficial to perform intrinsic call compared to lib call?
4189   bool NeedToScalarize = false;
4190   InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
4191   InstructionCost IntrinsicCost =
4192       ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
4193   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4194   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4195          "Instruction should be scalarized elsewhere.");
4196   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4197          "Either the intrinsic cost or vector call cost must be valid");
4198 
4199   for (unsigned Part = 0; Part < UF; ++Part) {
4200     SmallVector<Type *, 2> TysForDecl = {CI.getType()};
4201     SmallVector<Value *, 4> Args;
4202     for (auto &I : enumerate(ArgOperands.operands())) {
4203       // Some intrinsics have a scalar argument - don't replace it with a
4204       // vector.
4205       Value *Arg;
4206       if (!UseVectorIntrinsic ||
4207           !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
4208         Arg = State.get(I.value(), Part);
4209       else
4210         Arg = State.get(I.value(), VPIteration(0, 0));
4211       if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
4212         TysForDecl.push_back(Arg->getType());
4213       Args.push_back(Arg);
4214     }
4215 
4216     Function *VectorF;
4217     if (UseVectorIntrinsic) {
4218       // Use vector version of the intrinsic.
4219       if (VF.isVector())
4220         TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
4221       Module *M = State.Builder.GetInsertBlock()->getModule();
4222       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4223       assert(VectorF && "Can't retrieve vector intrinsic.");
4224     } else {
4225       // Use vector version of the function call.
4226       const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/);
4227 #ifndef NDEBUG
4228       assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
4229              "Can't create vector function.");
4230 #endif
4231       VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
4232     }
4233       SmallVector<OperandBundleDef, 1> OpBundles;
4234       CI.getOperandBundlesAsDefs(OpBundles);
4235       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4236 
4237       if (isa<FPMathOperator>(V))
4238         V->copyFastMathFlags(&CI);
4239 
4240       State.set(Def, V, Part);
4241       State.addMetadata(V, &CI);
4242   }
4243 }
4244 
4245 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4246   // We should not collect Scalars more than once per VF. Right now, this
4247   // function is called from collectUniformsAndScalars(), which already does
4248   // this check. Collecting Scalars for VF=1 does not make any sense.
4249   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4250          "This function should not be visited twice for the same VF");
4251 
4252   // This avoids any chances of creating a REPLICATE recipe during planning
4253   // since that would result in generation of scalarized code during execution,
4254   // which is not supported for scalable vectors.
4255   if (VF.isScalable()) {
4256     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4257     return;
4258   }
4259 
4260   SmallSetVector<Instruction *, 8> Worklist;
4261 
4262   // These sets are used to seed the analysis with pointers used by memory
4263   // accesses that will remain scalar.
4264   SmallSetVector<Instruction *, 8> ScalarPtrs;
4265   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4266   auto *Latch = TheLoop->getLoopLatch();
4267 
4268   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4269   // The pointer operands of loads and stores will be scalar as long as the
4270   // memory access is not a gather or scatter operation. The value operand of a
4271   // store will remain scalar if the store is scalarized.
4272   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4273     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4274     assert(WideningDecision != CM_Unknown &&
4275            "Widening decision should be ready at this moment");
4276     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4277       if (Ptr == Store->getValueOperand())
4278         return WideningDecision == CM_Scalarize;
4279     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4280            "Ptr is neither a value or pointer operand");
4281     return WideningDecision != CM_GatherScatter;
4282   };
4283 
4284   // A helper that returns true if the given value is a bitcast or
4285   // getelementptr instruction contained in the loop.
4286   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4287     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4288             isa<GetElementPtrInst>(V)) &&
4289            !TheLoop->isLoopInvariant(V);
4290   };
4291 
4292   // A helper that evaluates a memory access's use of a pointer. If the use will
4293   // be a scalar use and the pointer is only used by memory accesses, we place
4294   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4295   // PossibleNonScalarPtrs.
4296   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4297     // We only care about bitcast and getelementptr instructions contained in
4298     // the loop.
4299     if (!isLoopVaryingBitCastOrGEP(Ptr))
4300       return;
4301 
4302     // If the pointer has already been identified as scalar (e.g., if it was
4303     // also identified as uniform), there's nothing to do.
4304     auto *I = cast<Instruction>(Ptr);
4305     if (Worklist.count(I))
4306       return;
4307 
4308     // If the use of the pointer will be a scalar use, and all users of the
4309     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4310     // place the pointer in PossibleNonScalarPtrs.
4311     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4312           return isa<LoadInst>(U) || isa<StoreInst>(U);
4313         }))
4314       ScalarPtrs.insert(I);
4315     else
4316       PossibleNonScalarPtrs.insert(I);
4317   };
4318 
4319   // We seed the scalars analysis with three classes of instructions: (1)
4320   // instructions marked uniform-after-vectorization and (2) bitcast,
4321   // getelementptr and (pointer) phi instructions used by memory accesses
4322   // requiring a scalar use.
4323   //
4324   // (1) Add to the worklist all instructions that have been identified as
4325   // uniform-after-vectorization.
4326   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4327 
4328   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4329   // memory accesses requiring a scalar use. The pointer operands of loads and
4330   // stores will be scalar as long as the memory accesses is not a gather or
4331   // scatter operation. The value operand of a store will remain scalar if the
4332   // store is scalarized.
4333   for (auto *BB : TheLoop->blocks())
4334     for (auto &I : *BB) {
4335       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4336         evaluatePtrUse(Load, Load->getPointerOperand());
4337       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4338         evaluatePtrUse(Store, Store->getPointerOperand());
4339         evaluatePtrUse(Store, Store->getValueOperand());
4340       }
4341     }
4342   for (auto *I : ScalarPtrs)
4343     if (!PossibleNonScalarPtrs.count(I)) {
4344       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4345       Worklist.insert(I);
4346     }
4347 
4348   // Insert the forced scalars.
4349   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4350   // induction variable when the PHI user is scalarized.
4351   auto ForcedScalar = ForcedScalars.find(VF);
4352   if (ForcedScalar != ForcedScalars.end())
4353     for (auto *I : ForcedScalar->second)
4354       Worklist.insert(I);
4355 
4356   // Expand the worklist by looking through any bitcasts and getelementptr
4357   // instructions we've already identified as scalar. This is similar to the
4358   // expansion step in collectLoopUniforms(); however, here we're only
4359   // expanding to include additional bitcasts and getelementptr instructions.
4360   unsigned Idx = 0;
4361   while (Idx != Worklist.size()) {
4362     Instruction *Dst = Worklist[Idx++];
4363     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4364       continue;
4365     auto *Src = cast<Instruction>(Dst->getOperand(0));
4366     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4367           auto *J = cast<Instruction>(U);
4368           return !TheLoop->contains(J) || Worklist.count(J) ||
4369                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4370                   isScalarUse(J, Src));
4371         })) {
4372       Worklist.insert(Src);
4373       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4374     }
4375   }
4376 
4377   // An induction variable will remain scalar if all users of the induction
4378   // variable and induction variable update remain scalar.
4379   for (auto &Induction : Legal->getInductionVars()) {
4380     auto *Ind = Induction.first;
4381     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4382 
4383     // If tail-folding is applied, the primary induction variable will be used
4384     // to feed a vector compare.
4385     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4386       continue;
4387 
4388     // Returns true if \p Indvar is a pointer induction that is used directly by
4389     // load/store instruction \p I.
4390     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4391                                               Instruction *I) {
4392       return Induction.second.getKind() ==
4393                  InductionDescriptor::IK_PtrInduction &&
4394              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4395              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4396     };
4397 
4398     // Determine if all users of the induction variable are scalar after
4399     // vectorization.
4400     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4401       auto *I = cast<Instruction>(U);
4402       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4403              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4404     });
4405     if (!ScalarInd)
4406       continue;
4407 
4408     // Determine if all users of the induction variable update instruction are
4409     // scalar after vectorization.
4410     auto ScalarIndUpdate =
4411         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4412           auto *I = cast<Instruction>(U);
4413           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4414                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4415         });
4416     if (!ScalarIndUpdate)
4417       continue;
4418 
4419     // The induction variable and its update instruction will remain scalar.
4420     Worklist.insert(Ind);
4421     Worklist.insert(IndUpdate);
4422     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4423     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4424                       << "\n");
4425   }
4426 
4427   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4428 }
4429 
4430 bool LoopVectorizationCostModel::isScalarWithPredication(
4431     Instruction *I, ElementCount VF) const {
4432   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4433     return false;
4434   switch(I->getOpcode()) {
4435   default:
4436     break;
4437   case Instruction::Load:
4438   case Instruction::Store: {
4439     if (!Legal->isMaskRequired(I))
4440       return false;
4441     auto *Ptr = getLoadStorePointerOperand(I);
4442     auto *Ty = getLoadStoreType(I);
4443     Type *VTy = Ty;
4444     if (VF.isVector())
4445       VTy = VectorType::get(Ty, VF);
4446     const Align Alignment = getLoadStoreAlignment(I);
4447     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4448                                 TTI.isLegalMaskedGather(VTy, Alignment))
4449                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4450                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4451   }
4452   case Instruction::UDiv:
4453   case Instruction::SDiv:
4454   case Instruction::SRem:
4455   case Instruction::URem:
4456     // TODO: We can use the loop-preheader as context point here and get
4457     // context sensitive reasoning
4458     return !isSafeToSpeculativelyExecute(I);
4459   }
4460   return false;
4461 }
4462 
4463 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4464     Instruction *I, ElementCount VF) {
4465   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4466   assert(getWideningDecision(I, VF) == CM_Unknown &&
4467          "Decision should not be set yet.");
4468   auto *Group = getInterleavedAccessGroup(I);
4469   assert(Group && "Must have a group.");
4470 
4471   // If the instruction's allocated size doesn't equal it's type size, it
4472   // requires padding and will be scalarized.
4473   auto &DL = I->getModule()->getDataLayout();
4474   auto *ScalarTy = getLoadStoreType(I);
4475   if (hasIrregularType(ScalarTy, DL))
4476     return false;
4477 
4478   // If the group involves a non-integral pointer, we may not be able to
4479   // losslessly cast all values to a common type.
4480   unsigned InterleaveFactor = Group->getFactor();
4481   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4482   for (unsigned i = 0; i < InterleaveFactor; i++) {
4483     Instruction *Member = Group->getMember(i);
4484     if (!Member)
4485       continue;
4486     auto *MemberTy = getLoadStoreType(Member);
4487     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4488     // Don't coerce non-integral pointers to integers or vice versa.
4489     if (MemberNI != ScalarNI) {
4490       // TODO: Consider adding special nullptr value case here
4491       return false;
4492     } else if (MemberNI && ScalarNI &&
4493                ScalarTy->getPointerAddressSpace() !=
4494                MemberTy->getPointerAddressSpace()) {
4495       return false;
4496     }
4497   }
4498 
4499   // Check if masking is required.
4500   // A Group may need masking for one of two reasons: it resides in a block that
4501   // needs predication, or it was decided to use masking to deal with gaps
4502   // (either a gap at the end of a load-access that may result in a speculative
4503   // load, or any gaps in a store-access).
4504   bool PredicatedAccessRequiresMasking =
4505       blockNeedsPredicationForAnyReason(I->getParent()) &&
4506       Legal->isMaskRequired(I);
4507   bool LoadAccessWithGapsRequiresEpilogMasking =
4508       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4509       !isScalarEpilogueAllowed();
4510   bool StoreAccessWithGapsRequiresMasking =
4511       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4512   if (!PredicatedAccessRequiresMasking &&
4513       !LoadAccessWithGapsRequiresEpilogMasking &&
4514       !StoreAccessWithGapsRequiresMasking)
4515     return true;
4516 
4517   // If masked interleaving is required, we expect that the user/target had
4518   // enabled it, because otherwise it either wouldn't have been created or
4519   // it should have been invalidated by the CostModel.
4520   assert(useMaskedInterleavedAccesses(TTI) &&
4521          "Masked interleave-groups for predicated accesses are not enabled.");
4522 
4523   if (Group->isReverse())
4524     return false;
4525 
4526   auto *Ty = getLoadStoreType(I);
4527   const Align Alignment = getLoadStoreAlignment(I);
4528   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4529                           : TTI.isLegalMaskedStore(Ty, Alignment);
4530 }
4531 
4532 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4533     Instruction *I, ElementCount VF) {
4534   // Get and ensure we have a valid memory instruction.
4535   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4536 
4537   auto *Ptr = getLoadStorePointerOperand(I);
4538   auto *ScalarTy = getLoadStoreType(I);
4539 
4540   // In order to be widened, the pointer should be consecutive, first of all.
4541   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4542     return false;
4543 
4544   // If the instruction is a store located in a predicated block, it will be
4545   // scalarized.
4546   if (isScalarWithPredication(I, VF))
4547     return false;
4548 
4549   // If the instruction's allocated size doesn't equal it's type size, it
4550   // requires padding and will be scalarized.
4551   auto &DL = I->getModule()->getDataLayout();
4552   if (hasIrregularType(ScalarTy, DL))
4553     return false;
4554 
4555   return true;
4556 }
4557 
4558 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4559   // We should not collect Uniforms more than once per VF. Right now,
4560   // this function is called from collectUniformsAndScalars(), which
4561   // already does this check. Collecting Uniforms for VF=1 does not make any
4562   // sense.
4563 
4564   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4565          "This function should not be visited twice for the same VF");
4566 
4567   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4568   // not analyze again.  Uniforms.count(VF) will return 1.
4569   Uniforms[VF].clear();
4570 
4571   // We now know that the loop is vectorizable!
4572   // Collect instructions inside the loop that will remain uniform after
4573   // vectorization.
4574 
4575   // Global values, params and instructions outside of current loop are out of
4576   // scope.
4577   auto isOutOfScope = [&](Value *V) -> bool {
4578     Instruction *I = dyn_cast<Instruction>(V);
4579     return (!I || !TheLoop->contains(I));
4580   };
4581 
4582   // Worklist containing uniform instructions demanding lane 0.
4583   SetVector<Instruction *> Worklist;
4584   BasicBlock *Latch = TheLoop->getLoopLatch();
4585 
4586   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4587   // that are scalar with predication must not be considered uniform after
4588   // vectorization, because that would create an erroneous replicating region
4589   // where only a single instance out of VF should be formed.
4590   // TODO: optimize such seldom cases if found important, see PR40816.
4591   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4592     if (isOutOfScope(I)) {
4593       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4594                         << *I << "\n");
4595       return;
4596     }
4597     if (isScalarWithPredication(I, VF)) {
4598       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4599                         << *I << "\n");
4600       return;
4601     }
4602     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4603     Worklist.insert(I);
4604   };
4605 
4606   // Start with the conditional branch. If the branch condition is an
4607   // instruction contained in the loop that is only used by the branch, it is
4608   // uniform.
4609   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4610   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4611     addToWorklistIfAllowed(Cmp);
4612 
4613   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4614     InstWidening WideningDecision = getWideningDecision(I, VF);
4615     assert(WideningDecision != CM_Unknown &&
4616            "Widening decision should be ready at this moment");
4617 
4618     // A uniform memory op is itself uniform.  We exclude uniform stores
4619     // here as they demand the last lane, not the first one.
4620     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4621       assert(WideningDecision == CM_Scalarize);
4622       return true;
4623     }
4624 
4625     return (WideningDecision == CM_Widen ||
4626             WideningDecision == CM_Widen_Reverse ||
4627             WideningDecision == CM_Interleave);
4628   };
4629 
4630   // Returns true if Ptr is the pointer operand of a memory access instruction
4631   // I, I is known to not require scalarization, and the pointer is not also
4632   // stored.
4633   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4634     auto GetStoredValue = [I]() -> Value * {
4635       if (!isa<StoreInst>(I))
4636         return nullptr;
4637       return I->getOperand(0);
4638     };
4639     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF) &&
4640            GetStoredValue() != Ptr;
4641   };
4642 
4643   // Holds a list of values which are known to have at least one uniform use.
4644   // Note that there may be other uses which aren't uniform.  A "uniform use"
4645   // here is something which only demands lane 0 of the unrolled iterations;
4646   // it does not imply that all lanes produce the same value (e.g. this is not
4647   // the usual meaning of uniform)
4648   SetVector<Value *> HasUniformUse;
4649 
4650   // Scan the loop for instructions which are either a) known to have only
4651   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4652   for (auto *BB : TheLoop->blocks())
4653     for (auto &I : *BB) {
4654       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4655         switch (II->getIntrinsicID()) {
4656         case Intrinsic::sideeffect:
4657         case Intrinsic::experimental_noalias_scope_decl:
4658         case Intrinsic::assume:
4659         case Intrinsic::lifetime_start:
4660         case Intrinsic::lifetime_end:
4661           if (TheLoop->hasLoopInvariantOperands(&I))
4662             addToWorklistIfAllowed(&I);
4663           break;
4664         default:
4665           break;
4666         }
4667       }
4668 
4669       // ExtractValue instructions must be uniform, because the operands are
4670       // known to be loop-invariant.
4671       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4672         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4673                "Expected aggregate value to be loop invariant");
4674         addToWorklistIfAllowed(EVI);
4675         continue;
4676       }
4677 
4678       // If there's no pointer operand, there's nothing to do.
4679       auto *Ptr = getLoadStorePointerOperand(&I);
4680       if (!Ptr)
4681         continue;
4682 
4683       // A uniform memory op is itself uniform.  We exclude uniform stores
4684       // here as they demand the last lane, not the first one.
4685       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4686         addToWorklistIfAllowed(&I);
4687 
4688       if (isVectorizedMemAccessUse(&I, Ptr)) {
4689         assert(isUniformDecision(&I, VF) && "consistency check");
4690         HasUniformUse.insert(Ptr);
4691       }
4692     }
4693 
4694   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4695   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4696   // disallows uses outside the loop as well.
4697   for (auto *V : HasUniformUse) {
4698     if (isOutOfScope(V))
4699       continue;
4700     auto *I = cast<Instruction>(V);
4701     auto UsersAreMemAccesses =
4702       llvm::all_of(I->users(), [&](User *U) -> bool {
4703         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4704       });
4705     if (UsersAreMemAccesses)
4706       addToWorklistIfAllowed(I);
4707   }
4708 
4709   // Expand Worklist in topological order: whenever a new instruction
4710   // is added , its users should be already inside Worklist.  It ensures
4711   // a uniform instruction will only be used by uniform instructions.
4712   unsigned idx = 0;
4713   while (idx != Worklist.size()) {
4714     Instruction *I = Worklist[idx++];
4715 
4716     for (auto OV : I->operand_values()) {
4717       // isOutOfScope operands cannot be uniform instructions.
4718       if (isOutOfScope(OV))
4719         continue;
4720       // First order recurrence Phi's should typically be considered
4721       // non-uniform.
4722       auto *OP = dyn_cast<PHINode>(OV);
4723       if (OP && Legal->isFirstOrderRecurrence(OP))
4724         continue;
4725       // If all the users of the operand are uniform, then add the
4726       // operand into the uniform worklist.
4727       auto *OI = cast<Instruction>(OV);
4728       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4729             auto *J = cast<Instruction>(U);
4730             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4731           }))
4732         addToWorklistIfAllowed(OI);
4733     }
4734   }
4735 
4736   // For an instruction to be added into Worklist above, all its users inside
4737   // the loop should also be in Worklist. However, this condition cannot be
4738   // true for phi nodes that form a cyclic dependence. We must process phi
4739   // nodes separately. An induction variable will remain uniform if all users
4740   // of the induction variable and induction variable update remain uniform.
4741   // The code below handles both pointer and non-pointer induction variables.
4742   for (auto &Induction : Legal->getInductionVars()) {
4743     auto *Ind = Induction.first;
4744     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4745 
4746     // Determine if all users of the induction variable are uniform after
4747     // vectorization.
4748     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4749       auto *I = cast<Instruction>(U);
4750       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4751              isVectorizedMemAccessUse(I, Ind);
4752     });
4753     if (!UniformInd)
4754       continue;
4755 
4756     // Determine if all users of the induction variable update instruction are
4757     // uniform after vectorization.
4758     auto UniformIndUpdate =
4759         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4760           auto *I = cast<Instruction>(U);
4761           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4762                  isVectorizedMemAccessUse(I, IndUpdate);
4763         });
4764     if (!UniformIndUpdate)
4765       continue;
4766 
4767     // The induction variable and its update instruction will remain uniform.
4768     addToWorklistIfAllowed(Ind);
4769     addToWorklistIfAllowed(IndUpdate);
4770   }
4771 
4772   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4773 }
4774 
4775 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4776   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4777 
4778   if (Legal->getRuntimePointerChecking()->Need) {
4779     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4780         "runtime pointer checks needed. Enable vectorization of this "
4781         "loop with '#pragma clang loop vectorize(enable)' when "
4782         "compiling with -Os/-Oz",
4783         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4784     return true;
4785   }
4786 
4787   if (!PSE.getPredicate().isAlwaysTrue()) {
4788     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4789         "runtime SCEV checks needed. Enable vectorization of this "
4790         "loop with '#pragma clang loop vectorize(enable)' when "
4791         "compiling with -Os/-Oz",
4792         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4793     return true;
4794   }
4795 
4796   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4797   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4798     reportVectorizationFailure("Runtime stride check for small trip count",
4799         "runtime stride == 1 checks needed. Enable vectorization of "
4800         "this loop without such check by compiling with -Os/-Oz",
4801         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4802     return true;
4803   }
4804 
4805   return false;
4806 }
4807 
4808 ElementCount
4809 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4810   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4811     return ElementCount::getScalable(0);
4812 
4813   if (Hints->isScalableVectorizationDisabled()) {
4814     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4815                             "ScalableVectorizationDisabled", ORE, TheLoop);
4816     return ElementCount::getScalable(0);
4817   }
4818 
4819   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4820 
4821   auto MaxScalableVF = ElementCount::getScalable(
4822       std::numeric_limits<ElementCount::ScalarTy>::max());
4823 
4824   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4825   // FIXME: While for scalable vectors this is currently sufficient, this should
4826   // be replaced by a more detailed mechanism that filters out specific VFs,
4827   // instead of invalidating vectorization for a whole set of VFs based on the
4828   // MaxVF.
4829 
4830   // Disable scalable vectorization if the loop contains unsupported reductions.
4831   if (!canVectorizeReductions(MaxScalableVF)) {
4832     reportVectorizationInfo(
4833         "Scalable vectorization not supported for the reduction "
4834         "operations found in this loop.",
4835         "ScalableVFUnfeasible", ORE, TheLoop);
4836     return ElementCount::getScalable(0);
4837   }
4838 
4839   // Disable scalable vectorization if the loop contains any instructions
4840   // with element types not supported for scalable vectors.
4841   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4842         return !Ty->isVoidTy() &&
4843                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4844       })) {
4845     reportVectorizationInfo("Scalable vectorization is not supported "
4846                             "for all element types found in this loop.",
4847                             "ScalableVFUnfeasible", ORE, TheLoop);
4848     return ElementCount::getScalable(0);
4849   }
4850 
4851   if (Legal->isSafeForAnyVectorWidth())
4852     return MaxScalableVF;
4853 
4854   // Limit MaxScalableVF by the maximum safe dependence distance.
4855   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
4856   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4857     MaxVScale =
4858         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4859   MaxScalableVF = ElementCount::getScalable(
4860       MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0);
4861   if (!MaxScalableVF)
4862     reportVectorizationInfo(
4863         "Max legal vector width too small, scalable vectorization "
4864         "unfeasible.",
4865         "ScalableVFUnfeasible", ORE, TheLoop);
4866 
4867   return MaxScalableVF;
4868 }
4869 
4870 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4871     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4872   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4873   unsigned SmallestType, WidestType;
4874   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4875 
4876   // Get the maximum safe dependence distance in bits computed by LAA.
4877   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4878   // the memory accesses that is most restrictive (involved in the smallest
4879   // dependence distance).
4880   unsigned MaxSafeElements =
4881       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4882 
4883   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4884   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4885 
4886   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4887                     << ".\n");
4888   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4889                     << ".\n");
4890 
4891   // First analyze the UserVF, fall back if the UserVF should be ignored.
4892   if (UserVF) {
4893     auto MaxSafeUserVF =
4894         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4895 
4896     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4897       // If `VF=vscale x N` is safe, then so is `VF=N`
4898       if (UserVF.isScalable())
4899         return FixedScalableVFPair(
4900             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4901       else
4902         return UserVF;
4903     }
4904 
4905     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4906 
4907     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4908     // is better to ignore the hint and let the compiler choose a suitable VF.
4909     if (!UserVF.isScalable()) {
4910       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4911                         << " is unsafe, clamping to max safe VF="
4912                         << MaxSafeFixedVF << ".\n");
4913       ORE->emit([&]() {
4914         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4915                                           TheLoop->getStartLoc(),
4916                                           TheLoop->getHeader())
4917                << "User-specified vectorization factor "
4918                << ore::NV("UserVectorizationFactor", UserVF)
4919                << " is unsafe, clamping to maximum safe vectorization factor "
4920                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4921       });
4922       return MaxSafeFixedVF;
4923     }
4924 
4925     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4926       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4927                         << " is ignored because scalable vectors are not "
4928                            "available.\n");
4929       ORE->emit([&]() {
4930         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4931                                           TheLoop->getStartLoc(),
4932                                           TheLoop->getHeader())
4933                << "User-specified vectorization factor "
4934                << ore::NV("UserVectorizationFactor", UserVF)
4935                << " is ignored because the target does not support scalable "
4936                   "vectors. The compiler will pick a more suitable value.";
4937       });
4938     } else {
4939       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4940                         << " is unsafe. Ignoring scalable UserVF.\n");
4941       ORE->emit([&]() {
4942         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4943                                           TheLoop->getStartLoc(),
4944                                           TheLoop->getHeader())
4945                << "User-specified vectorization factor "
4946                << ore::NV("UserVectorizationFactor", UserVF)
4947                << " is unsafe. Ignoring the hint to let the compiler pick a "
4948                   "more suitable value.";
4949       });
4950     }
4951   }
4952 
4953   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4954                     << " / " << WidestType << " bits.\n");
4955 
4956   FixedScalableVFPair Result(ElementCount::getFixed(1),
4957                              ElementCount::getScalable(0));
4958   if (auto MaxVF =
4959           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
4960                                   MaxSafeFixedVF, FoldTailByMasking))
4961     Result.FixedVF = MaxVF;
4962 
4963   if (auto MaxVF =
4964           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
4965                                   MaxSafeScalableVF, FoldTailByMasking))
4966     if (MaxVF.isScalable()) {
4967       Result.ScalableVF = MaxVF;
4968       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4969                         << "\n");
4970     }
4971 
4972   return Result;
4973 }
4974 
4975 FixedScalableVFPair
4976 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4977   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4978     // TODO: It may by useful to do since it's still likely to be dynamically
4979     // uniform if the target can skip.
4980     reportVectorizationFailure(
4981         "Not inserting runtime ptr check for divergent target",
4982         "runtime pointer checks needed. Not enabled for divergent target",
4983         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4984     return FixedScalableVFPair::getNone();
4985   }
4986 
4987   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4988   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4989   if (TC == 1) {
4990     reportVectorizationFailure("Single iteration (non) loop",
4991         "loop trip count is one, irrelevant for vectorization",
4992         "SingleIterationLoop", ORE, TheLoop);
4993     return FixedScalableVFPair::getNone();
4994   }
4995 
4996   switch (ScalarEpilogueStatus) {
4997   case CM_ScalarEpilogueAllowed:
4998     return computeFeasibleMaxVF(TC, UserVF, false);
4999   case CM_ScalarEpilogueNotAllowedUsePredicate:
5000     LLVM_FALLTHROUGH;
5001   case CM_ScalarEpilogueNotNeededUsePredicate:
5002     LLVM_DEBUG(
5003         dbgs() << "LV: vector predicate hint/switch found.\n"
5004                << "LV: Not allowing scalar epilogue, creating predicated "
5005                << "vector loop.\n");
5006     break;
5007   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5008     // fallthrough as a special case of OptForSize
5009   case CM_ScalarEpilogueNotAllowedOptSize:
5010     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5011       LLVM_DEBUG(
5012           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5013     else
5014       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5015                         << "count.\n");
5016 
5017     // Bail if runtime checks are required, which are not good when optimising
5018     // for size.
5019     if (runtimeChecksRequired())
5020       return FixedScalableVFPair::getNone();
5021 
5022     break;
5023   }
5024 
5025   // The only loops we can vectorize without a scalar epilogue, are loops with
5026   // a bottom-test and a single exiting block. We'd have to handle the fact
5027   // that not every instruction executes on the last iteration.  This will
5028   // require a lane mask which varies through the vector loop body.  (TODO)
5029   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5030     // If there was a tail-folding hint/switch, but we can't fold the tail by
5031     // masking, fallback to a vectorization with a scalar epilogue.
5032     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5033       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5034                            "scalar epilogue instead.\n");
5035       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5036       return computeFeasibleMaxVF(TC, UserVF, false);
5037     }
5038     return FixedScalableVFPair::getNone();
5039   }
5040 
5041   // Now try the tail folding
5042 
5043   // Invalidate interleave groups that require an epilogue if we can't mask
5044   // the interleave-group.
5045   if (!useMaskedInterleavedAccesses(TTI)) {
5046     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5047            "No decisions should have been taken at this point");
5048     // Note: There is no need to invalidate any cost modeling decisions here, as
5049     // non where taken so far.
5050     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5051   }
5052 
5053   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5054   // Avoid tail folding if the trip count is known to be a multiple of any VF
5055   // we chose.
5056   // FIXME: The condition below pessimises the case for fixed-width vectors,
5057   // when scalable VFs are also candidates for vectorization.
5058   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5059     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5060     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5061            "MaxFixedVF must be a power of 2");
5062     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5063                                    : MaxFixedVF.getFixedValue();
5064     ScalarEvolution *SE = PSE.getSE();
5065     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5066     const SCEV *ExitCount = SE->getAddExpr(
5067         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5068     const SCEV *Rem = SE->getURemExpr(
5069         SE->applyLoopGuards(ExitCount, TheLoop),
5070         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5071     if (Rem->isZero()) {
5072       // Accept MaxFixedVF if we do not have a tail.
5073       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5074       return MaxFactors;
5075     }
5076   }
5077 
5078   // If we don't know the precise trip count, or if the trip count that we
5079   // found modulo the vectorization factor is not zero, try to fold the tail
5080   // by masking.
5081   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5082   if (Legal->prepareToFoldTailByMasking()) {
5083     FoldTailByMasking = true;
5084     return MaxFactors;
5085   }
5086 
5087   // If there was a tail-folding hint/switch, but we can't fold the tail by
5088   // masking, fallback to a vectorization with a scalar epilogue.
5089   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5090     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5091                          "scalar epilogue instead.\n");
5092     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5093     return MaxFactors;
5094   }
5095 
5096   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5097     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5098     return FixedScalableVFPair::getNone();
5099   }
5100 
5101   if (TC == 0) {
5102     reportVectorizationFailure(
5103         "Unable to calculate the loop count due to complex control flow",
5104         "unable to calculate the loop count due to complex control flow",
5105         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5106     return FixedScalableVFPair::getNone();
5107   }
5108 
5109   reportVectorizationFailure(
5110       "Cannot optimize for size and vectorize at the same time.",
5111       "cannot optimize for size and vectorize at the same time. "
5112       "Enable vectorization of this loop with '#pragma clang loop "
5113       "vectorize(enable)' when compiling with -Os/-Oz",
5114       "NoTailLoopWithOptForSize", ORE, TheLoop);
5115   return FixedScalableVFPair::getNone();
5116 }
5117 
5118 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5119     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5120     ElementCount MaxSafeVF, bool FoldTailByMasking) {
5121   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5122   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5123       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5124                            : TargetTransformInfo::RGK_FixedWidthVector);
5125 
5126   // Convenience function to return the minimum of two ElementCounts.
5127   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5128     assert((LHS.isScalable() == RHS.isScalable()) &&
5129            "Scalable flags must match");
5130     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5131   };
5132 
5133   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5134   // Note that both WidestRegister and WidestType may not be a powers of 2.
5135   auto MaxVectorElementCount = ElementCount::get(
5136       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5137       ComputeScalableMaxVF);
5138   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5139   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5140                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5141 
5142   if (!MaxVectorElementCount) {
5143     LLVM_DEBUG(dbgs() << "LV: The target has no "
5144                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5145                       << " vector registers.\n");
5146     return ElementCount::getFixed(1);
5147   }
5148 
5149   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5150   if (ConstTripCount &&
5151       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5152       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5153     // If loop trip count (TC) is known at compile time there is no point in
5154     // choosing VF greater than TC (as done in the loop below). Select maximum
5155     // power of two which doesn't exceed TC.
5156     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5157     // when the TC is less than or equal to the known number of lanes.
5158     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5159     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5160                          "exceeding the constant trip count: "
5161                       << ClampedConstTripCount << "\n");
5162     return ElementCount::getFixed(ClampedConstTripCount);
5163   }
5164 
5165   TargetTransformInfo::RegisterKind RegKind =
5166       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5167                            : TargetTransformInfo::RGK_FixedWidthVector;
5168   ElementCount MaxVF = MaxVectorElementCount;
5169   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5170                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5171     auto MaxVectorElementCountMaxBW = ElementCount::get(
5172         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5173         ComputeScalableMaxVF);
5174     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5175 
5176     // Collect all viable vectorization factors larger than the default MaxVF
5177     // (i.e. MaxVectorElementCount).
5178     SmallVector<ElementCount, 8> VFs;
5179     for (ElementCount VS = MaxVectorElementCount * 2;
5180          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5181       VFs.push_back(VS);
5182 
5183     // For each VF calculate its register usage.
5184     auto RUs = calculateRegisterUsage(VFs);
5185 
5186     // Select the largest VF which doesn't require more registers than existing
5187     // ones.
5188     for (int i = RUs.size() - 1; i >= 0; --i) {
5189       bool Selected = true;
5190       for (auto &pair : RUs[i].MaxLocalUsers) {
5191         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5192         if (pair.second > TargetNumRegisters)
5193           Selected = false;
5194       }
5195       if (Selected) {
5196         MaxVF = VFs[i];
5197         break;
5198       }
5199     }
5200     if (ElementCount MinVF =
5201             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5202       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5203         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5204                           << ") with target's minimum: " << MinVF << '\n');
5205         MaxVF = MinVF;
5206       }
5207     }
5208 
5209     // Invalidate any widening decisions we might have made, in case the loop
5210     // requires prediction (decided later), but we have already made some
5211     // load/store widening decisions.
5212     invalidateCostModelingDecisions();
5213   }
5214   return MaxVF;
5215 }
5216 
5217 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5218   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5219     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5220     auto Min = Attr.getVScaleRangeMin();
5221     auto Max = Attr.getVScaleRangeMax();
5222     if (Max && Min == Max)
5223       return Max;
5224   }
5225 
5226   return TTI.getVScaleForTuning();
5227 }
5228 
5229 bool LoopVectorizationCostModel::isMoreProfitable(
5230     const VectorizationFactor &A, const VectorizationFactor &B) const {
5231   InstructionCost CostA = A.Cost;
5232   InstructionCost CostB = B.Cost;
5233 
5234   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5235 
5236   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5237       MaxTripCount) {
5238     // If we are folding the tail and the trip count is a known (possibly small)
5239     // constant, the trip count will be rounded up to an integer number of
5240     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5241     // which we compare directly. When not folding the tail, the total cost will
5242     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5243     // approximated with the per-lane cost below instead of using the tripcount
5244     // as here.
5245     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5246     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5247     return RTCostA < RTCostB;
5248   }
5249 
5250   // Improve estimate for the vector width if it is scalable.
5251   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5252   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5253   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5254     if (A.Width.isScalable())
5255       EstimatedWidthA *= VScale.value();
5256     if (B.Width.isScalable())
5257       EstimatedWidthB *= VScale.value();
5258   }
5259 
5260   // Assume vscale may be larger than 1 (or the value being tuned for),
5261   // so that scalable vectorization is slightly favorable over fixed-width
5262   // vectorization.
5263   if (A.Width.isScalable() && !B.Width.isScalable())
5264     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5265 
5266   // To avoid the need for FP division:
5267   //      (CostA / A.Width) < (CostB / B.Width)
5268   // <=>  (CostA * B.Width) < (CostB * A.Width)
5269   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5270 }
5271 
5272 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5273     const ElementCountSet &VFCandidates) {
5274   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5275   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5276   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5277   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5278          "Expected Scalar VF to be a candidate");
5279 
5280   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5281                                        ExpectedCost);
5282   VectorizationFactor ChosenFactor = ScalarCost;
5283 
5284   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5285   if (ForceVectorization && VFCandidates.size() > 1) {
5286     // Ignore scalar width, because the user explicitly wants vectorization.
5287     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5288     // evaluation.
5289     ChosenFactor.Cost = InstructionCost::getMax();
5290   }
5291 
5292   SmallVector<InstructionVFPair> InvalidCosts;
5293   for (const auto &i : VFCandidates) {
5294     // The cost for scalar VF=1 is already calculated, so ignore it.
5295     if (i.isScalar())
5296       continue;
5297 
5298     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5299     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5300 
5301 #ifndef NDEBUG
5302     unsigned AssumedMinimumVscale = 1;
5303     if (Optional<unsigned> VScale = getVScaleForTuning())
5304       AssumedMinimumVscale = *VScale;
5305     unsigned Width =
5306         Candidate.Width.isScalable()
5307             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5308             : Candidate.Width.getFixedValue();
5309     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5310                       << " costs: " << (Candidate.Cost / Width));
5311     if (i.isScalable())
5312       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5313                         << AssumedMinimumVscale << ")");
5314     LLVM_DEBUG(dbgs() << ".\n");
5315 #endif
5316 
5317     if (!C.second && !ForceVectorization) {
5318       LLVM_DEBUG(
5319           dbgs() << "LV: Not considering vector loop of width " << i
5320                  << " because it will not generate any vector instructions.\n");
5321       continue;
5322     }
5323 
5324     // If profitable add it to ProfitableVF list.
5325     if (isMoreProfitable(Candidate, ScalarCost))
5326       ProfitableVFs.push_back(Candidate);
5327 
5328     if (isMoreProfitable(Candidate, ChosenFactor))
5329       ChosenFactor = Candidate;
5330   }
5331 
5332   // Emit a report of VFs with invalid costs in the loop.
5333   if (!InvalidCosts.empty()) {
5334     // Group the remarks per instruction, keeping the instruction order from
5335     // InvalidCosts.
5336     std::map<Instruction *, unsigned> Numbering;
5337     unsigned I = 0;
5338     for (auto &Pair : InvalidCosts)
5339       if (!Numbering.count(Pair.first))
5340         Numbering[Pair.first] = I++;
5341 
5342     // Sort the list, first on instruction(number) then on VF.
5343     llvm::sort(InvalidCosts,
5344                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5345                  if (Numbering[A.first] != Numbering[B.first])
5346                    return Numbering[A.first] < Numbering[B.first];
5347                  ElementCountComparator ECC;
5348                  return ECC(A.second, B.second);
5349                });
5350 
5351     // For a list of ordered instruction-vf pairs:
5352     //   [(load, vf1), (load, vf2), (store, vf1)]
5353     // Group the instructions together to emit separate remarks for:
5354     //   load  (vf1, vf2)
5355     //   store (vf1)
5356     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5357     auto Subset = ArrayRef<InstructionVFPair>();
5358     do {
5359       if (Subset.empty())
5360         Subset = Tail.take_front(1);
5361 
5362       Instruction *I = Subset.front().first;
5363 
5364       // If the next instruction is different, or if there are no other pairs,
5365       // emit a remark for the collated subset. e.g.
5366       //   [(load, vf1), (load, vf2))]
5367       // to emit:
5368       //  remark: invalid costs for 'load' at VF=(vf, vf2)
5369       if (Subset == Tail || Tail[Subset.size()].first != I) {
5370         std::string OutString;
5371         raw_string_ostream OS(OutString);
5372         assert(!Subset.empty() && "Unexpected empty range");
5373         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5374         for (auto &Pair : Subset)
5375           OS << (Pair.second == Subset.front().second ? "" : ", ")
5376              << Pair.second;
5377         OS << "):";
5378         if (auto *CI = dyn_cast<CallInst>(I))
5379           OS << " call to " << CI->getCalledFunction()->getName();
5380         else
5381           OS << " " << I->getOpcodeName();
5382         OS.flush();
5383         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5384         Tail = Tail.drop_front(Subset.size());
5385         Subset = {};
5386       } else
5387         // Grow the subset by one element
5388         Subset = Tail.take_front(Subset.size() + 1);
5389     } while (!Tail.empty());
5390   }
5391 
5392   if (!EnableCondStoresVectorization && NumPredStores) {
5393     reportVectorizationFailure("There are conditional stores.",
5394         "store that is conditionally executed prevents vectorization",
5395         "ConditionalStore", ORE, TheLoop);
5396     ChosenFactor = ScalarCost;
5397   }
5398 
5399   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5400                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5401              << "LV: Vectorization seems to be not beneficial, "
5402              << "but was forced by a user.\n");
5403   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5404   return ChosenFactor;
5405 }
5406 
5407 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5408     const Loop &L, ElementCount VF) const {
5409   // Cross iteration phis such as reductions need special handling and are
5410   // currently unsupported.
5411   if (any_of(L.getHeader()->phis(),
5412              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5413     return false;
5414 
5415   // Phis with uses outside of the loop require special handling and are
5416   // currently unsupported.
5417   for (auto &Entry : Legal->getInductionVars()) {
5418     // Look for uses of the value of the induction at the last iteration.
5419     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5420     for (User *U : PostInc->users())
5421       if (!L.contains(cast<Instruction>(U)))
5422         return false;
5423     // Look for uses of penultimate value of the induction.
5424     for (User *U : Entry.first->users())
5425       if (!L.contains(cast<Instruction>(U)))
5426         return false;
5427   }
5428 
5429   // Induction variables that are widened require special handling that is
5430   // currently not supported.
5431   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5432         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5433                  this->isProfitableToScalarize(Entry.first, VF));
5434       }))
5435     return false;
5436 
5437   // Epilogue vectorization code has not been auditted to ensure it handles
5438   // non-latch exits properly.  It may be fine, but it needs auditted and
5439   // tested.
5440   if (L.getExitingBlock() != L.getLoopLatch())
5441     return false;
5442 
5443   return true;
5444 }
5445 
5446 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5447     const ElementCount VF) const {
5448   // FIXME: We need a much better cost-model to take different parameters such
5449   // as register pressure, code size increase and cost of extra branches into
5450   // account. For now we apply a very crude heuristic and only consider loops
5451   // with vectorization factors larger than a certain value.
5452   // We also consider epilogue vectorization unprofitable for targets that don't
5453   // consider interleaving beneficial (eg. MVE).
5454   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5455     return false;
5456   // FIXME: We should consider changing the threshold for scalable
5457   // vectors to take VScaleForTuning into account.
5458   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5459     return true;
5460   return false;
5461 }
5462 
5463 VectorizationFactor
5464 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5465     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5466   VectorizationFactor Result = VectorizationFactor::Disabled();
5467   if (!EnableEpilogueVectorization) {
5468     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5469     return Result;
5470   }
5471 
5472   if (!isScalarEpilogueAllowed()) {
5473     LLVM_DEBUG(
5474         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5475                   "allowed.\n";);
5476     return Result;
5477   }
5478 
5479   // Not really a cost consideration, but check for unsupported cases here to
5480   // simplify the logic.
5481   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5482     LLVM_DEBUG(
5483         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5484                   "not a supported candidate.\n";);
5485     return Result;
5486   }
5487 
5488   if (EpilogueVectorizationForceVF > 1) {
5489     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5490     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5491     if (LVP.hasPlanWithVF(ForcedEC))
5492       return {ForcedEC, 0, 0};
5493     else {
5494       LLVM_DEBUG(
5495           dbgs()
5496               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5497       return Result;
5498     }
5499   }
5500 
5501   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5502       TheLoop->getHeader()->getParent()->hasMinSize()) {
5503     LLVM_DEBUG(
5504         dbgs()
5505             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5506     return Result;
5507   }
5508 
5509   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5510     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5511                          "this loop\n");
5512     return Result;
5513   }
5514 
5515   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5516   // the main loop handles 8 lanes per iteration. We could still benefit from
5517   // vectorizing the epilogue loop with VF=4.
5518   ElementCount EstimatedRuntimeVF = MainLoopVF;
5519   if (MainLoopVF.isScalable()) {
5520     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5521     if (Optional<unsigned> VScale = getVScaleForTuning())
5522       EstimatedRuntimeVF *= *VScale;
5523   }
5524 
5525   for (auto &NextVF : ProfitableVFs)
5526     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5527           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5528          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5529         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5530         LVP.hasPlanWithVF(NextVF.Width))
5531       Result = NextVF;
5532 
5533   if (Result != VectorizationFactor::Disabled())
5534     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5535                       << Result.Width << "\n";);
5536   return Result;
5537 }
5538 
5539 std::pair<unsigned, unsigned>
5540 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5541   unsigned MinWidth = -1U;
5542   unsigned MaxWidth = 8;
5543   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5544   // For in-loop reductions, no element types are added to ElementTypesInLoop
5545   // if there are no loads/stores in the loop. In this case, check through the
5546   // reduction variables to determine the maximum width.
5547   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5548     // Reset MaxWidth so that we can find the smallest type used by recurrences
5549     // in the loop.
5550     MaxWidth = -1U;
5551     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5552       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5553       // When finding the min width used by the recurrence we need to account
5554       // for casts on the input operands of the recurrence.
5555       MaxWidth = std::min<unsigned>(
5556           MaxWidth, std::min<unsigned>(
5557                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5558                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5559     }
5560   } else {
5561     for (Type *T : ElementTypesInLoop) {
5562       MinWidth = std::min<unsigned>(
5563           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5564       MaxWidth = std::max<unsigned>(
5565           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5566     }
5567   }
5568   return {MinWidth, MaxWidth};
5569 }
5570 
5571 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5572   ElementTypesInLoop.clear();
5573   // For each block.
5574   for (BasicBlock *BB : TheLoop->blocks()) {
5575     // For each instruction in the loop.
5576     for (Instruction &I : BB->instructionsWithoutDebug()) {
5577       Type *T = I.getType();
5578 
5579       // Skip ignored values.
5580       if (ValuesToIgnore.count(&I))
5581         continue;
5582 
5583       // Only examine Loads, Stores and PHINodes.
5584       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5585         continue;
5586 
5587       // Examine PHI nodes that are reduction variables. Update the type to
5588       // account for the recurrence type.
5589       if (auto *PN = dyn_cast<PHINode>(&I)) {
5590         if (!Legal->isReductionVariable(PN))
5591           continue;
5592         const RecurrenceDescriptor &RdxDesc =
5593             Legal->getReductionVars().find(PN)->second;
5594         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5595             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5596                                       RdxDesc.getRecurrenceType(),
5597                                       TargetTransformInfo::ReductionFlags()))
5598           continue;
5599         T = RdxDesc.getRecurrenceType();
5600       }
5601 
5602       // Examine the stored values.
5603       if (auto *ST = dyn_cast<StoreInst>(&I))
5604         T = ST->getValueOperand()->getType();
5605 
5606       assert(T->isSized() &&
5607              "Expected the load/store/recurrence type to be sized");
5608 
5609       ElementTypesInLoop.insert(T);
5610     }
5611   }
5612 }
5613 
5614 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5615                                                            unsigned LoopCost) {
5616   // -- The interleave heuristics --
5617   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5618   // There are many micro-architectural considerations that we can't predict
5619   // at this level. For example, frontend pressure (on decode or fetch) due to
5620   // code size, or the number and capabilities of the execution ports.
5621   //
5622   // We use the following heuristics to select the interleave count:
5623   // 1. If the code has reductions, then we interleave to break the cross
5624   // iteration dependency.
5625   // 2. If the loop is really small, then we interleave to reduce the loop
5626   // overhead.
5627   // 3. We don't interleave if we think that we will spill registers to memory
5628   // due to the increased register pressure.
5629 
5630   if (!isScalarEpilogueAllowed())
5631     return 1;
5632 
5633   // We used the distance for the interleave count.
5634   if (Legal->getMaxSafeDepDistBytes() != -1U)
5635     return 1;
5636 
5637   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5638   const bool HasReductions = !Legal->getReductionVars().empty();
5639   // Do not interleave loops with a relatively small known or estimated trip
5640   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5641   // enabled, and the code has scalar reductions(HasReductions && VF = 1),
5642   // because with the above conditions interleaving can expose ILP and break
5643   // cross iteration dependences for reductions.
5644   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5645       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5646     return 1;
5647 
5648   // If we did not calculate the cost for VF (because the user selected the VF)
5649   // then we calculate the cost of VF here.
5650   if (LoopCost == 0) {
5651     InstructionCost C = expectedCost(VF).first;
5652     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
5653     LoopCost = *C.getValue();
5654 
5655     // Loop body is free and there is no need for interleaving.
5656     if (LoopCost == 0)
5657       return 1;
5658   }
5659 
5660   RegisterUsage R = calculateRegisterUsage({VF})[0];
5661   // We divide by these constants so assume that we have at least one
5662   // instruction that uses at least one register.
5663   for (auto& pair : R.MaxLocalUsers) {
5664     pair.second = std::max(pair.second, 1U);
5665   }
5666 
5667   // We calculate the interleave count using the following formula.
5668   // Subtract the number of loop invariants from the number of available
5669   // registers. These registers are used by all of the interleaved instances.
5670   // Next, divide the remaining registers by the number of registers that is
5671   // required by the loop, in order to estimate how many parallel instances
5672   // fit without causing spills. All of this is rounded down if necessary to be
5673   // a power of two. We want power of two interleave count to simplify any
5674   // addressing operations or alignment considerations.
5675   // We also want power of two interleave counts to ensure that the induction
5676   // variable of the vector loop wraps to zero, when tail is folded by masking;
5677   // this currently happens when OptForSize, in which case IC is set to 1 above.
5678   unsigned IC = UINT_MAX;
5679 
5680   for (auto& pair : R.MaxLocalUsers) {
5681     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5682     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5683                       << " registers of "
5684                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5685     if (VF.isScalar()) {
5686       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5687         TargetNumRegisters = ForceTargetNumScalarRegs;
5688     } else {
5689       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5690         TargetNumRegisters = ForceTargetNumVectorRegs;
5691     }
5692     unsigned MaxLocalUsers = pair.second;
5693     unsigned LoopInvariantRegs = 0;
5694     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5695       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5696 
5697     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5698     // Don't count the induction variable as interleaved.
5699     if (EnableIndVarRegisterHeur) {
5700       TmpIC =
5701           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5702                         std::max(1U, (MaxLocalUsers - 1)));
5703     }
5704 
5705     IC = std::min(IC, TmpIC);
5706   }
5707 
5708   // Clamp the interleave ranges to reasonable counts.
5709   unsigned MaxInterleaveCount =
5710       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5711 
5712   // Check if the user has overridden the max.
5713   if (VF.isScalar()) {
5714     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5715       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5716   } else {
5717     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5718       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5719   }
5720 
5721   // If trip count is known or estimated compile time constant, limit the
5722   // interleave count to be less than the trip count divided by VF, provided it
5723   // is at least 1.
5724   //
5725   // For scalable vectors we can't know if interleaving is beneficial. It may
5726   // not be beneficial for small loops if none of the lanes in the second vector
5727   // iterations is enabled. However, for larger loops, there is likely to be a
5728   // similar benefit as for fixed-width vectors. For now, we choose to leave
5729   // the InterleaveCount as if vscale is '1', although if some information about
5730   // the vector is known (e.g. min vector size), we can make a better decision.
5731   if (BestKnownTC) {
5732     MaxInterleaveCount =
5733         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5734     // Make sure MaxInterleaveCount is greater than 0.
5735     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5736   }
5737 
5738   assert(MaxInterleaveCount > 0 &&
5739          "Maximum interleave count must be greater than 0");
5740 
5741   // Clamp the calculated IC to be between the 1 and the max interleave count
5742   // that the target and trip count allows.
5743   if (IC > MaxInterleaveCount)
5744     IC = MaxInterleaveCount;
5745   else
5746     // Make sure IC is greater than 0.
5747     IC = std::max(1u, IC);
5748 
5749   assert(IC > 0 && "Interleave count must be greater than 0.");
5750 
5751   // Interleave if we vectorized this loop and there is a reduction that could
5752   // benefit from interleaving.
5753   if (VF.isVector() && HasReductions) {
5754     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5755     return IC;
5756   }
5757 
5758   // For any scalar loop that either requires runtime checks or predication we
5759   // are better off leaving this to the unroller. Note that if we've already
5760   // vectorized the loop we will have done the runtime check and so interleaving
5761   // won't require further checks.
5762   bool ScalarInterleavingRequiresPredication =
5763       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5764          return Legal->blockNeedsPredication(BB);
5765        }));
5766   bool ScalarInterleavingRequiresRuntimePointerCheck =
5767       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5768 
5769   // We want to interleave small loops in order to reduce the loop overhead and
5770   // potentially expose ILP opportunities.
5771   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5772                     << "LV: IC is " << IC << '\n'
5773                     << "LV: VF is " << VF << '\n');
5774   const bool AggressivelyInterleaveReductions =
5775       TTI.enableAggressiveInterleaving(HasReductions);
5776   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5777       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5778     // We assume that the cost overhead is 1 and we use the cost model
5779     // to estimate the cost of the loop and interleave until the cost of the
5780     // loop overhead is about 5% of the cost of the loop.
5781     unsigned SmallIC =
5782         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5783 
5784     // Interleave until store/load ports (estimated by max interleave count) are
5785     // saturated.
5786     unsigned NumStores = Legal->getNumStores();
5787     unsigned NumLoads = Legal->getNumLoads();
5788     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5789     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5790 
5791     // There is little point in interleaving for reductions containing selects
5792     // and compares when VF=1 since it may just create more overhead than it's
5793     // worth for loops with small trip counts. This is because we still have to
5794     // do the final reduction after the loop.
5795     bool HasSelectCmpReductions =
5796         HasReductions &&
5797         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5798           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5799           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5800               RdxDesc.getRecurrenceKind());
5801         });
5802     if (HasSelectCmpReductions) {
5803       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5804       return 1;
5805     }
5806 
5807     // If we have a scalar reduction (vector reductions are already dealt with
5808     // by this point), we can increase the critical path length if the loop
5809     // we're interleaving is inside another loop. For tree-wise reductions
5810     // set the limit to 2, and for ordered reductions it's best to disable
5811     // interleaving entirely.
5812     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5813       bool HasOrderedReductions =
5814           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5815             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5816             return RdxDesc.isOrdered();
5817           });
5818       if (HasOrderedReductions) {
5819         LLVM_DEBUG(
5820             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5821         return 1;
5822       }
5823 
5824       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5825       SmallIC = std::min(SmallIC, F);
5826       StoresIC = std::min(StoresIC, F);
5827       LoadsIC = std::min(LoadsIC, F);
5828     }
5829 
5830     if (EnableLoadStoreRuntimeInterleave &&
5831         std::max(StoresIC, LoadsIC) > SmallIC) {
5832       LLVM_DEBUG(
5833           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5834       return std::max(StoresIC, LoadsIC);
5835     }
5836 
5837     // If there are scalar reductions and TTI has enabled aggressive
5838     // interleaving for reductions, we will interleave to expose ILP.
5839     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5840         AggressivelyInterleaveReductions) {
5841       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5842       // Interleave no less than SmallIC but not as aggressive as the normal IC
5843       // to satisfy the rare situation when resources are too limited.
5844       return std::max(IC / 2, SmallIC);
5845     } else {
5846       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5847       return SmallIC;
5848     }
5849   }
5850 
5851   // Interleave if this is a large loop (small loops are already dealt with by
5852   // this point) that could benefit from interleaving.
5853   if (AggressivelyInterleaveReductions) {
5854     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5855     return IC;
5856   }
5857 
5858   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5859   return 1;
5860 }
5861 
5862 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5863 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5864   // This function calculates the register usage by measuring the highest number
5865   // of values that are alive at a single location. Obviously, this is a very
5866   // rough estimation. We scan the loop in a topological order in order and
5867   // assign a number to each instruction. We use RPO to ensure that defs are
5868   // met before their users. We assume that each instruction that has in-loop
5869   // users starts an interval. We record every time that an in-loop value is
5870   // used, so we have a list of the first and last occurrences of each
5871   // instruction. Next, we transpose this data structure into a multi map that
5872   // holds the list of intervals that *end* at a specific location. This multi
5873   // map allows us to perform a linear search. We scan the instructions linearly
5874   // and record each time that a new interval starts, by placing it in a set.
5875   // If we find this value in the multi-map then we remove it from the set.
5876   // The max register usage is the maximum size of the set.
5877   // We also search for instructions that are defined outside the loop, but are
5878   // used inside the loop. We need this number separately from the max-interval
5879   // usage number because when we unroll, loop-invariant values do not take
5880   // more register.
5881   LoopBlocksDFS DFS(TheLoop);
5882   DFS.perform(LI);
5883 
5884   RegisterUsage RU;
5885 
5886   // Each 'key' in the map opens a new interval. The values
5887   // of the map are the index of the 'last seen' usage of the
5888   // instruction that is the key.
5889   using IntervalMap = DenseMap<Instruction *, unsigned>;
5890 
5891   // Maps instruction to its index.
5892   SmallVector<Instruction *, 64> IdxToInstr;
5893   // Marks the end of each interval.
5894   IntervalMap EndPoint;
5895   // Saves the list of instruction indices that are used in the loop.
5896   SmallPtrSet<Instruction *, 8> Ends;
5897   // Saves the list of values that are used in the loop but are
5898   // defined outside the loop, such as arguments and constants.
5899   SmallPtrSet<Value *, 8> LoopInvariants;
5900 
5901   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5902     for (Instruction &I : BB->instructionsWithoutDebug()) {
5903       IdxToInstr.push_back(&I);
5904 
5905       // Save the end location of each USE.
5906       for (Value *U : I.operands()) {
5907         auto *Instr = dyn_cast<Instruction>(U);
5908 
5909         // Ignore non-instruction values such as arguments, constants, etc.
5910         if (!Instr)
5911           continue;
5912 
5913         // If this instruction is outside the loop then record it and continue.
5914         if (!TheLoop->contains(Instr)) {
5915           LoopInvariants.insert(Instr);
5916           continue;
5917         }
5918 
5919         // Overwrite previous end points.
5920         EndPoint[Instr] = IdxToInstr.size();
5921         Ends.insert(Instr);
5922       }
5923     }
5924   }
5925 
5926   // Saves the list of intervals that end with the index in 'key'.
5927   using InstrList = SmallVector<Instruction *, 2>;
5928   DenseMap<unsigned, InstrList> TransposeEnds;
5929 
5930   // Transpose the EndPoints to a list of values that end at each index.
5931   for (auto &Interval : EndPoint)
5932     TransposeEnds[Interval.second].push_back(Interval.first);
5933 
5934   SmallPtrSet<Instruction *, 8> OpenIntervals;
5935   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5936   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5937 
5938   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5939 
5940   const auto &TTICapture = TTI;
5941   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5942     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5943       return 0;
5944     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5945   };
5946 
5947   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5948     Instruction *I = IdxToInstr[i];
5949 
5950     // Remove all of the instructions that end at this location.
5951     InstrList &List = TransposeEnds[i];
5952     for (Instruction *ToRemove : List)
5953       OpenIntervals.erase(ToRemove);
5954 
5955     // Ignore instructions that are never used within the loop.
5956     if (!Ends.count(I))
5957       continue;
5958 
5959     // Skip ignored values.
5960     if (ValuesToIgnore.count(I))
5961       continue;
5962 
5963     // For each VF find the maximum usage of registers.
5964     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5965       // Count the number of live intervals.
5966       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5967 
5968       if (VFs[j].isScalar()) {
5969         for (auto Inst : OpenIntervals) {
5970           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5971           if (RegUsage.find(ClassID) == RegUsage.end())
5972             RegUsage[ClassID] = 1;
5973           else
5974             RegUsage[ClassID] += 1;
5975         }
5976       } else {
5977         collectUniformsAndScalars(VFs[j]);
5978         for (auto Inst : OpenIntervals) {
5979           // Skip ignored values for VF > 1.
5980           if (VecValuesToIgnore.count(Inst))
5981             continue;
5982           if (isScalarAfterVectorization(Inst, VFs[j])) {
5983             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5984             if (RegUsage.find(ClassID) == RegUsage.end())
5985               RegUsage[ClassID] = 1;
5986             else
5987               RegUsage[ClassID] += 1;
5988           } else {
5989             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5990             if (RegUsage.find(ClassID) == RegUsage.end())
5991               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5992             else
5993               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5994           }
5995         }
5996       }
5997 
5998       for (auto& pair : RegUsage) {
5999         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6000           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6001         else
6002           MaxUsages[j][pair.first] = pair.second;
6003       }
6004     }
6005 
6006     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6007                       << OpenIntervals.size() << '\n');
6008 
6009     // Add the current instruction to the list of open intervals.
6010     OpenIntervals.insert(I);
6011   }
6012 
6013   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6014     SmallMapVector<unsigned, unsigned, 4> Invariant;
6015 
6016     for (auto Inst : LoopInvariants) {
6017       unsigned Usage =
6018           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6019       unsigned ClassID =
6020           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6021       if (Invariant.find(ClassID) == Invariant.end())
6022         Invariant[ClassID] = Usage;
6023       else
6024         Invariant[ClassID] += Usage;
6025     }
6026 
6027     LLVM_DEBUG({
6028       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6029       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6030              << " item\n";
6031       for (const auto &pair : MaxUsages[i]) {
6032         dbgs() << "LV(REG): RegisterClass: "
6033                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6034                << " registers\n";
6035       }
6036       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6037              << " item\n";
6038       for (const auto &pair : Invariant) {
6039         dbgs() << "LV(REG): RegisterClass: "
6040                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6041                << " registers\n";
6042       }
6043     });
6044 
6045     RU.LoopInvariantRegs = Invariant;
6046     RU.MaxLocalUsers = MaxUsages[i];
6047     RUs[i] = RU;
6048   }
6049 
6050   return RUs;
6051 }
6052 
6053 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6054                                                            ElementCount VF) {
6055   // TODO: Cost model for emulated masked load/store is completely
6056   // broken. This hack guides the cost model to use an artificially
6057   // high enough value to practically disable vectorization with such
6058   // operations, except where previously deployed legality hack allowed
6059   // using very low cost values. This is to avoid regressions coming simply
6060   // from moving "masked load/store" check from legality to cost model.
6061   // Masked Load/Gather emulation was previously never allowed.
6062   // Limited number of Masked Store/Scatter emulation was allowed.
6063   assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) &&
6064          "Expecting a scalar emulated instruction");
6065   return isa<LoadInst>(I) ||
6066          (isa<StoreInst>(I) &&
6067           NumPredStores > NumberOfStoresToPredicate);
6068 }
6069 
6070 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6071   // If we aren't vectorizing the loop, or if we've already collected the
6072   // instructions to scalarize, there's nothing to do. Collection may already
6073   // have occurred if we have a user-selected VF and are now computing the
6074   // expected cost for interleaving.
6075   if (VF.isScalar() || VF.isZero() ||
6076       InstsToScalarize.find(VF) != InstsToScalarize.end())
6077     return;
6078 
6079   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
6080   // not profitable to scalarize any instructions, the presence of VF in the
6081   // map will indicate that we've analyzed it already.
6082   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6083 
6084   PredicatedBBsAfterVectorization[VF].clear();
6085 
6086   // Find all the instructions that are scalar with predication in the loop and
6087   // determine if it would be better to not if-convert the blocks they are in.
6088   // If so, we also record the instructions to scalarize.
6089   for (BasicBlock *BB : TheLoop->blocks()) {
6090     if (!blockNeedsPredicationForAnyReason(BB))
6091       continue;
6092     for (Instruction &I : *BB)
6093       if (isScalarWithPredication(&I, VF)) {
6094         ScalarCostsTy ScalarCosts;
6095         // Do not apply discount if scalable, because that would lead to
6096         // invalid scalarization costs.
6097         // Do not apply discount logic if hacked cost is needed
6098         // for emulated masked memrefs.
6099         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6100             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6101           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6102         // Remember that BB will remain after vectorization.
6103         PredicatedBBsAfterVectorization[VF].insert(BB);
6104       }
6105   }
6106 }
6107 
6108 int LoopVectorizationCostModel::computePredInstDiscount(
6109     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6110   assert(!isUniformAfterVectorization(PredInst, VF) &&
6111          "Instruction marked uniform-after-vectorization will be predicated");
6112 
6113   // Initialize the discount to zero, meaning that the scalar version and the
6114   // vector version cost the same.
6115   InstructionCost Discount = 0;
6116 
6117   // Holds instructions to analyze. The instructions we visit are mapped in
6118   // ScalarCosts. Those instructions are the ones that would be scalarized if
6119   // we find that the scalar version costs less.
6120   SmallVector<Instruction *, 8> Worklist;
6121 
6122   // Returns true if the given instruction can be scalarized.
6123   auto canBeScalarized = [&](Instruction *I) -> bool {
6124     // We only attempt to scalarize instructions forming a single-use chain
6125     // from the original predicated block that would otherwise be vectorized.
6126     // Although not strictly necessary, we give up on instructions we know will
6127     // already be scalar to avoid traversing chains that are unlikely to be
6128     // beneficial.
6129     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6130         isScalarAfterVectorization(I, VF))
6131       return false;
6132 
6133     // If the instruction is scalar with predication, it will be analyzed
6134     // separately. We ignore it within the context of PredInst.
6135     if (isScalarWithPredication(I, VF))
6136       return false;
6137 
6138     // If any of the instruction's operands are uniform after vectorization,
6139     // the instruction cannot be scalarized. This prevents, for example, a
6140     // masked load from being scalarized.
6141     //
6142     // We assume we will only emit a value for lane zero of an instruction
6143     // marked uniform after vectorization, rather than VF identical values.
6144     // Thus, if we scalarize an instruction that uses a uniform, we would
6145     // create uses of values corresponding to the lanes we aren't emitting code
6146     // for. This behavior can be changed by allowing getScalarValue to clone
6147     // the lane zero values for uniforms rather than asserting.
6148     for (Use &U : I->operands())
6149       if (auto *J = dyn_cast<Instruction>(U.get()))
6150         if (isUniformAfterVectorization(J, VF))
6151           return false;
6152 
6153     // Otherwise, we can scalarize the instruction.
6154     return true;
6155   };
6156 
6157   // Compute the expected cost discount from scalarizing the entire expression
6158   // feeding the predicated instruction. We currently only consider expressions
6159   // that are single-use instruction chains.
6160   Worklist.push_back(PredInst);
6161   while (!Worklist.empty()) {
6162     Instruction *I = Worklist.pop_back_val();
6163 
6164     // If we've already analyzed the instruction, there's nothing to do.
6165     if (ScalarCosts.find(I) != ScalarCosts.end())
6166       continue;
6167 
6168     // Compute the cost of the vector instruction. Note that this cost already
6169     // includes the scalarization overhead of the predicated instruction.
6170     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6171 
6172     // Compute the cost of the scalarized instruction. This cost is the cost of
6173     // the instruction as if it wasn't if-converted and instead remained in the
6174     // predicated block. We will scale this cost by block probability after
6175     // computing the scalarization overhead.
6176     InstructionCost ScalarCost =
6177         VF.getFixedValue() *
6178         getInstructionCost(I, ElementCount::getFixed(1)).first;
6179 
6180     // Compute the scalarization overhead of needed insertelement instructions
6181     // and phi nodes.
6182     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6183       ScalarCost += TTI.getScalarizationOverhead(
6184           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6185           APInt::getAllOnes(VF.getFixedValue()), true, false);
6186       ScalarCost +=
6187           VF.getFixedValue() *
6188           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6189     }
6190 
6191     // Compute the scalarization overhead of needed extractelement
6192     // instructions. For each of the instruction's operands, if the operand can
6193     // be scalarized, add it to the worklist; otherwise, account for the
6194     // overhead.
6195     for (Use &U : I->operands())
6196       if (auto *J = dyn_cast<Instruction>(U.get())) {
6197         assert(VectorType::isValidElementType(J->getType()) &&
6198                "Instruction has non-scalar type");
6199         if (canBeScalarized(J))
6200           Worklist.push_back(J);
6201         else if (needsExtract(J, VF)) {
6202           ScalarCost += TTI.getScalarizationOverhead(
6203               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6204               APInt::getAllOnes(VF.getFixedValue()), false, true);
6205         }
6206       }
6207 
6208     // Scale the total scalar cost by block probability.
6209     ScalarCost /= getReciprocalPredBlockProb();
6210 
6211     // Compute the discount. A non-negative discount means the vector version
6212     // of the instruction costs more, and scalarizing would be beneficial.
6213     Discount += VectorCost - ScalarCost;
6214     ScalarCosts[I] = ScalarCost;
6215   }
6216 
6217   return *Discount.getValue();
6218 }
6219 
6220 LoopVectorizationCostModel::VectorizationCostTy
6221 LoopVectorizationCostModel::expectedCost(
6222     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6223   VectorizationCostTy Cost;
6224 
6225   // For each block.
6226   for (BasicBlock *BB : TheLoop->blocks()) {
6227     VectorizationCostTy BlockCost;
6228 
6229     // For each instruction in the old loop.
6230     for (Instruction &I : BB->instructionsWithoutDebug()) {
6231       // Skip ignored values.
6232       if (ValuesToIgnore.count(&I) ||
6233           (VF.isVector() && VecValuesToIgnore.count(&I)))
6234         continue;
6235 
6236       VectorizationCostTy C = getInstructionCost(&I, VF);
6237 
6238       // Check if we should override the cost.
6239       if (C.first.isValid() &&
6240           ForceTargetInstructionCost.getNumOccurrences() > 0)
6241         C.first = InstructionCost(ForceTargetInstructionCost);
6242 
6243       // Keep a list of instructions with invalid costs.
6244       if (Invalid && !C.first.isValid())
6245         Invalid->emplace_back(&I, VF);
6246 
6247       BlockCost.first += C.first;
6248       BlockCost.second |= C.second;
6249       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6250                         << " for VF " << VF << " For instruction: " << I
6251                         << '\n');
6252     }
6253 
6254     // If we are vectorizing a predicated block, it will have been
6255     // if-converted. This means that the block's instructions (aside from
6256     // stores and instructions that may divide by zero) will now be
6257     // unconditionally executed. For the scalar case, we may not always execute
6258     // the predicated block, if it is an if-else block. Thus, scale the block's
6259     // cost by the probability of executing it. blockNeedsPredication from
6260     // Legal is used so as to not include all blocks in tail folded loops.
6261     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6262       BlockCost.first /= getReciprocalPredBlockProb();
6263 
6264     Cost.first += BlockCost.first;
6265     Cost.second |= BlockCost.second;
6266   }
6267 
6268   return Cost;
6269 }
6270 
6271 /// Gets Address Access SCEV after verifying that the access pattern
6272 /// is loop invariant except the induction variable dependence.
6273 ///
6274 /// This SCEV can be sent to the Target in order to estimate the address
6275 /// calculation cost.
6276 static const SCEV *getAddressAccessSCEV(
6277               Value *Ptr,
6278               LoopVectorizationLegality *Legal,
6279               PredicatedScalarEvolution &PSE,
6280               const Loop *TheLoop) {
6281 
6282   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6283   if (!Gep)
6284     return nullptr;
6285 
6286   // We are looking for a gep with all loop invariant indices except for one
6287   // which should be an induction variable.
6288   auto SE = PSE.getSE();
6289   unsigned NumOperands = Gep->getNumOperands();
6290   for (unsigned i = 1; i < NumOperands; ++i) {
6291     Value *Opd = Gep->getOperand(i);
6292     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6293         !Legal->isInductionVariable(Opd))
6294       return nullptr;
6295   }
6296 
6297   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6298   return PSE.getSCEV(Ptr);
6299 }
6300 
6301 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6302   return Legal->hasStride(I->getOperand(0)) ||
6303          Legal->hasStride(I->getOperand(1));
6304 }
6305 
6306 InstructionCost
6307 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6308                                                         ElementCount VF) {
6309   assert(VF.isVector() &&
6310          "Scalarization cost of instruction implies vectorization.");
6311   if (VF.isScalable())
6312     return InstructionCost::getInvalid();
6313 
6314   Type *ValTy = getLoadStoreType(I);
6315   auto SE = PSE.getSE();
6316 
6317   unsigned AS = getLoadStoreAddressSpace(I);
6318   Value *Ptr = getLoadStorePointerOperand(I);
6319   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6320   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6321   //       that it is being called from this specific place.
6322 
6323   // Figure out whether the access is strided and get the stride value
6324   // if it's known in compile time
6325   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6326 
6327   // Get the cost of the scalar memory instruction and address computation.
6328   InstructionCost Cost =
6329       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6330 
6331   // Don't pass *I here, since it is scalar but will actually be part of a
6332   // vectorized loop where the user of it is a vectorized instruction.
6333   const Align Alignment = getLoadStoreAlignment(I);
6334   Cost += VF.getKnownMinValue() *
6335           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6336                               AS, TTI::TCK_RecipThroughput);
6337 
6338   // Get the overhead of the extractelement and insertelement instructions
6339   // we might create due to scalarization.
6340   Cost += getScalarizationOverhead(I, VF);
6341 
6342   // If we have a predicated load/store, it will need extra i1 extracts and
6343   // conditional branches, but may not be executed for each vector lane. Scale
6344   // the cost by the probability of executing the predicated block.
6345   if (isPredicatedInst(I, VF)) {
6346     Cost /= getReciprocalPredBlockProb();
6347 
6348     // Add the cost of an i1 extract and a branch
6349     auto *Vec_i1Ty =
6350         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6351     Cost += TTI.getScalarizationOverhead(
6352         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6353         /*Insert=*/false, /*Extract=*/true);
6354     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6355 
6356     if (useEmulatedMaskMemRefHack(I, VF))
6357       // Artificially setting to a high enough value to practically disable
6358       // vectorization with such operations.
6359       Cost = 3000000;
6360   }
6361 
6362   return Cost;
6363 }
6364 
6365 InstructionCost
6366 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6367                                                     ElementCount VF) {
6368   Type *ValTy = getLoadStoreType(I);
6369   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6370   Value *Ptr = getLoadStorePointerOperand(I);
6371   unsigned AS = getLoadStoreAddressSpace(I);
6372   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6373   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6374 
6375   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6376          "Stride should be 1 or -1 for consecutive memory access");
6377   const Align Alignment = getLoadStoreAlignment(I);
6378   InstructionCost Cost = 0;
6379   if (Legal->isMaskRequired(I))
6380     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6381                                       CostKind);
6382   else
6383     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6384                                 CostKind, I);
6385 
6386   bool Reverse = ConsecutiveStride < 0;
6387   if (Reverse)
6388     Cost +=
6389         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6390   return Cost;
6391 }
6392 
6393 InstructionCost
6394 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6395                                                 ElementCount VF) {
6396   assert(Legal->isUniformMemOp(*I));
6397 
6398   Type *ValTy = getLoadStoreType(I);
6399   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6400   const Align Alignment = getLoadStoreAlignment(I);
6401   unsigned AS = getLoadStoreAddressSpace(I);
6402   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6403   if (isa<LoadInst>(I)) {
6404     return TTI.getAddressComputationCost(ValTy) +
6405            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6406                                CostKind) +
6407            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6408   }
6409   StoreInst *SI = cast<StoreInst>(I);
6410 
6411   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6412   return TTI.getAddressComputationCost(ValTy) +
6413          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6414                              CostKind) +
6415          (isLoopInvariantStoreValue
6416               ? 0
6417               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6418                                        VF.getKnownMinValue() - 1));
6419 }
6420 
6421 InstructionCost
6422 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6423                                                  ElementCount VF) {
6424   Type *ValTy = getLoadStoreType(I);
6425   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6426   const Align Alignment = getLoadStoreAlignment(I);
6427   const Value *Ptr = getLoadStorePointerOperand(I);
6428 
6429   return TTI.getAddressComputationCost(VectorTy) +
6430          TTI.getGatherScatterOpCost(
6431              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6432              TargetTransformInfo::TCK_RecipThroughput, I);
6433 }
6434 
6435 InstructionCost
6436 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6437                                                    ElementCount VF) {
6438   // TODO: Once we have support for interleaving with scalable vectors
6439   // we can calculate the cost properly here.
6440   if (VF.isScalable())
6441     return InstructionCost::getInvalid();
6442 
6443   Type *ValTy = getLoadStoreType(I);
6444   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6445   unsigned AS = getLoadStoreAddressSpace(I);
6446 
6447   auto Group = getInterleavedAccessGroup(I);
6448   assert(Group && "Fail to get an interleaved access group.");
6449 
6450   unsigned InterleaveFactor = Group->getFactor();
6451   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6452 
6453   // Holds the indices of existing members in the interleaved group.
6454   SmallVector<unsigned, 4> Indices;
6455   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6456     if (Group->getMember(IF))
6457       Indices.push_back(IF);
6458 
6459   // Calculate the cost of the whole interleaved group.
6460   bool UseMaskForGaps =
6461       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6462       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6463   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6464       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6465       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6466 
6467   if (Group->isReverse()) {
6468     // TODO: Add support for reversed masked interleaved access.
6469     assert(!Legal->isMaskRequired(I) &&
6470            "Reverse masked interleaved access not supported.");
6471     Cost +=
6472         Group->getNumMembers() *
6473         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6474   }
6475   return Cost;
6476 }
6477 
6478 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6479     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6480   using namespace llvm::PatternMatch;
6481   // Early exit for no inloop reductions
6482   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6483     return None;
6484   auto *VectorTy = cast<VectorType>(Ty);
6485 
6486   // We are looking for a pattern of, and finding the minimal acceptable cost:
6487   //  reduce(mul(ext(A), ext(B))) or
6488   //  reduce(mul(A, B)) or
6489   //  reduce(ext(A)) or
6490   //  reduce(A).
6491   // The basic idea is that we walk down the tree to do that, finding the root
6492   // reduction instruction in InLoopReductionImmediateChains. From there we find
6493   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6494   // of the components. If the reduction cost is lower then we return it for the
6495   // reduction instruction and 0 for the other instructions in the pattern. If
6496   // it is not we return an invalid cost specifying the orignal cost method
6497   // should be used.
6498   Instruction *RetI = I;
6499   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6500     if (!RetI->hasOneUser())
6501       return None;
6502     RetI = RetI->user_back();
6503   }
6504   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6505       RetI->user_back()->getOpcode() == Instruction::Add) {
6506     if (!RetI->hasOneUser())
6507       return None;
6508     RetI = RetI->user_back();
6509   }
6510 
6511   // Test if the found instruction is a reduction, and if not return an invalid
6512   // cost specifying the parent to use the original cost modelling.
6513   if (!InLoopReductionImmediateChains.count(RetI))
6514     return None;
6515 
6516   // Find the reduction this chain is a part of and calculate the basic cost of
6517   // the reduction on its own.
6518   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6519   Instruction *ReductionPhi = LastChain;
6520   while (!isa<PHINode>(ReductionPhi))
6521     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6522 
6523   const RecurrenceDescriptor &RdxDesc =
6524       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6525 
6526   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6527       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6528 
6529   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6530   // normal fmul instruction to the cost of the fadd reduction.
6531   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6532     BaseCost +=
6533         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6534 
6535   // If we're using ordered reductions then we can just return the base cost
6536   // here, since getArithmeticReductionCost calculates the full ordered
6537   // reduction cost when FP reassociation is not allowed.
6538   if (useOrderedReductions(RdxDesc))
6539     return BaseCost;
6540 
6541   // Get the operand that was not the reduction chain and match it to one of the
6542   // patterns, returning the better cost if it is found.
6543   Instruction *RedOp = RetI->getOperand(1) == LastChain
6544                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6545                            : dyn_cast<Instruction>(RetI->getOperand(1));
6546 
6547   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6548 
6549   Instruction *Op0, *Op1;
6550   if (RedOp &&
6551       match(RedOp,
6552             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6553       match(Op0, m_ZExtOrSExt(m_Value())) &&
6554       Op0->getOpcode() == Op1->getOpcode() &&
6555       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6556       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6557       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6558 
6559     // Matched reduce(ext(mul(ext(A), ext(B)))
6560     // Note that the extend opcodes need to all match, or if A==B they will have
6561     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6562     // which is equally fine.
6563     bool IsUnsigned = isa<ZExtInst>(Op0);
6564     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6565     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6566 
6567     InstructionCost ExtCost =
6568         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6569                              TTI::CastContextHint::None, CostKind, Op0);
6570     InstructionCost MulCost =
6571         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6572     InstructionCost Ext2Cost =
6573         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6574                              TTI::CastContextHint::None, CostKind, RedOp);
6575 
6576     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6577         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6578         CostKind);
6579 
6580     if (RedCost.isValid() &&
6581         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6582       return I == RetI ? RedCost : 0;
6583   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6584              !TheLoop->isLoopInvariant(RedOp)) {
6585     // Matched reduce(ext(A))
6586     bool IsUnsigned = isa<ZExtInst>(RedOp);
6587     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6588     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6589         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6590         CostKind);
6591 
6592     InstructionCost ExtCost =
6593         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6594                              TTI::CastContextHint::None, CostKind, RedOp);
6595     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6596       return I == RetI ? RedCost : 0;
6597   } else if (RedOp &&
6598              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6599     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6600         Op0->getOpcode() == Op1->getOpcode() &&
6601         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6602       bool IsUnsigned = isa<ZExtInst>(Op0);
6603       Type *Op0Ty = Op0->getOperand(0)->getType();
6604       Type *Op1Ty = Op1->getOperand(0)->getType();
6605       Type *LargestOpTy =
6606           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6607                                                                     : Op0Ty;
6608       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6609 
6610       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
6611       // different sizes. We take the largest type as the ext to reduce, and add
6612       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6613       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6614           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6615           TTI::CastContextHint::None, CostKind, Op0);
6616       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6617           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6618           TTI::CastContextHint::None, CostKind, Op1);
6619       InstructionCost MulCost =
6620           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6621 
6622       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6623           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6624           CostKind);
6625       InstructionCost ExtraExtCost = 0;
6626       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6627         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6628         ExtraExtCost = TTI.getCastInstrCost(
6629             ExtraExtOp->getOpcode(), ExtType,
6630             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6631             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6632       }
6633 
6634       if (RedCost.isValid() &&
6635           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6636         return I == RetI ? RedCost : 0;
6637     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6638       // Matched reduce(mul())
6639       InstructionCost MulCost =
6640           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6641 
6642       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6643           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6644           CostKind);
6645 
6646       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6647         return I == RetI ? RedCost : 0;
6648     }
6649   }
6650 
6651   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
6652 }
6653 
6654 InstructionCost
6655 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6656                                                      ElementCount VF) {
6657   // Calculate scalar cost only. Vectorization cost should be ready at this
6658   // moment.
6659   if (VF.isScalar()) {
6660     Type *ValTy = getLoadStoreType(I);
6661     const Align Alignment = getLoadStoreAlignment(I);
6662     unsigned AS = getLoadStoreAddressSpace(I);
6663 
6664     return TTI.getAddressComputationCost(ValTy) +
6665            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6666                                TTI::TCK_RecipThroughput, I);
6667   }
6668   return getWideningCost(I, VF);
6669 }
6670 
6671 LoopVectorizationCostModel::VectorizationCostTy
6672 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6673                                                ElementCount VF) {
6674   // If we know that this instruction will remain uniform, check the cost of
6675   // the scalar version.
6676   if (isUniformAfterVectorization(I, VF))
6677     VF = ElementCount::getFixed(1);
6678 
6679   if (VF.isVector() && isProfitableToScalarize(I, VF))
6680     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6681 
6682   // Forced scalars do not have any scalarization overhead.
6683   auto ForcedScalar = ForcedScalars.find(VF);
6684   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6685     auto InstSet = ForcedScalar->second;
6686     if (InstSet.count(I))
6687       return VectorizationCostTy(
6688           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6689            VF.getKnownMinValue()),
6690           false);
6691   }
6692 
6693   Type *VectorTy;
6694   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6695 
6696   bool TypeNotScalarized = false;
6697   if (VF.isVector() && VectorTy->isVectorTy()) {
6698     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6699       if (VF.isScalable())
6700         // <vscale x 1 x iN> is assumed to be profitable over iN because
6701         // scalable registers are a distinct register class from scalar ones.
6702         // If we ever find a target which wants to lower scalable vectors
6703         // back to scalars, we'll need to update this code to explicitly
6704         // ask TTI about the register class uses for each part.
6705         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6706       else
6707         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6708     } else
6709       C = InstructionCost::getInvalid();
6710   }
6711   return VectorizationCostTy(C, TypeNotScalarized);
6712 }
6713 
6714 InstructionCost
6715 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6716                                                      ElementCount VF) const {
6717 
6718   // There is no mechanism yet to create a scalable scalarization loop,
6719   // so this is currently Invalid.
6720   if (VF.isScalable())
6721     return InstructionCost::getInvalid();
6722 
6723   if (VF.isScalar())
6724     return 0;
6725 
6726   InstructionCost Cost = 0;
6727   Type *RetTy = ToVectorTy(I->getType(), VF);
6728   if (!RetTy->isVoidTy() &&
6729       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6730     Cost += TTI.getScalarizationOverhead(
6731         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6732         false);
6733 
6734   // Some targets keep addresses scalar.
6735   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6736     return Cost;
6737 
6738   // Some targets support efficient element stores.
6739   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6740     return Cost;
6741 
6742   // Collect operands to consider.
6743   CallInst *CI = dyn_cast<CallInst>(I);
6744   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6745 
6746   // Skip operands that do not require extraction/scalarization and do not incur
6747   // any overhead.
6748   SmallVector<Type *> Tys;
6749   for (auto *V : filterExtractingOperands(Ops, VF))
6750     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6751   return Cost + TTI.getOperandsScalarizationOverhead(
6752                     filterExtractingOperands(Ops, VF), Tys);
6753 }
6754 
6755 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6756   if (VF.isScalar())
6757     return;
6758   NumPredStores = 0;
6759   for (BasicBlock *BB : TheLoop->blocks()) {
6760     // For each instruction in the old loop.
6761     for (Instruction &I : *BB) {
6762       Value *Ptr =  getLoadStorePointerOperand(&I);
6763       if (!Ptr)
6764         continue;
6765 
6766       // TODO: We should generate better code and update the cost model for
6767       // predicated uniform stores. Today they are treated as any other
6768       // predicated store (see added test cases in
6769       // invariant-store-vectorization.ll).
6770       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6771         NumPredStores++;
6772 
6773       if (Legal->isUniformMemOp(I)) {
6774         // Lowering story for uniform memory ops is currently a bit complicated.
6775         // Scalarization works for everything which isn't a store with scalable
6776         // VF.  Fixed len VFs just scalarize and then DCE later; scalarization
6777         // knows how to handle uniform-per-part values (i.e. the first lane
6778         // in each unrolled VF) and can thus handle scalable loads too.  For
6779         // scalable stores, we use a scatter if legal.  If not, we have no way
6780         // to lower (currently) and thus have to abort vectorization.
6781         if (isa<StoreInst>(&I) && VF.isScalable()) {
6782           if (isLegalGatherOrScatter(&I, VF))
6783             setWideningDecision(&I, VF, CM_GatherScatter,
6784                                 getGatherScatterCost(&I, VF));
6785           else
6786             // Error case, abort vectorization
6787             setWideningDecision(&I, VF, CM_Scalarize,
6788                                 InstructionCost::getInvalid());
6789           continue;
6790         }
6791         // Load: Scalar load + broadcast
6792         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6793         // TODO: Avoid replicating loads and stores instead of relying on
6794         // instcombine to remove them.
6795         setWideningDecision(&I, VF, CM_Scalarize,
6796                             getUniformMemOpCost(&I, VF));
6797         continue;
6798       }
6799 
6800       // We assume that widening is the best solution when possible.
6801       if (memoryInstructionCanBeWidened(&I, VF)) {
6802         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6803         int ConsecutiveStride = Legal->isConsecutivePtr(
6804             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6805         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6806                "Expected consecutive stride.");
6807         InstWidening Decision =
6808             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6809         setWideningDecision(&I, VF, Decision, Cost);
6810         continue;
6811       }
6812 
6813       // Choose between Interleaving, Gather/Scatter or Scalarization.
6814       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6815       unsigned NumAccesses = 1;
6816       if (isAccessInterleaved(&I)) {
6817         auto Group = getInterleavedAccessGroup(&I);
6818         assert(Group && "Fail to get an interleaved access group.");
6819 
6820         // Make one decision for the whole group.
6821         if (getWideningDecision(&I, VF) != CM_Unknown)
6822           continue;
6823 
6824         NumAccesses = Group->getNumMembers();
6825         if (interleavedAccessCanBeWidened(&I, VF))
6826           InterleaveCost = getInterleaveGroupCost(&I, VF);
6827       }
6828 
6829       InstructionCost GatherScatterCost =
6830           isLegalGatherOrScatter(&I, VF)
6831               ? getGatherScatterCost(&I, VF) * NumAccesses
6832               : InstructionCost::getInvalid();
6833 
6834       InstructionCost ScalarizationCost =
6835           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6836 
6837       // Choose better solution for the current VF,
6838       // write down this decision and use it during vectorization.
6839       InstructionCost Cost;
6840       InstWidening Decision;
6841       if (InterleaveCost <= GatherScatterCost &&
6842           InterleaveCost < ScalarizationCost) {
6843         Decision = CM_Interleave;
6844         Cost = InterleaveCost;
6845       } else if (GatherScatterCost < ScalarizationCost) {
6846         Decision = CM_GatherScatter;
6847         Cost = GatherScatterCost;
6848       } else {
6849         Decision = CM_Scalarize;
6850         Cost = ScalarizationCost;
6851       }
6852       // If the instructions belongs to an interleave group, the whole group
6853       // receives the same decision. The whole group receives the cost, but
6854       // the cost will actually be assigned to one instruction.
6855       if (auto Group = getInterleavedAccessGroup(&I))
6856         setWideningDecision(Group, VF, Decision, Cost);
6857       else
6858         setWideningDecision(&I, VF, Decision, Cost);
6859     }
6860   }
6861 
6862   // Make sure that any load of address and any other address computation
6863   // remains scalar unless there is gather/scatter support. This avoids
6864   // inevitable extracts into address registers, and also has the benefit of
6865   // activating LSR more, since that pass can't optimize vectorized
6866   // addresses.
6867   if (TTI.prefersVectorizedAddressing())
6868     return;
6869 
6870   // Start with all scalar pointer uses.
6871   SmallPtrSet<Instruction *, 8> AddrDefs;
6872   for (BasicBlock *BB : TheLoop->blocks())
6873     for (Instruction &I : *BB) {
6874       Instruction *PtrDef =
6875         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6876       if (PtrDef && TheLoop->contains(PtrDef) &&
6877           getWideningDecision(&I, VF) != CM_GatherScatter)
6878         AddrDefs.insert(PtrDef);
6879     }
6880 
6881   // Add all instructions used to generate the addresses.
6882   SmallVector<Instruction *, 4> Worklist;
6883   append_range(Worklist, AddrDefs);
6884   while (!Worklist.empty()) {
6885     Instruction *I = Worklist.pop_back_val();
6886     for (auto &Op : I->operands())
6887       if (auto *InstOp = dyn_cast<Instruction>(Op))
6888         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6889             AddrDefs.insert(InstOp).second)
6890           Worklist.push_back(InstOp);
6891   }
6892 
6893   for (auto *I : AddrDefs) {
6894     if (isa<LoadInst>(I)) {
6895       // Setting the desired widening decision should ideally be handled in
6896       // by cost functions, but since this involves the task of finding out
6897       // if the loaded register is involved in an address computation, it is
6898       // instead changed here when we know this is the case.
6899       InstWidening Decision = getWideningDecision(I, VF);
6900       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6901         // Scalarize a widened load of address.
6902         setWideningDecision(
6903             I, VF, CM_Scalarize,
6904             (VF.getKnownMinValue() *
6905              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6906       else if (auto Group = getInterleavedAccessGroup(I)) {
6907         // Scalarize an interleave group of address loads.
6908         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6909           if (Instruction *Member = Group->getMember(I))
6910             setWideningDecision(
6911                 Member, VF, CM_Scalarize,
6912                 (VF.getKnownMinValue() *
6913                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6914         }
6915       }
6916     } else
6917       // Make sure I gets scalarized and a cost estimate without
6918       // scalarization overhead.
6919       ForcedScalars[VF].insert(I);
6920   }
6921 }
6922 
6923 InstructionCost
6924 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6925                                                Type *&VectorTy) {
6926   Type *RetTy = I->getType();
6927   if (canTruncateToMinimalBitwidth(I, VF))
6928     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6929   auto SE = PSE.getSE();
6930   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6931 
6932   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6933                                                 ElementCount VF) -> bool {
6934     if (VF.isScalar())
6935       return true;
6936 
6937     auto Scalarized = InstsToScalarize.find(VF);
6938     assert(Scalarized != InstsToScalarize.end() &&
6939            "VF not yet analyzed for scalarization profitability");
6940     return !Scalarized->second.count(I) &&
6941            llvm::all_of(I->users(), [&](User *U) {
6942              auto *UI = cast<Instruction>(U);
6943              return !Scalarized->second.count(UI);
6944            });
6945   };
6946   (void) hasSingleCopyAfterVectorization;
6947 
6948   if (isScalarAfterVectorization(I, VF)) {
6949     // With the exception of GEPs and PHIs, after scalarization there should
6950     // only be one copy of the instruction generated in the loop. This is
6951     // because the VF is either 1, or any instructions that need scalarizing
6952     // have already been dealt with by the the time we get here. As a result,
6953     // it means we don't have to multiply the instruction cost by VF.
6954     assert(I->getOpcode() == Instruction::GetElementPtr ||
6955            I->getOpcode() == Instruction::PHI ||
6956            (I->getOpcode() == Instruction::BitCast &&
6957             I->getType()->isPointerTy()) ||
6958            hasSingleCopyAfterVectorization(I, VF));
6959     VectorTy = RetTy;
6960   } else
6961     VectorTy = ToVectorTy(RetTy, VF);
6962 
6963   // TODO: We need to estimate the cost of intrinsic calls.
6964   switch (I->getOpcode()) {
6965   case Instruction::GetElementPtr:
6966     // We mark this instruction as zero-cost because the cost of GEPs in
6967     // vectorized code depends on whether the corresponding memory instruction
6968     // is scalarized or not. Therefore, we handle GEPs with the memory
6969     // instruction cost.
6970     return 0;
6971   case Instruction::Br: {
6972     // In cases of scalarized and predicated instructions, there will be VF
6973     // predicated blocks in the vectorized loop. Each branch around these
6974     // blocks requires also an extract of its vector compare i1 element.
6975     bool ScalarPredicatedBB = false;
6976     BranchInst *BI = cast<BranchInst>(I);
6977     if (VF.isVector() && BI->isConditional() &&
6978         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6979          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6980       ScalarPredicatedBB = true;
6981 
6982     if (ScalarPredicatedBB) {
6983       // Not possible to scalarize scalable vector with predicated instructions.
6984       if (VF.isScalable())
6985         return InstructionCost::getInvalid();
6986       // Return cost for branches around scalarized and predicated blocks.
6987       auto *Vec_i1Ty =
6988           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6989       return (
6990           TTI.getScalarizationOverhead(
6991               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
6992           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6993     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6994       // The back-edge branch will remain, as will all scalar branches.
6995       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6996     else
6997       // This branch will be eliminated by if-conversion.
6998       return 0;
6999     // Note: We currently assume zero cost for an unconditional branch inside
7000     // a predicated block since it will become a fall-through, although we
7001     // may decide in the future to call TTI for all branches.
7002   }
7003   case Instruction::PHI: {
7004     auto *Phi = cast<PHINode>(I);
7005 
7006     // First-order recurrences are replaced by vector shuffles inside the loop.
7007     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7008     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7009       return TTI.getShuffleCost(
7010           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7011           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7012 
7013     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7014     // converted into select instructions. We require N - 1 selects per phi
7015     // node, where N is the number of incoming values.
7016     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7017       return (Phi->getNumIncomingValues() - 1) *
7018              TTI.getCmpSelInstrCost(
7019                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7020                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7021                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7022 
7023     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7024   }
7025   case Instruction::UDiv:
7026   case Instruction::SDiv:
7027   case Instruction::URem:
7028   case Instruction::SRem:
7029     // If we have a predicated instruction, it may not be executed for each
7030     // vector lane. Get the scalarization cost and scale this amount by the
7031     // probability of executing the predicated block. If the instruction is not
7032     // predicated, we fall through to the next case.
7033     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7034       InstructionCost Cost = 0;
7035 
7036       // These instructions have a non-void type, so account for the phi nodes
7037       // that we will create. This cost is likely to be zero. The phi node
7038       // cost, if any, should be scaled by the block probability because it
7039       // models a copy at the end of each predicated block.
7040       Cost += VF.getKnownMinValue() *
7041               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7042 
7043       // The cost of the non-predicated instruction.
7044       Cost += VF.getKnownMinValue() *
7045               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7046 
7047       // The cost of insertelement and extractelement instructions needed for
7048       // scalarization.
7049       Cost += getScalarizationOverhead(I, VF);
7050 
7051       // Scale the cost by the probability of executing the predicated blocks.
7052       // This assumes the predicated block for each vector lane is equally
7053       // likely.
7054       return Cost / getReciprocalPredBlockProb();
7055     }
7056     LLVM_FALLTHROUGH;
7057   case Instruction::Add:
7058   case Instruction::FAdd:
7059   case Instruction::Sub:
7060   case Instruction::FSub:
7061   case Instruction::Mul:
7062   case Instruction::FMul:
7063   case Instruction::FDiv:
7064   case Instruction::FRem:
7065   case Instruction::Shl:
7066   case Instruction::LShr:
7067   case Instruction::AShr:
7068   case Instruction::And:
7069   case Instruction::Or:
7070   case Instruction::Xor: {
7071     // Since we will replace the stride by 1 the multiplication should go away.
7072     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7073       return 0;
7074 
7075     // Detect reduction patterns
7076     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7077       return *RedCost;
7078 
7079     // Certain instructions can be cheaper to vectorize if they have a constant
7080     // second vector operand. One example of this are shifts on x86.
7081     Value *Op2 = I->getOperand(1);
7082     TargetTransformInfo::OperandValueProperties Op2VP;
7083     TargetTransformInfo::OperandValueKind Op2VK =
7084         TTI.getOperandInfo(Op2, Op2VP);
7085     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7086       Op2VK = TargetTransformInfo::OK_UniformValue;
7087 
7088     SmallVector<const Value *, 4> Operands(I->operand_values());
7089     return TTI.getArithmeticInstrCost(
7090         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7091         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7092   }
7093   case Instruction::FNeg: {
7094     return TTI.getArithmeticInstrCost(
7095         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7096         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7097         TargetTransformInfo::OP_None, I->getOperand(0), I);
7098   }
7099   case Instruction::Select: {
7100     SelectInst *SI = cast<SelectInst>(I);
7101     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7102     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7103 
7104     const Value *Op0, *Op1;
7105     using namespace llvm::PatternMatch;
7106     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7107                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7108       // select x, y, false --> x & y
7109       // select x, true, y --> x | y
7110       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7111       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7112       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7113       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7114       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7115               Op1->getType()->getScalarSizeInBits() == 1);
7116 
7117       SmallVector<const Value *, 2> Operands{Op0, Op1};
7118       return TTI.getArithmeticInstrCost(
7119           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7120           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7121     }
7122 
7123     Type *CondTy = SI->getCondition()->getType();
7124     if (!ScalarCond)
7125       CondTy = VectorType::get(CondTy, VF);
7126 
7127     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7128     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7129       Pred = Cmp->getPredicate();
7130     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7131                                   CostKind, I);
7132   }
7133   case Instruction::ICmp:
7134   case Instruction::FCmp: {
7135     Type *ValTy = I->getOperand(0)->getType();
7136     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7137     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7138       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7139     VectorTy = ToVectorTy(ValTy, VF);
7140     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7141                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7142                                   I);
7143   }
7144   case Instruction::Store:
7145   case Instruction::Load: {
7146     ElementCount Width = VF;
7147     if (Width.isVector()) {
7148       InstWidening Decision = getWideningDecision(I, Width);
7149       assert(Decision != CM_Unknown &&
7150              "CM decision should be taken at this point");
7151       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7152         return InstructionCost::getInvalid();
7153       if (Decision == CM_Scalarize)
7154         Width = ElementCount::getFixed(1);
7155     }
7156     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7157     return getMemoryInstructionCost(I, VF);
7158   }
7159   case Instruction::BitCast:
7160     if (I->getType()->isPointerTy())
7161       return 0;
7162     LLVM_FALLTHROUGH;
7163   case Instruction::ZExt:
7164   case Instruction::SExt:
7165   case Instruction::FPToUI:
7166   case Instruction::FPToSI:
7167   case Instruction::FPExt:
7168   case Instruction::PtrToInt:
7169   case Instruction::IntToPtr:
7170   case Instruction::SIToFP:
7171   case Instruction::UIToFP:
7172   case Instruction::Trunc:
7173   case Instruction::FPTrunc: {
7174     // Computes the CastContextHint from a Load/Store instruction.
7175     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7176       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7177              "Expected a load or a store!");
7178 
7179       if (VF.isScalar() || !TheLoop->contains(I))
7180         return TTI::CastContextHint::Normal;
7181 
7182       switch (getWideningDecision(I, VF)) {
7183       case LoopVectorizationCostModel::CM_GatherScatter:
7184         return TTI::CastContextHint::GatherScatter;
7185       case LoopVectorizationCostModel::CM_Interleave:
7186         return TTI::CastContextHint::Interleave;
7187       case LoopVectorizationCostModel::CM_Scalarize:
7188       case LoopVectorizationCostModel::CM_Widen:
7189         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7190                                         : TTI::CastContextHint::Normal;
7191       case LoopVectorizationCostModel::CM_Widen_Reverse:
7192         return TTI::CastContextHint::Reversed;
7193       case LoopVectorizationCostModel::CM_Unknown:
7194         llvm_unreachable("Instr did not go through cost modelling?");
7195       }
7196 
7197       llvm_unreachable("Unhandled case!");
7198     };
7199 
7200     unsigned Opcode = I->getOpcode();
7201     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7202     // For Trunc, the context is the only user, which must be a StoreInst.
7203     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7204       if (I->hasOneUse())
7205         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7206           CCH = ComputeCCH(Store);
7207     }
7208     // For Z/Sext, the context is the operand, which must be a LoadInst.
7209     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7210              Opcode == Instruction::FPExt) {
7211       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7212         CCH = ComputeCCH(Load);
7213     }
7214 
7215     // We optimize the truncation of induction variables having constant
7216     // integer steps. The cost of these truncations is the same as the scalar
7217     // operation.
7218     if (isOptimizableIVTruncate(I, VF)) {
7219       auto *Trunc = cast<TruncInst>(I);
7220       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7221                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7222     }
7223 
7224     // Detect reduction patterns
7225     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7226       return *RedCost;
7227 
7228     Type *SrcScalarTy = I->getOperand(0)->getType();
7229     Type *SrcVecTy =
7230         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7231     if (canTruncateToMinimalBitwidth(I, VF)) {
7232       // This cast is going to be shrunk. This may remove the cast or it might
7233       // turn it into slightly different cast. For example, if MinBW == 16,
7234       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7235       //
7236       // Calculate the modified src and dest types.
7237       Type *MinVecTy = VectorTy;
7238       if (Opcode == Instruction::Trunc) {
7239         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7240         VectorTy =
7241             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7242       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7243         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7244         VectorTy =
7245             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7246       }
7247     }
7248 
7249     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7250   }
7251   case Instruction::Call: {
7252     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7253       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7254         return *RedCost;
7255     bool NeedToScalarize;
7256     CallInst *CI = cast<CallInst>(I);
7257     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7258     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7259       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7260       return std::min(CallCost, IntrinsicCost);
7261     }
7262     return CallCost;
7263   }
7264   case Instruction::ExtractValue:
7265     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7266   case Instruction::Alloca:
7267     // We cannot easily widen alloca to a scalable alloca, as
7268     // the result would need to be a vector of pointers.
7269     if (VF.isScalable())
7270       return InstructionCost::getInvalid();
7271     LLVM_FALLTHROUGH;
7272   default:
7273     // This opcode is unknown. Assume that it is the same as 'mul'.
7274     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7275   } // end of switch.
7276 }
7277 
7278 char LoopVectorize::ID = 0;
7279 
7280 static const char lv_name[] = "Loop Vectorization";
7281 
7282 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7283 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7284 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7285 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7286 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7287 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7288 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7289 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7290 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7291 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7292 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7293 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7294 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7295 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7296 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7297 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7298 
7299 namespace llvm {
7300 
7301 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7302 
7303 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7304                               bool VectorizeOnlyWhenForced) {
7305   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7306 }
7307 
7308 } // end namespace llvm
7309 
7310 void LoopVectorizationCostModel::collectValuesToIgnore() {
7311   // Ignore ephemeral values.
7312   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7313 
7314   // Find all stores to invariant variables. Since they are going to sink
7315   // outside the loop we do not need calculate cost for them.
7316   for (BasicBlock *BB : TheLoop->blocks())
7317     for (Instruction &I : *BB) {
7318       StoreInst *SI;
7319       if ((SI = dyn_cast<StoreInst>(&I)) &&
7320           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7321         ValuesToIgnore.insert(&I);
7322     }
7323 
7324   // Ignore type-promoting instructions we identified during reduction
7325   // detection.
7326   for (auto &Reduction : Legal->getReductionVars()) {
7327     const RecurrenceDescriptor &RedDes = Reduction.second;
7328     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7329     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7330   }
7331   // Ignore type-casting instructions we identified during induction
7332   // detection.
7333   for (auto &Induction : Legal->getInductionVars()) {
7334     const InductionDescriptor &IndDes = Induction.second;
7335     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7336     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7337   }
7338 }
7339 
7340 void LoopVectorizationCostModel::collectInLoopReductions() {
7341   for (auto &Reduction : Legal->getReductionVars()) {
7342     PHINode *Phi = Reduction.first;
7343     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7344 
7345     // We don't collect reductions that are type promoted (yet).
7346     if (RdxDesc.getRecurrenceType() != Phi->getType())
7347       continue;
7348 
7349     // If the target would prefer this reduction to happen "in-loop", then we
7350     // want to record it as such.
7351     unsigned Opcode = RdxDesc.getOpcode();
7352     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7353         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7354                                    TargetTransformInfo::ReductionFlags()))
7355       continue;
7356 
7357     // Check that we can correctly put the reductions into the loop, by
7358     // finding the chain of operations that leads from the phi to the loop
7359     // exit value.
7360     SmallVector<Instruction *, 4> ReductionOperations =
7361         RdxDesc.getReductionOpChain(Phi, TheLoop);
7362     bool InLoop = !ReductionOperations.empty();
7363     if (InLoop) {
7364       InLoopReductionChains[Phi] = ReductionOperations;
7365       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7366       Instruction *LastChain = Phi;
7367       for (auto *I : ReductionOperations) {
7368         InLoopReductionImmediateChains[I] = LastChain;
7369         LastChain = I;
7370       }
7371     }
7372     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7373                       << " reduction for phi: " << *Phi << "\n");
7374   }
7375 }
7376 
7377 // TODO: we could return a pair of values that specify the max VF and
7378 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7379 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7380 // doesn't have a cost model that can choose which plan to execute if
7381 // more than one is generated.
7382 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7383                                  LoopVectorizationCostModel &CM) {
7384   unsigned WidestType;
7385   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7386   return WidestVectorRegBits / WidestType;
7387 }
7388 
7389 VectorizationFactor
7390 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7391   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7392   ElementCount VF = UserVF;
7393   // Outer loop handling: They may require CFG and instruction level
7394   // transformations before even evaluating whether vectorization is profitable.
7395   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7396   // the vectorization pipeline.
7397   if (!OrigLoop->isInnermost()) {
7398     // If the user doesn't provide a vectorization factor, determine a
7399     // reasonable one.
7400     if (UserVF.isZero()) {
7401       VF = ElementCount::getFixed(determineVPlanVF(
7402           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7403               .getFixedSize(),
7404           CM));
7405       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7406 
7407       // Make sure we have a VF > 1 for stress testing.
7408       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7409         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7410                           << "overriding computed VF.\n");
7411         VF = ElementCount::getFixed(4);
7412       }
7413     }
7414     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7415     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7416            "VF needs to be a power of two");
7417     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7418                       << "VF " << VF << " to build VPlans.\n");
7419     buildVPlans(VF, VF);
7420 
7421     // For VPlan build stress testing, we bail out after VPlan construction.
7422     if (VPlanBuildStressTest)
7423       return VectorizationFactor::Disabled();
7424 
7425     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7426   }
7427 
7428   LLVM_DEBUG(
7429       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7430                 "VPlan-native path.\n");
7431   return VectorizationFactor::Disabled();
7432 }
7433 
7434 Optional<VectorizationFactor>
7435 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7436   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7437   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7438   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
7439     return None;
7440 
7441   // Invalidate interleave groups if all blocks of loop will be predicated.
7442   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7443       !useMaskedInterleavedAccesses(*TTI)) {
7444     LLVM_DEBUG(
7445         dbgs()
7446         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7447            "which requires masked-interleaved support.\n");
7448     if (CM.InterleaveInfo.invalidateGroups())
7449       // Invalidating interleave groups also requires invalidating all decisions
7450       // based on them, which includes widening decisions and uniform and scalar
7451       // values.
7452       CM.invalidateCostModelingDecisions();
7453   }
7454 
7455   ElementCount MaxUserVF =
7456       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7457   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7458   if (!UserVF.isZero() && UserVFIsLegal) {
7459     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7460            "VF needs to be a power of two");
7461     // Collect the instructions (and their associated costs) that will be more
7462     // profitable to scalarize.
7463     if (CM.selectUserVectorizationFactor(UserVF)) {
7464       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7465       CM.collectInLoopReductions();
7466       buildVPlansWithVPRecipes(UserVF, UserVF);
7467       LLVM_DEBUG(printPlans(dbgs()));
7468       return {{UserVF, 0, 0}};
7469     } else
7470       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7471                               "InvalidCost", ORE, OrigLoop);
7472   }
7473 
7474   // Populate the set of Vectorization Factor Candidates.
7475   ElementCountSet VFCandidates;
7476   for (auto VF = ElementCount::getFixed(1);
7477        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7478     VFCandidates.insert(VF);
7479   for (auto VF = ElementCount::getScalable(1);
7480        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7481     VFCandidates.insert(VF);
7482 
7483   for (const auto &VF : VFCandidates) {
7484     // Collect Uniform and Scalar instructions after vectorization with VF.
7485     CM.collectUniformsAndScalars(VF);
7486 
7487     // Collect the instructions (and their associated costs) that will be more
7488     // profitable to scalarize.
7489     if (VF.isVector())
7490       CM.collectInstsToScalarize(VF);
7491   }
7492 
7493   CM.collectInLoopReductions();
7494   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7495   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7496 
7497   LLVM_DEBUG(printPlans(dbgs()));
7498   if (!MaxFactors.hasVector())
7499     return VectorizationFactor::Disabled();
7500 
7501   // Select the optimal vectorization factor.
7502   VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
7503   assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7504   return VF;
7505 }
7506 
7507 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7508   assert(count_if(VPlans,
7509                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7510              1 &&
7511          "Best VF has not a single VPlan.");
7512 
7513   for (const VPlanPtr &Plan : VPlans) {
7514     if (Plan->hasVF(VF))
7515       return *Plan.get();
7516   }
7517   llvm_unreachable("No plan found!");
7518 }
7519 
7520 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7521   SmallVector<Metadata *, 4> MDs;
7522   // Reserve first location for self reference to the LoopID metadata node.
7523   MDs.push_back(nullptr);
7524   bool IsUnrollMetadata = false;
7525   MDNode *LoopID = L->getLoopID();
7526   if (LoopID) {
7527     // First find existing loop unrolling disable metadata.
7528     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7529       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7530       if (MD) {
7531         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7532         IsUnrollMetadata =
7533             S && S->getString().startswith("llvm.loop.unroll.disable");
7534       }
7535       MDs.push_back(LoopID->getOperand(i));
7536     }
7537   }
7538 
7539   if (!IsUnrollMetadata) {
7540     // Add runtime unroll disable metadata.
7541     LLVMContext &Context = L->getHeader()->getContext();
7542     SmallVector<Metadata *, 1> DisableOperands;
7543     DisableOperands.push_back(
7544         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7545     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7546     MDs.push_back(DisableNode);
7547     MDNode *NewLoopID = MDNode::get(Context, MDs);
7548     // Set operand 0 to refer to the loop id itself.
7549     NewLoopID->replaceOperandWith(0, NewLoopID);
7550     L->setLoopID(NewLoopID);
7551   }
7552 }
7553 
7554 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7555                                            VPlan &BestVPlan,
7556                                            InnerLoopVectorizer &ILV,
7557                                            DominatorTree *DT,
7558                                            bool IsEpilogueVectorization) {
7559   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7560                     << '\n');
7561 
7562   // Perform the actual loop transformation.
7563 
7564   // 1. Set up the skeleton for vectorization, including vector pre-header and
7565   // middle block. The vector loop is created during VPlan execution.
7566   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7567   Value *CanonicalIVStartValue;
7568   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7569       ILV.createVectorizedLoopSkeleton();
7570 
7571   // Only use noalias metadata when using memory checks guaranteeing no overlap
7572   // across all iterations.
7573   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7574   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7575       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7576 
7577     //  We currently don't use LoopVersioning for the actual loop cloning but we
7578     //  still use it to add the noalias metadata.
7579     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7580     //        metadata.
7581     State.LVer = std::make_unique<LoopVersioning>(
7582         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7583         PSE.getSE());
7584     State.LVer->prepareNoAliasMetadata();
7585   }
7586 
7587   ILV.collectPoisonGeneratingRecipes(State);
7588 
7589   ILV.printDebugTracesAtStart();
7590 
7591   //===------------------------------------------------===//
7592   //
7593   // Notice: any optimization or new instruction that go
7594   // into the code below should also be implemented in
7595   // the cost-model.
7596   //
7597   //===------------------------------------------------===//
7598 
7599   // 2. Copy and widen instructions from the old loop into the new loop.
7600   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7601                              ILV.getOrCreateVectorTripCount(nullptr),
7602                              CanonicalIVStartValue, State,
7603                              IsEpilogueVectorization);
7604 
7605   BestVPlan.execute(&State);
7606 
7607   // Keep all loop hints from the original loop on the vector loop (we'll
7608   // replace the vectorizer-specific hints below).
7609   MDNode *OrigLoopID = OrigLoop->getLoopID();
7610 
7611   Optional<MDNode *> VectorizedLoopID =
7612       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7613                                       LLVMLoopVectorizeFollowupVectorized});
7614 
7615   VPBasicBlock *HeaderVPBB =
7616       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7617   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7618   if (VectorizedLoopID)
7619     L->setLoopID(VectorizedLoopID.value());
7620   else {
7621     // Keep all loop hints from the original loop on the vector loop (we'll
7622     // replace the vectorizer-specific hints below).
7623     if (MDNode *LID = OrigLoop->getLoopID())
7624       L->setLoopID(LID);
7625 
7626     LoopVectorizeHints Hints(L, true, *ORE);
7627     Hints.setAlreadyVectorized();
7628   }
7629   // Disable runtime unrolling when vectorizing the epilogue loop.
7630   if (CanonicalIVStartValue)
7631     AddRuntimeUnrollDisableMetaData(L);
7632 
7633   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7634   //    predication, updating analyses.
7635   ILV.fixVectorizedLoop(State, BestVPlan);
7636 
7637   ILV.printDebugTracesAtEnd();
7638 }
7639 
7640 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7641 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7642   for (const auto &Plan : VPlans)
7643     if (PrintVPlansInDotFormat)
7644       Plan->printDOT(O);
7645     else
7646       Plan->print(O);
7647 }
7648 #endif
7649 
7650 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7651 
7652 //===--------------------------------------------------------------------===//
7653 // EpilogueVectorizerMainLoop
7654 //===--------------------------------------------------------------------===//
7655 
7656 /// This function is partially responsible for generating the control flow
7657 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7658 std::pair<BasicBlock *, Value *>
7659 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7660   MDNode *OrigLoopID = OrigLoop->getLoopID();
7661 
7662   // Workaround!  Compute the trip count of the original loop and cache it
7663   // before we start modifying the CFG.  This code has a systemic problem
7664   // wherein it tries to run analysis over partially constructed IR; this is
7665   // wrong, and not simply for SCEV.  The trip count of the original loop
7666   // simply happens to be prone to hitting this in practice.  In theory, we
7667   // can hit the same issue for any SCEV, or ValueTracking query done during
7668   // mutation.  See PR49900.
7669   getOrCreateTripCount(OrigLoop->getLoopPreheader());
7670   createVectorLoopSkeleton("");
7671 
7672   // Generate the code to check the minimum iteration count of the vector
7673   // epilogue (see below).
7674   EPI.EpilogueIterationCountCheck =
7675       emitIterationCountCheck(LoopScalarPreHeader, true);
7676   EPI.EpilogueIterationCountCheck->setName("iter.check");
7677 
7678   // Generate the code to check any assumptions that we've made for SCEV
7679   // expressions.
7680   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7681 
7682   // Generate the code that checks at runtime if arrays overlap. We put the
7683   // checks into a separate block to make the more common case of few elements
7684   // faster.
7685   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7686 
7687   // Generate the iteration count check for the main loop, *after* the check
7688   // for the epilogue loop, so that the path-length is shorter for the case
7689   // that goes directly through the vector epilogue. The longer-path length for
7690   // the main loop is compensated for, by the gain from vectorizing the larger
7691   // trip count. Note: the branch will get updated later on when we vectorize
7692   // the epilogue.
7693   EPI.MainLoopIterationCountCheck =
7694       emitIterationCountCheck(LoopScalarPreHeader, false);
7695 
7696   // Generate the induction variable.
7697   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7698 
7699   // Skip induction resume value creation here because they will be created in
7700   // the second pass. If we created them here, they wouldn't be used anyway,
7701   // because the vplan in the second pass still contains the inductions from the
7702   // original loop.
7703 
7704   return {completeLoopSkeleton(OrigLoopID), nullptr};
7705 }
7706 
7707 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7708   LLVM_DEBUG({
7709     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7710            << "Main Loop VF:" << EPI.MainLoopVF
7711            << ", Main Loop UF:" << EPI.MainLoopUF
7712            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7713            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7714   });
7715 }
7716 
7717 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7718   DEBUG_WITH_TYPE(VerboseDebug, {
7719     dbgs() << "intermediate fn:\n"
7720            << *OrigLoop->getHeader()->getParent() << "\n";
7721   });
7722 }
7723 
7724 BasicBlock *
7725 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7726                                                     bool ForEpilogue) {
7727   assert(Bypass && "Expected valid bypass basic block.");
7728   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7729   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7730   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7731   // Reuse existing vector loop preheader for TC checks.
7732   // Note that new preheader block is generated for vector loop.
7733   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7734   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7735 
7736   // Generate code to check if the loop's trip count is less than VF * UF of the
7737   // main vector loop.
7738   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7739       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7740 
7741   Value *CheckMinIters = Builder.CreateICmp(
7742       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7743       "min.iters.check");
7744 
7745   if (!ForEpilogue)
7746     TCCheckBlock->setName("vector.main.loop.iter.check");
7747 
7748   // Create new preheader for vector loop.
7749   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7750                                    DT, LI, nullptr, "vector.ph");
7751 
7752   if (ForEpilogue) {
7753     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7754                                  DT->getNode(Bypass)->getIDom()) &&
7755            "TC check is expected to dominate Bypass");
7756 
7757     // Update dominator for Bypass & LoopExit.
7758     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7759     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7760       // For loops with multiple exits, there's no edge from the middle block
7761       // to exit blocks (as the epilogue must run) and thus no need to update
7762       // the immediate dominator of the exit blocks.
7763       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7764 
7765     LoopBypassBlocks.push_back(TCCheckBlock);
7766 
7767     // Save the trip count so we don't have to regenerate it in the
7768     // vec.epilog.iter.check. This is safe to do because the trip count
7769     // generated here dominates the vector epilog iter check.
7770     EPI.TripCount = Count;
7771   }
7772 
7773   ReplaceInstWithInst(
7774       TCCheckBlock->getTerminator(),
7775       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7776 
7777   return TCCheckBlock;
7778 }
7779 
7780 //===--------------------------------------------------------------------===//
7781 // EpilogueVectorizerEpilogueLoop
7782 //===--------------------------------------------------------------------===//
7783 
7784 /// This function is partially responsible for generating the control flow
7785 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7786 std::pair<BasicBlock *, Value *>
7787 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7788   MDNode *OrigLoopID = OrigLoop->getLoopID();
7789   createVectorLoopSkeleton("vec.epilog.");
7790 
7791   // Now, compare the remaining count and if there aren't enough iterations to
7792   // execute the vectorized epilogue skip to the scalar part.
7793   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7794   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7795   LoopVectorPreHeader =
7796       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7797                  LI, nullptr, "vec.epilog.ph");
7798   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7799                                           VecEpilogueIterationCountCheck);
7800 
7801   // Adjust the control flow taking the state info from the main loop
7802   // vectorization into account.
7803   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7804          "expected this to be saved from the previous pass.");
7805   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7806       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7807 
7808   DT->changeImmediateDominator(LoopVectorPreHeader,
7809                                EPI.MainLoopIterationCountCheck);
7810 
7811   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7812       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7813 
7814   if (EPI.SCEVSafetyCheck)
7815     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7816         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7817   if (EPI.MemSafetyCheck)
7818     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7819         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7820 
7821   DT->changeImmediateDominator(
7822       VecEpilogueIterationCountCheck,
7823       VecEpilogueIterationCountCheck->getSinglePredecessor());
7824 
7825   DT->changeImmediateDominator(LoopScalarPreHeader,
7826                                EPI.EpilogueIterationCountCheck);
7827   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7828     // If there is an epilogue which must run, there's no edge from the
7829     // middle block to exit blocks  and thus no need to update the immediate
7830     // dominator of the exit blocks.
7831     DT->changeImmediateDominator(LoopExitBlock,
7832                                  EPI.EpilogueIterationCountCheck);
7833 
7834   // Keep track of bypass blocks, as they feed start values to the induction
7835   // phis in the scalar loop preheader.
7836   if (EPI.SCEVSafetyCheck)
7837     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7838   if (EPI.MemSafetyCheck)
7839     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7840   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7841 
7842   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
7843   // merge control-flow from the latch block and the middle block. Update the
7844   // incoming values here and move the Phi into the preheader.
7845   SmallVector<PHINode *, 4> PhisInBlock;
7846   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7847     PhisInBlock.push_back(&Phi);
7848 
7849   for (PHINode *Phi : PhisInBlock) {
7850     Phi->replaceIncomingBlockWith(
7851         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7852         VecEpilogueIterationCountCheck);
7853     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7854     if (EPI.SCEVSafetyCheck)
7855       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7856     if (EPI.MemSafetyCheck)
7857       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7858     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7859   }
7860 
7861   // Generate a resume induction for the vector epilogue and put it in the
7862   // vector epilogue preheader
7863   Type *IdxTy = Legal->getWidestInductionType();
7864   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7865                                          LoopVectorPreHeader->getFirstNonPHI());
7866   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7867   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7868                            EPI.MainLoopIterationCountCheck);
7869 
7870   // Generate induction resume values. These variables save the new starting
7871   // indexes for the scalar loop. They are used to test if there are any tail
7872   // iterations left once the vector loop has completed.
7873   // Note that when the vectorized epilogue is skipped due to iteration count
7874   // check, then the resume value for the induction variable comes from
7875   // the trip count of the main vector loop, hence passing the AdditionalBypass
7876   // argument.
7877   createInductionResumeValues({VecEpilogueIterationCountCheck,
7878                                EPI.VectorTripCount} /* AdditionalBypass */);
7879 
7880   return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
7881 }
7882 
7883 BasicBlock *
7884 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7885     BasicBlock *Bypass, BasicBlock *Insert) {
7886 
7887   assert(EPI.TripCount &&
7888          "Expected trip count to have been safed in the first pass.");
7889   assert(
7890       (!isa<Instruction>(EPI.TripCount) ||
7891        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7892       "saved trip count does not dominate insertion point.");
7893   Value *TC = EPI.TripCount;
7894   IRBuilder<> Builder(Insert->getTerminator());
7895   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7896 
7897   // Generate code to check if the loop's trip count is less than VF * UF of the
7898   // vector epilogue loop.
7899   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7900       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7901 
7902   Value *CheckMinIters =
7903       Builder.CreateICmp(P, Count,
7904                          createStepForVF(Builder, Count->getType(),
7905                                          EPI.EpilogueVF, EPI.EpilogueUF),
7906                          "min.epilog.iters.check");
7907 
7908   ReplaceInstWithInst(
7909       Insert->getTerminator(),
7910       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7911 
7912   LoopBypassBlocks.push_back(Insert);
7913   return Insert;
7914 }
7915 
7916 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7917   LLVM_DEBUG({
7918     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7919            << "Epilogue Loop VF:" << EPI.EpilogueVF
7920            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7921   });
7922 }
7923 
7924 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7925   DEBUG_WITH_TYPE(VerboseDebug, {
7926     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7927   });
7928 }
7929 
7930 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7931     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7932   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7933   bool PredicateAtRangeStart = Predicate(Range.Start);
7934 
7935   for (ElementCount TmpVF = Range.Start * 2;
7936        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7937     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7938       Range.End = TmpVF;
7939       break;
7940     }
7941 
7942   return PredicateAtRangeStart;
7943 }
7944 
7945 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7946 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7947 /// of VF's starting at a given VF and extending it as much as possible. Each
7948 /// vectorization decision can potentially shorten this sub-range during
7949 /// buildVPlan().
7950 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7951                                            ElementCount MaxVF) {
7952   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7953   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7954     VFRange SubRange = {VF, MaxVFPlusOne};
7955     VPlans.push_back(buildVPlan(SubRange));
7956     VF = SubRange.End;
7957   }
7958 }
7959 
7960 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7961                                          VPlanPtr &Plan) {
7962   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7963 
7964   // Look for cached value.
7965   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7966   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7967   if (ECEntryIt != EdgeMaskCache.end())
7968     return ECEntryIt->second;
7969 
7970   VPValue *SrcMask = createBlockInMask(Src, Plan);
7971 
7972   // The terminator has to be a branch inst!
7973   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7974   assert(BI && "Unexpected terminator found");
7975 
7976   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7977     return EdgeMaskCache[Edge] = SrcMask;
7978 
7979   // If source is an exiting block, we know the exit edge is dynamically dead
7980   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
7981   // adding uses of an otherwise potentially dead instruction.
7982   if (OrigLoop->isLoopExiting(Src))
7983     return EdgeMaskCache[Edge] = SrcMask;
7984 
7985   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7986   assert(EdgeMask && "No Edge Mask found for condition");
7987 
7988   if (BI->getSuccessor(0) != Dst)
7989     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7990 
7991   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7992     // The condition is 'SrcMask && EdgeMask', which is equivalent to
7993     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7994     // The select version does not introduce new UB if SrcMask is false and
7995     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
7996     VPValue *False = Plan->getOrAddVPValue(
7997         ConstantInt::getFalse(BI->getCondition()->getType()));
7998     EdgeMask =
7999         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8000   }
8001 
8002   return EdgeMaskCache[Edge] = EdgeMask;
8003 }
8004 
8005 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8006   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8007 
8008   // Look for cached value.
8009   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8010   if (BCEntryIt != BlockMaskCache.end())
8011     return BCEntryIt->second;
8012 
8013   // All-one mask is modelled as no-mask following the convention for masked
8014   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8015   VPValue *BlockMask = nullptr;
8016 
8017   if (OrigLoop->getHeader() == BB) {
8018     if (!CM.blockNeedsPredicationForAnyReason(BB))
8019       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8020 
8021     assert(CM.foldTailByMasking() && "must fold the tail");
8022 
8023     // If we're using the active lane mask for control flow, then we get the
8024     // mask from the active lane mask PHI that is cached in the VPlan.
8025     PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
8026     if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
8027       return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
8028 
8029     // Introduce the early-exit compare IV <= BTC to form header block mask.
8030     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8031     // constructing the desired canonical IV in the header block as its first
8032     // non-phi instructions.
8033 
8034     VPBasicBlock *HeaderVPBB =
8035         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8036     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8037     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8038     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8039 
8040     VPBuilder::InsertPointGuard Guard(Builder);
8041     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8042     if (EmitGetActiveLaneMask != PredicationStyle::None) {
8043       VPValue *TC = Plan->getOrCreateTripCount();
8044       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8045                                        nullptr, "active.lane.mask");
8046     } else {
8047       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8048       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8049     }
8050     return BlockMaskCache[BB] = BlockMask;
8051   }
8052 
8053   // This is the block mask. We OR all incoming edges.
8054   for (auto *Predecessor : predecessors(BB)) {
8055     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8056     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8057       return BlockMaskCache[BB] = EdgeMask;
8058 
8059     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8060       BlockMask = EdgeMask;
8061       continue;
8062     }
8063 
8064     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8065   }
8066 
8067   return BlockMaskCache[BB] = BlockMask;
8068 }
8069 
8070 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8071                                                 ArrayRef<VPValue *> Operands,
8072                                                 VFRange &Range,
8073                                                 VPlanPtr &Plan) {
8074   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8075          "Must be called with either a load or store");
8076 
8077   auto willWiden = [&](ElementCount VF) -> bool {
8078     LoopVectorizationCostModel::InstWidening Decision =
8079         CM.getWideningDecision(I, VF);
8080     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8081            "CM decision should be taken at this point.");
8082     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8083       return true;
8084     if (CM.isScalarAfterVectorization(I, VF) ||
8085         CM.isProfitableToScalarize(I, VF))
8086       return false;
8087     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8088   };
8089 
8090   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8091     return nullptr;
8092 
8093   VPValue *Mask = nullptr;
8094   if (Legal->isMaskRequired(I))
8095     Mask = createBlockInMask(I->getParent(), Plan);
8096 
8097   // Determine if the pointer operand of the access is either consecutive or
8098   // reverse consecutive.
8099   LoopVectorizationCostModel::InstWidening Decision =
8100       CM.getWideningDecision(I, Range.Start);
8101   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8102   bool Consecutive =
8103       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8104 
8105   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8106     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8107                                               Consecutive, Reverse);
8108 
8109   StoreInst *Store = cast<StoreInst>(I);
8110   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8111                                             Mask, Consecutive, Reverse);
8112 }
8113 
8114 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
8115 /// insert a recipe to expand the step for the induction recipe.
8116 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8117     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8118     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8119     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8120   // Returns true if an instruction \p I should be scalarized instead of
8121   // vectorized for the chosen vectorization factor.
8122   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8123     return CM.isScalarAfterVectorization(I, VF) ||
8124            CM.isProfitableToScalarize(I, VF);
8125   };
8126 
8127   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8128       [&](ElementCount VF) {
8129         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8130       },
8131       Range);
8132   assert(IndDesc.getStartValue() ==
8133          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8134   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8135          "step must be loop invariant");
8136 
8137   VPValue *Step =
8138       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8139   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8140     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8141                                              !NeedsScalarIVOnly);
8142   }
8143   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8144   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8145                                            !NeedsScalarIVOnly);
8146 }
8147 
8148 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8149     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8150 
8151   // Check if this is an integer or fp induction. If so, build the recipe that
8152   // produces its scalar and vector values.
8153   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8154     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8155                                        *PSE.getSE(), *OrigLoop, Range);
8156 
8157   // Check if this is pointer induction. If so, build the recipe for it.
8158   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8159     return new VPWidenPointerInductionRecipe(
8160         Phi, Operands[0], *II, *PSE.getSE(),
8161         LoopVectorizationPlanner::getDecisionAndClampRange(
8162             [&](ElementCount VF) {
8163               return CM.isScalarAfterVectorization(Phi, VF);
8164             },
8165             Range));
8166   }
8167   return nullptr;
8168 }
8169 
8170 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8171     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8172   // Optimize the special case where the source is a constant integer
8173   // induction variable. Notice that we can only optimize the 'trunc' case
8174   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8175   // (c) other casts depend on pointer size.
8176 
8177   // Determine whether \p K is a truncation based on an induction variable that
8178   // can be optimized.
8179   auto isOptimizableIVTruncate =
8180       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8181     return [=](ElementCount VF) -> bool {
8182       return CM.isOptimizableIVTruncate(K, VF);
8183     };
8184   };
8185 
8186   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8187           isOptimizableIVTruncate(I), Range)) {
8188 
8189     auto *Phi = cast<PHINode>(I->getOperand(0));
8190     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8191     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8192     return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8193                                        *PSE.getSE(), *OrigLoop, Range);
8194   }
8195   return nullptr;
8196 }
8197 
8198 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8199                                                 ArrayRef<VPValue *> Operands,
8200                                                 VPlanPtr &Plan) {
8201   // If all incoming values are equal, the incoming VPValue can be used directly
8202   // instead of creating a new VPBlendRecipe.
8203   VPValue *FirstIncoming = Operands[0];
8204   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8205         return FirstIncoming == Inc;
8206       })) {
8207     return Operands[0];
8208   }
8209 
8210   unsigned NumIncoming = Phi->getNumIncomingValues();
8211   // For in-loop reductions, we do not need to create an additional select.
8212   VPValue *InLoopVal = nullptr;
8213   for (unsigned In = 0; In < NumIncoming; In++) {
8214     PHINode *PhiOp =
8215         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8216     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8217       assert(!InLoopVal && "Found more than one in-loop reduction!");
8218       InLoopVal = Operands[In];
8219     }
8220   }
8221 
8222   assert((!InLoopVal || NumIncoming == 2) &&
8223          "Found an in-loop reduction for PHI with unexpected number of "
8224          "incoming values");
8225   if (InLoopVal)
8226     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8227 
8228   // We know that all PHIs in non-header blocks are converted into selects, so
8229   // we don't have to worry about the insertion order and we can just use the
8230   // builder. At this point we generate the predication tree. There may be
8231   // duplications since this is a simple recursive scan, but future
8232   // optimizations will clean it up.
8233   SmallVector<VPValue *, 2> OperandsWithMask;
8234 
8235   for (unsigned In = 0; In < NumIncoming; In++) {
8236     VPValue *EdgeMask =
8237       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8238     assert((EdgeMask || NumIncoming == 1) &&
8239            "Multiple predecessors with one having a full mask");
8240     OperandsWithMask.push_back(Operands[In]);
8241     if (EdgeMask)
8242       OperandsWithMask.push_back(EdgeMask);
8243   }
8244   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8245 }
8246 
8247 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8248                                                    ArrayRef<VPValue *> Operands,
8249                                                    VFRange &Range) const {
8250 
8251   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8252       [this, CI](ElementCount VF) {
8253         return CM.isScalarWithPredication(CI, VF);
8254       },
8255       Range);
8256 
8257   if (IsPredicated)
8258     return nullptr;
8259 
8260   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8261   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8262              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8263              ID == Intrinsic::pseudoprobe ||
8264              ID == Intrinsic::experimental_noalias_scope_decl))
8265     return nullptr;
8266 
8267   auto willWiden = [&](ElementCount VF) -> bool {
8268     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8269     // The following case may be scalarized depending on the VF.
8270     // The flag shows whether we use Intrinsic or a usual Call for vectorized
8271     // version of the instruction.
8272     // Is it beneficial to perform intrinsic call compared to lib call?
8273     bool NeedToScalarize = false;
8274     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8275     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8276     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8277     return UseVectorIntrinsic || !NeedToScalarize;
8278   };
8279 
8280   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8281     return nullptr;
8282 
8283   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8284   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8285 }
8286 
8287 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8288   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8289          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8290   // Instruction should be widened, unless it is scalar after vectorization,
8291   // scalarization is profitable or it is predicated.
8292   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8293     return CM.isScalarAfterVectorization(I, VF) ||
8294            CM.isProfitableToScalarize(I, VF) ||
8295            CM.isScalarWithPredication(I, VF);
8296   };
8297   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8298                                                              Range);
8299 }
8300 
8301 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8302                                            ArrayRef<VPValue *> Operands) const {
8303   auto IsVectorizableOpcode = [](unsigned Opcode) {
8304     switch (Opcode) {
8305     case Instruction::Add:
8306     case Instruction::And:
8307     case Instruction::AShr:
8308     case Instruction::BitCast:
8309     case Instruction::FAdd:
8310     case Instruction::FCmp:
8311     case Instruction::FDiv:
8312     case Instruction::FMul:
8313     case Instruction::FNeg:
8314     case Instruction::FPExt:
8315     case Instruction::FPToSI:
8316     case Instruction::FPToUI:
8317     case Instruction::FPTrunc:
8318     case Instruction::FRem:
8319     case Instruction::FSub:
8320     case Instruction::ICmp:
8321     case Instruction::IntToPtr:
8322     case Instruction::LShr:
8323     case Instruction::Mul:
8324     case Instruction::Or:
8325     case Instruction::PtrToInt:
8326     case Instruction::SDiv:
8327     case Instruction::Select:
8328     case Instruction::SExt:
8329     case Instruction::Shl:
8330     case Instruction::SIToFP:
8331     case Instruction::SRem:
8332     case Instruction::Sub:
8333     case Instruction::Trunc:
8334     case Instruction::UDiv:
8335     case Instruction::UIToFP:
8336     case Instruction::URem:
8337     case Instruction::Xor:
8338     case Instruction::ZExt:
8339     case Instruction::Freeze:
8340       return true;
8341     }
8342     return false;
8343   };
8344 
8345   if (!IsVectorizableOpcode(I->getOpcode()))
8346     return nullptr;
8347 
8348   // Success: widen this instruction.
8349   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8350 }
8351 
8352 void VPRecipeBuilder::fixHeaderPhis() {
8353   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8354   for (VPHeaderPHIRecipe *R : PhisToFix) {
8355     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8356     VPRecipeBase *IncR =
8357         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8358     R->addOperand(IncR->getVPSingleValue());
8359   }
8360 }
8361 
8362 VPBasicBlock *VPRecipeBuilder::handleReplication(
8363     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8364     VPlanPtr &Plan) {
8365   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8366       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8367       Range);
8368 
8369   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8370       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); },
8371       Range);
8372 
8373   // Even if the instruction is not marked as uniform, there are certain
8374   // intrinsic calls that can be effectively treated as such, so we check for
8375   // them here. Conservatively, we only do this for scalable vectors, since
8376   // for fixed-width VFs we can always fall back on full scalarization.
8377   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8378     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8379     case Intrinsic::assume:
8380     case Intrinsic::lifetime_start:
8381     case Intrinsic::lifetime_end:
8382       // For scalable vectors if one of the operands is variant then we still
8383       // want to mark as uniform, which will generate one instruction for just
8384       // the first lane of the vector. We can't scalarize the call in the same
8385       // way as for fixed-width vectors because we don't know how many lanes
8386       // there are.
8387       //
8388       // The reasons for doing it this way for scalable vectors are:
8389       //   1. For the assume intrinsic generating the instruction for the first
8390       //      lane is still be better than not generating any at all. For
8391       //      example, the input may be a splat across all lanes.
8392       //   2. For the lifetime start/end intrinsics the pointer operand only
8393       //      does anything useful when the input comes from a stack object,
8394       //      which suggests it should always be uniform. For non-stack objects
8395       //      the effect is to poison the object, which still allows us to
8396       //      remove the call.
8397       IsUniform = true;
8398       break;
8399     default:
8400       break;
8401     }
8402   }
8403 
8404   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8405                                        IsUniform, IsPredicated);
8406 
8407   // Find if I uses a predicated instruction. If so, it will use its scalar
8408   // value. Avoid hoisting the insert-element which packs the scalar value into
8409   // a vector value, as that happens iff all users use the vector value.
8410   for (VPValue *Op : Recipe->operands()) {
8411     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8412     if (!PredR)
8413       continue;
8414     auto *RepR =
8415         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8416     assert(RepR->isPredicated() &&
8417            "expected Replicate recipe to be predicated");
8418     RepR->setAlsoPack(false);
8419   }
8420 
8421   // Finalize the recipe for Instr, first if it is not predicated.
8422   if (!IsPredicated) {
8423     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8424     setRecipe(I, Recipe);
8425     Plan->addVPValue(I, Recipe);
8426     VPBB->appendRecipe(Recipe);
8427     return VPBB;
8428   }
8429   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8430 
8431   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8432   assert(SingleSucc && "VPBB must have a single successor when handling "
8433                        "predicated replication.");
8434   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8435   // Record predicated instructions for above packing optimizations.
8436   VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
8437   VPBlockUtils::insertBlockAfter(Region, VPBB);
8438   auto *RegSucc = new VPBasicBlock();
8439   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8440   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8441   return RegSucc;
8442 }
8443 
8444 VPRegionBlock *
8445 VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
8446                                        VPlanPtr &Plan) {
8447   Instruction *Instr = PredRecipe->getUnderlyingInstr();
8448   // Instructions marked for predication are replicated and placed under an
8449   // if-then construct to prevent side-effects.
8450   // Generate recipes to compute the block mask for this region.
8451   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8452 
8453   // Build the triangular if-then region.
8454   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8455   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8456   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8457   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8458   auto *PHIRecipe = Instr->getType()->isVoidTy()
8459                         ? nullptr
8460                         : new VPPredInstPHIRecipe(PredRecipe);
8461   if (PHIRecipe) {
8462     setRecipe(Instr, PHIRecipe);
8463     Plan->addVPValue(Instr, PHIRecipe);
8464   } else {
8465     setRecipe(Instr, PredRecipe);
8466     Plan->addVPValue(Instr, PredRecipe);
8467   }
8468 
8469   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8470   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8471   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8472 
8473   // Note: first set Entry as region entry and then connect successors starting
8474   // from it in order, to propagate the "parent" of each VPBasicBlock.
8475   VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8476   VPBlockUtils::connectBlocks(Pred, Exiting);
8477 
8478   return Region;
8479 }
8480 
8481 VPRecipeOrVPValueTy
8482 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8483                                         ArrayRef<VPValue *> Operands,
8484                                         VFRange &Range, VPlanPtr &Plan) {
8485   // First, check for specific widening recipes that deal with inductions, Phi
8486   // nodes, calls and memory operations.
8487   VPRecipeBase *Recipe;
8488   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8489     if (Phi->getParent() != OrigLoop->getHeader())
8490       return tryToBlend(Phi, Operands, Plan);
8491     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8492       return toVPRecipeResult(Recipe);
8493 
8494     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8495     assert((Legal->isReductionVariable(Phi) ||
8496             Legal->isFirstOrderRecurrence(Phi)) &&
8497            "can only widen reductions and first-order recurrences here");
8498     VPValue *StartV = Operands[0];
8499     if (Legal->isReductionVariable(Phi)) {
8500       const RecurrenceDescriptor &RdxDesc =
8501           Legal->getReductionVars().find(Phi)->second;
8502       assert(RdxDesc.getRecurrenceStartValue() ==
8503              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8504       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8505                                            CM.isInLoopReduction(Phi),
8506                                            CM.useOrderedReductions(RdxDesc));
8507     } else {
8508       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8509     }
8510 
8511     // Record the incoming value from the backedge, so we can add the incoming
8512     // value from the backedge after all recipes have been created.
8513     recordRecipeOf(cast<Instruction>(
8514         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8515     PhisToFix.push_back(PhiRecipe);
8516     return toVPRecipeResult(PhiRecipe);
8517   }
8518 
8519   if (isa<TruncInst>(Instr) &&
8520       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8521                                                Range, *Plan)))
8522     return toVPRecipeResult(Recipe);
8523 
8524   // All widen recipes below deal only with VF > 1.
8525   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8526           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8527     return nullptr;
8528 
8529   if (auto *CI = dyn_cast<CallInst>(Instr))
8530     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8531 
8532   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8533     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8534 
8535   if (!shouldWiden(Instr, Range))
8536     return nullptr;
8537 
8538   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8539     return toVPRecipeResult(new VPWidenGEPRecipe(
8540         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8541 
8542   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8543     bool InvariantCond =
8544         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8545     return toVPRecipeResult(new VPWidenSelectRecipe(
8546         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8547   }
8548 
8549   return toVPRecipeResult(tryToWiden(Instr, Operands));
8550 }
8551 
8552 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8553                                                         ElementCount MaxVF) {
8554   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8555 
8556   // Add assume instructions we need to drop to DeadInstructions, to prevent
8557   // them from being added to the VPlan.
8558   // TODO: We only need to drop assumes in blocks that get flattend. If the
8559   // control flow is preserved, we should keep them.
8560   SmallPtrSet<Instruction *, 4> DeadInstructions;
8561   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8562   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8563 
8564   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8565   // Dead instructions do not need sinking. Remove them from SinkAfter.
8566   for (Instruction *I : DeadInstructions)
8567     SinkAfter.erase(I);
8568 
8569   // Cannot sink instructions after dead instructions (there won't be any
8570   // recipes for them). Instead, find the first non-dead previous instruction.
8571   for (auto &P : Legal->getSinkAfter()) {
8572     Instruction *SinkTarget = P.second;
8573     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8574     (void)FirstInst;
8575     while (DeadInstructions.contains(SinkTarget)) {
8576       assert(
8577           SinkTarget != FirstInst &&
8578           "Must find a live instruction (at least the one feeding the "
8579           "first-order recurrence PHI) before reaching beginning of the block");
8580       SinkTarget = SinkTarget->getPrevNode();
8581       assert(SinkTarget != P.first &&
8582              "sink source equals target, no sinking required");
8583     }
8584     P.second = SinkTarget;
8585   }
8586 
8587   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8588   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8589     VFRange SubRange = {VF, MaxVFPlusOne};
8590     VPlans.push_back(
8591         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8592     VF = SubRange.End;
8593   }
8594 }
8595 
8596 // Add the necessary canonical IV and branch recipes required to control the
8597 // loop.
8598 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8599                                   bool HasNUW,
8600                                   bool UseLaneMaskForLoopControlFlow) {
8601   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8602   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8603 
8604   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8605   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8606   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8607   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8608   Header->insert(CanonicalIVPHI, Header->begin());
8609 
8610   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8611   // IV by VF * UF.
8612   auto *CanonicalIVIncrement =
8613       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8614                                : VPInstruction::CanonicalIVIncrement,
8615                         {CanonicalIVPHI}, DL, "index.next");
8616   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8617 
8618   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8619   EB->appendRecipe(CanonicalIVIncrement);
8620 
8621   if (UseLaneMaskForLoopControlFlow) {
8622     // Create the active lane mask instruction in the vplan preheader.
8623     VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
8624 
8625     // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
8626     // we have to take unrolling into account. Each part needs to start at
8627     //   Part * VF
8628     auto *CanonicalIVIncrementParts =
8629         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8630                                  : VPInstruction::CanonicalIVIncrementForPart,
8631                           {StartV}, DL, "index.part.next");
8632     Preheader->appendRecipe(CanonicalIVIncrementParts);
8633 
8634     // Create the ActiveLaneMask instruction using the correct start values.
8635     VPValue *TC = Plan.getOrCreateTripCount();
8636     auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8637                                        {CanonicalIVIncrementParts, TC}, DL,
8638                                        "active.lane.mask.entry");
8639     Preheader->appendRecipe(EntryALM);
8640 
8641     // Now create the ActiveLaneMaskPhi recipe in the main loop using the
8642     // preheader ActiveLaneMask instruction.
8643     auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
8644     Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
8645 
8646     // Create the active lane mask for the next iteration of the loop.
8647     CanonicalIVIncrementParts =
8648         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8649                                  : VPInstruction::CanonicalIVIncrementForPart,
8650                           {CanonicalIVIncrement}, DL);
8651     EB->appendRecipe(CanonicalIVIncrementParts);
8652 
8653     auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8654                                   {CanonicalIVIncrementParts, TC}, DL,
8655                                   "active.lane.mask.next");
8656     EB->appendRecipe(ALM);
8657     LaneMaskPhi->addOperand(ALM);
8658 
8659     // We have to invert the mask here because a true condition means jumping
8660     // to the exit block.
8661     auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
8662     EB->appendRecipe(NotMask);
8663 
8664     VPInstruction *BranchBack =
8665         new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
8666     EB->appendRecipe(BranchBack);
8667   } else {
8668     // Add the BranchOnCount VPInstruction to the latch.
8669     VPInstruction *BranchBack = new VPInstruction(
8670         VPInstruction::BranchOnCount,
8671         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8672     EB->appendRecipe(BranchBack);
8673   }
8674 }
8675 
8676 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8677 // original exit block.
8678 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8679                                 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8680                                 VPlan &Plan) {
8681   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8682   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8683   // Only handle single-exit loops with unique exit blocks for now.
8684   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8685     return;
8686 
8687   // Introduce VPUsers modeling the exit values.
8688   for (PHINode &ExitPhi : ExitBB->phis()) {
8689     Value *IncomingValue =
8690         ExitPhi.getIncomingValueForBlock(ExitingBB);
8691     VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8692     Plan.addLiveOut(&ExitPhi, V);
8693   }
8694 }
8695 
8696 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8697     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8698     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8699 
8700   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8701 
8702   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8703 
8704   // ---------------------------------------------------------------------------
8705   // Pre-construction: record ingredients whose recipes we'll need to further
8706   // process after constructing the initial VPlan.
8707   // ---------------------------------------------------------------------------
8708 
8709   // Mark instructions we'll need to sink later and their targets as
8710   // ingredients whose recipe we'll need to record.
8711   for (auto &Entry : SinkAfter) {
8712     RecipeBuilder.recordRecipeOf(Entry.first);
8713     RecipeBuilder.recordRecipeOf(Entry.second);
8714   }
8715   for (auto &Reduction : CM.getInLoopReductionChains()) {
8716     PHINode *Phi = Reduction.first;
8717     RecurKind Kind =
8718         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8719     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8720 
8721     RecipeBuilder.recordRecipeOf(Phi);
8722     for (auto &R : ReductionOperations) {
8723       RecipeBuilder.recordRecipeOf(R);
8724       // For min/max reductions, where we have a pair of icmp/select, we also
8725       // need to record the ICmp recipe, so it can be removed later.
8726       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8727              "Only min/max recurrences allowed for inloop reductions");
8728       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8729         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8730     }
8731   }
8732 
8733   // For each interleave group which is relevant for this (possibly trimmed)
8734   // Range, add it to the set of groups to be later applied to the VPlan and add
8735   // placeholders for its members' Recipes which we'll be replacing with a
8736   // single VPInterleaveRecipe.
8737   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8738     auto applyIG = [IG, this](ElementCount VF) -> bool {
8739       return (VF.isVector() && // Query is illegal for VF == 1
8740               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8741                   LoopVectorizationCostModel::CM_Interleave);
8742     };
8743     if (!getDecisionAndClampRange(applyIG, Range))
8744       continue;
8745     InterleaveGroups.insert(IG);
8746     for (unsigned i = 0; i < IG->getFactor(); i++)
8747       if (Instruction *Member = IG->getMember(i))
8748         RecipeBuilder.recordRecipeOf(Member);
8749   };
8750 
8751   // ---------------------------------------------------------------------------
8752   // Build initial VPlan: Scan the body of the loop in a topological order to
8753   // visit each basic block after having visited its predecessor basic blocks.
8754   // ---------------------------------------------------------------------------
8755 
8756   // Create initial VPlan skeleton, starting with a block for the pre-header,
8757   // followed by a region for the vector loop, followed by the middle block. The
8758   // skeleton vector loop region contains a header and latch block.
8759   VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8760   auto Plan = std::make_unique<VPlan>(Preheader);
8761 
8762   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8763   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8764   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8765   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8766   VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8767   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8768   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8769 
8770   Instruction *DLInst =
8771       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8772   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8773                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8774                         !CM.foldTailByMasking(),
8775                         CM.useActiveLaneMaskForControlFlow());
8776 
8777   // Scan the body of the loop in a topological order to visit each basic block
8778   // after having visited its predecessor basic blocks.
8779   LoopBlocksDFS DFS(OrigLoop);
8780   DFS.perform(LI);
8781 
8782   VPBasicBlock *VPBB = HeaderVPBB;
8783   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8784   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8785     // Relevant instructions from basic block BB will be grouped into VPRecipe
8786     // ingredients and fill a new VPBasicBlock.
8787     unsigned VPBBsForBB = 0;
8788     if (VPBB != HeaderVPBB)
8789       VPBB->setName(BB->getName());
8790     Builder.setInsertPoint(VPBB);
8791 
8792     // Introduce each ingredient into VPlan.
8793     // TODO: Model and preserve debug intrinsics in VPlan.
8794     for (Instruction &I : BB->instructionsWithoutDebug()) {
8795       Instruction *Instr = &I;
8796 
8797       // First filter out irrelevant instructions, to ensure no recipes are
8798       // built for them.
8799       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8800         continue;
8801 
8802       SmallVector<VPValue *, 4> Operands;
8803       auto *Phi = dyn_cast<PHINode>(Instr);
8804       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8805         Operands.push_back(Plan->getOrAddVPValue(
8806             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8807       } else {
8808         auto OpRange = Plan->mapToVPValues(Instr->operands());
8809         Operands = {OpRange.begin(), OpRange.end()};
8810       }
8811 
8812       // Invariant stores inside loop will be deleted and a single store
8813       // with the final reduction value will be added to the exit block
8814       StoreInst *SI;
8815       if ((SI = dyn_cast<StoreInst>(&I)) &&
8816           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8817         continue;
8818 
8819       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8820               Instr, Operands, Range, Plan)) {
8821         // If Instr can be simplified to an existing VPValue, use it.
8822         if (RecipeOrValue.is<VPValue *>()) {
8823           auto *VPV = RecipeOrValue.get<VPValue *>();
8824           Plan->addVPValue(Instr, VPV);
8825           // If the re-used value is a recipe, register the recipe for the
8826           // instruction, in case the recipe for Instr needs to be recorded.
8827           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8828             RecipeBuilder.setRecipe(Instr, R);
8829           continue;
8830         }
8831         // Otherwise, add the new recipe.
8832         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8833         for (auto *Def : Recipe->definedValues()) {
8834           auto *UV = Def->getUnderlyingValue();
8835           Plan->addVPValue(UV, Def);
8836         }
8837 
8838         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8839             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8840           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8841           // of the header block. That can happen for truncates of induction
8842           // variables. Those recipes are moved to the phi section of the header
8843           // block after applying SinkAfter, which relies on the original
8844           // position of the trunc.
8845           assert(isa<TruncInst>(Instr));
8846           InductionsToMove.push_back(
8847               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8848         }
8849         RecipeBuilder.setRecipe(Instr, Recipe);
8850         VPBB->appendRecipe(Recipe);
8851         continue;
8852       }
8853 
8854       // Otherwise, if all widening options failed, Instruction is to be
8855       // replicated. This may create a successor for VPBB.
8856       VPBasicBlock *NextVPBB =
8857           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8858       if (NextVPBB != VPBB) {
8859         VPBB = NextVPBB;
8860         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8861                                     : "");
8862       }
8863     }
8864 
8865     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8866     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8867   }
8868 
8869   HeaderVPBB->setName("vector.body");
8870 
8871   // Fold the last, empty block into its predecessor.
8872   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8873   assert(VPBB && "expected to fold last (empty) block");
8874   // After here, VPBB should not be used.
8875   VPBB = nullptr;
8876 
8877   addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
8878 
8879   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8880          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8881          "entry block must be set to a VPRegionBlock having a non-empty entry "
8882          "VPBasicBlock");
8883   RecipeBuilder.fixHeaderPhis();
8884 
8885   // ---------------------------------------------------------------------------
8886   // Transform initial VPlan: Apply previously taken decisions, in order, to
8887   // bring the VPlan to its final state.
8888   // ---------------------------------------------------------------------------
8889 
8890   // Apply Sink-After legal constraints.
8891   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8892     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8893     if (Region && Region->isReplicator()) {
8894       assert(Region->getNumSuccessors() == 1 &&
8895              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8896       assert(R->getParent()->size() == 1 &&
8897              "A recipe in an original replicator region must be the only "
8898              "recipe in its block");
8899       return Region;
8900     }
8901     return nullptr;
8902   };
8903   for (auto &Entry : SinkAfter) {
8904     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8905     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8906 
8907     auto *TargetRegion = GetReplicateRegion(Target);
8908     auto *SinkRegion = GetReplicateRegion(Sink);
8909     if (!SinkRegion) {
8910       // If the sink source is not a replicate region, sink the recipe directly.
8911       if (TargetRegion) {
8912         // The target is in a replication region, make sure to move Sink to
8913         // the block after it, not into the replication region itself.
8914         VPBasicBlock *NextBlock =
8915             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
8916         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8917       } else
8918         Sink->moveAfter(Target);
8919       continue;
8920     }
8921 
8922     // The sink source is in a replicate region. Unhook the region from the CFG.
8923     auto *SinkPred = SinkRegion->getSinglePredecessor();
8924     auto *SinkSucc = SinkRegion->getSingleSuccessor();
8925     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
8926     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
8927     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
8928 
8929     if (TargetRegion) {
8930       // The target recipe is also in a replicate region, move the sink region
8931       // after the target region.
8932       auto *TargetSucc = TargetRegion->getSingleSuccessor();
8933       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
8934       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
8935       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
8936     } else {
8937       // The sink source is in a replicate region, we need to move the whole
8938       // replicate region, which should only contain a single recipe in the
8939       // main block.
8940       auto *SplitBlock =
8941           Target->getParent()->splitAt(std::next(Target->getIterator()));
8942 
8943       auto *SplitPred = SplitBlock->getSinglePredecessor();
8944 
8945       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
8946       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
8947       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
8948     }
8949   }
8950 
8951   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
8952   VPlanTransforms::removeRedundantInductionCasts(*Plan);
8953 
8954   // Now that sink-after is done, move induction recipes for optimized truncates
8955   // to the phi section of the header block.
8956   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
8957     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8958 
8959   // Adjust the recipes for any inloop reductions.
8960   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
8961                              RecipeBuilder, Range.Start);
8962 
8963   // Introduce a recipe to combine the incoming and previous values of a
8964   // first-order recurrence.
8965   for (VPRecipeBase &R :
8966        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8967     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
8968     if (!RecurPhi)
8969       continue;
8970 
8971     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
8972     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
8973     auto *Region = GetReplicateRegion(PrevRecipe);
8974     if (Region)
8975       InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
8976     if (!InsertBlock) {
8977       InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
8978       VPBlockUtils::insertBlockAfter(InsertBlock, Region);
8979     }
8980     if (Region || PrevRecipe->isPhi())
8981       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
8982     else
8983       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
8984 
8985     auto *RecurSplice = cast<VPInstruction>(
8986         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
8987                              {RecurPhi, RecurPhi->getBackedgeValue()}));
8988 
8989     RecurPhi->replaceAllUsesWith(RecurSplice);
8990     // Set the first operand of RecurSplice to RecurPhi again, after replacing
8991     // all users.
8992     RecurSplice->setOperand(0, RecurPhi);
8993   }
8994 
8995   // Interleave memory: for each Interleave Group we marked earlier as relevant
8996   // for this VPlan, replace the Recipes widening its memory instructions with a
8997   // single VPInterleaveRecipe at its insertion point.
8998   for (auto IG : InterleaveGroups) {
8999     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9000         RecipeBuilder.getRecipe(IG->getInsertPos()));
9001     SmallVector<VPValue *, 4> StoredValues;
9002     for (unsigned i = 0; i < IG->getFactor(); ++i)
9003       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9004         auto *StoreR =
9005             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9006         StoredValues.push_back(StoreR->getStoredValue());
9007       }
9008 
9009     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9010                                         Recipe->getMask());
9011     VPIG->insertBefore(Recipe);
9012     unsigned J = 0;
9013     for (unsigned i = 0; i < IG->getFactor(); ++i)
9014       if (Instruction *Member = IG->getMember(i)) {
9015         if (!Member->getType()->isVoidTy()) {
9016           VPValue *OriginalV = Plan->getVPValue(Member);
9017           Plan->removeVPValueFor(Member);
9018           Plan->addVPValue(Member, VPIG->getVPValue(J));
9019           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9020           J++;
9021         }
9022         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9023       }
9024   }
9025 
9026   std::string PlanName;
9027   raw_string_ostream RSO(PlanName);
9028   ElementCount VF = Range.Start;
9029   Plan->addVF(VF);
9030   RSO << "Initial VPlan for VF={" << VF;
9031   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9032     Plan->addVF(VF);
9033     RSO << "," << VF;
9034   }
9035   RSO << "},UF>=1";
9036   RSO.flush();
9037   Plan->setName(PlanName);
9038 
9039   // From this point onwards, VPlan-to-VPlan transformations may change the plan
9040   // in ways that accessing values using original IR values is incorrect.
9041   Plan->disableValue2VPValue();
9042 
9043   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9044   VPlanTransforms::sinkScalarOperands(*Plan);
9045   VPlanTransforms::removeDeadRecipes(*Plan);
9046   VPlanTransforms::mergeReplicateRegions(*Plan);
9047   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9048 
9049   // Fold Exit block into its predecessor if possible.
9050   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9051   // VPBasicBlock as exit.
9052   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
9053 
9054   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9055   return Plan;
9056 }
9057 
9058 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9059   // Outer loop handling: They may require CFG and instruction level
9060   // transformations before even evaluating whether vectorization is profitable.
9061   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9062   // the vectorization pipeline.
9063   assert(!OrigLoop->isInnermost());
9064   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9065 
9066   // Create new empty VPlan
9067   auto Plan = std::make_unique<VPlan>();
9068 
9069   // Build hierarchical CFG
9070   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9071   HCFGBuilder.buildHierarchicalCFG();
9072 
9073   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9074        VF *= 2)
9075     Plan->addVF(VF);
9076 
9077   SmallPtrSet<Instruction *, 1> DeadInstructions;
9078   VPlanTransforms::VPInstructionsToVPRecipes(
9079       OrigLoop, Plan,
9080       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9081       DeadInstructions, *PSE.getSE());
9082 
9083   // Remove the existing terminator of the exiting block of the top-most region.
9084   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9085   auto *Term =
9086       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9087   Term->eraseFromParent();
9088 
9089   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9090                         true, CM.useActiveLaneMaskForControlFlow());
9091   return Plan;
9092 }
9093 
9094 // Adjust the recipes for reductions. For in-loop reductions the chain of
9095 // instructions leading from the loop exit instr to the phi need to be converted
9096 // to reductions, with one operand being vector and the other being the scalar
9097 // reduction chain. For other reductions, a select is introduced between the phi
9098 // and live-out recipes when folding the tail.
9099 void LoopVectorizationPlanner::adjustRecipesForReductions(
9100     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9101     ElementCount MinVF) {
9102   for (auto &Reduction : CM.getInLoopReductionChains()) {
9103     PHINode *Phi = Reduction.first;
9104     const RecurrenceDescriptor &RdxDesc =
9105         Legal->getReductionVars().find(Phi)->second;
9106     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9107 
9108     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9109       continue;
9110 
9111     // ReductionOperations are orders top-down from the phi's use to the
9112     // LoopExitValue. We keep a track of the previous item (the Chain) to tell
9113     // which of the two operands will remain scalar and which will be reduced.
9114     // For minmax the chain will be the select instructions.
9115     Instruction *Chain = Phi;
9116     for (Instruction *R : ReductionOperations) {
9117       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9118       RecurKind Kind = RdxDesc.getRecurrenceKind();
9119 
9120       VPValue *ChainOp = Plan->getVPValue(Chain);
9121       unsigned FirstOpId;
9122       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9123              "Only min/max recurrences allowed for inloop reductions");
9124       // Recognize a call to the llvm.fmuladd intrinsic.
9125       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9126       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9127              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9128       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9129         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9130                "Expected to replace a VPWidenSelectSC");
9131         FirstOpId = 1;
9132       } else {
9133         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9134                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9135                "Expected to replace a VPWidenSC");
9136         FirstOpId = 0;
9137       }
9138       unsigned VecOpId =
9139           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9140       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9141 
9142       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9143                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9144                          : nullptr;
9145 
9146       if (IsFMulAdd) {
9147         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9148         // need to create an fmul recipe to use as the vector operand for the
9149         // fadd reduction.
9150         VPInstruction *FMulRecipe = new VPInstruction(
9151             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9152         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9153         WidenRecipe->getParent()->insert(FMulRecipe,
9154                                          WidenRecipe->getIterator());
9155         VecOp = FMulRecipe;
9156       }
9157       VPReductionRecipe *RedRecipe =
9158           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9159       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9160       Plan->removeVPValueFor(R);
9161       Plan->addVPValue(R, RedRecipe);
9162       // Append the recipe to the end of the VPBasicBlock because we need to
9163       // ensure that it comes after all of it's inputs, including CondOp.
9164       WidenRecipe->getParent()->appendRecipe(RedRecipe);
9165       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9166       WidenRecipe->eraseFromParent();
9167 
9168       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9169         VPRecipeBase *CompareRecipe =
9170             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9171         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9172                "Expected to replace a VPWidenSC");
9173         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9174                "Expected no remaining users");
9175         CompareRecipe->eraseFromParent();
9176       }
9177       Chain = R;
9178     }
9179   }
9180 
9181   // If tail is folded by masking, introduce selects between the phi
9182   // and the live-out instruction of each reduction, at the beginning of the
9183   // dedicated latch block.
9184   if (CM.foldTailByMasking()) {
9185     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9186     for (VPRecipeBase &R :
9187          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9188       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9189       if (!PhiR || PhiR->isInLoop())
9190         continue;
9191       VPValue *Cond =
9192           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9193       VPValue *Red = PhiR->getBackedgeValue();
9194       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9195              "reduction recipe must be defined before latch");
9196       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9197     }
9198   }
9199 }
9200 
9201 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9202 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9203                                VPSlotTracker &SlotTracker) const {
9204   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9205   IG->getInsertPos()->printAsOperand(O, false);
9206   O << ", ";
9207   getAddr()->printAsOperand(O, SlotTracker);
9208   VPValue *Mask = getMask();
9209   if (Mask) {
9210     O << ", ";
9211     Mask->printAsOperand(O, SlotTracker);
9212   }
9213 
9214   unsigned OpIdx = 0;
9215   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9216     if (!IG->getMember(i))
9217       continue;
9218     if (getNumStoreOperands() > 0) {
9219       O << "\n" << Indent << "  store ";
9220       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9221       O << " to index " << i;
9222     } else {
9223       O << "\n" << Indent << "  ";
9224       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9225       O << " = load from index " << i;
9226     }
9227     ++OpIdx;
9228   }
9229 }
9230 #endif
9231 
9232 void VPWidenCallRecipe::execute(VPTransformState &State) {
9233   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9234                                   *this, State);
9235 }
9236 
9237 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9238   assert(!State.Instance && "Int or FP induction being replicated.");
9239 
9240   Value *Start = getStartValue()->getLiveInIRValue();
9241   const InductionDescriptor &ID = getInductionDescriptor();
9242   TruncInst *Trunc = getTruncInst();
9243   IRBuilderBase &Builder = State.Builder;
9244   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9245   assert(State.VF.isVector() && "must have vector VF");
9246 
9247   // The value from the original loop to which we are mapping the new induction
9248   // variable.
9249   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9250 
9251   // Fast-math-flags propagate from the original induction instruction.
9252   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9253   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9254     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9255 
9256   // Now do the actual transformations, and start with fetching the step value.
9257   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9258 
9259   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9260          "Expected either an induction phi-node or a truncate of it!");
9261 
9262   // Construct the initial value of the vector IV in the vector loop preheader
9263   auto CurrIP = Builder.saveIP();
9264   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9265   Builder.SetInsertPoint(VectorPH->getTerminator());
9266   if (isa<TruncInst>(EntryVal)) {
9267     assert(Start->getType()->isIntegerTy() &&
9268            "Truncation requires an integer type");
9269     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9270     Step = Builder.CreateTrunc(Step, TruncType);
9271     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9272   }
9273 
9274   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9275   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9276   Value *SteppedStart = getStepVector(
9277       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9278 
9279   // We create vector phi nodes for both integer and floating-point induction
9280   // variables. Here, we determine the kind of arithmetic we will perform.
9281   Instruction::BinaryOps AddOp;
9282   Instruction::BinaryOps MulOp;
9283   if (Step->getType()->isIntegerTy()) {
9284     AddOp = Instruction::Add;
9285     MulOp = Instruction::Mul;
9286   } else {
9287     AddOp = ID.getInductionOpcode();
9288     MulOp = Instruction::FMul;
9289   }
9290 
9291   // Multiply the vectorization factor by the step using integer or
9292   // floating-point arithmetic as appropriate.
9293   Type *StepType = Step->getType();
9294   Value *RuntimeVF;
9295   if (Step->getType()->isFloatingPointTy())
9296     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9297   else
9298     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9299   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9300 
9301   // Create a vector splat to use in the induction update.
9302   //
9303   // FIXME: If the step is non-constant, we create the vector splat with
9304   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9305   //        handle a constant vector splat.
9306   Value *SplatVF = isa<Constant>(Mul)
9307                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9308                        : Builder.CreateVectorSplat(State.VF, Mul);
9309   Builder.restoreIP(CurrIP);
9310 
9311   // We may need to add the step a number of times, depending on the unroll
9312   // factor. The last of those goes into the PHI.
9313   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9314                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9315   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9316   Instruction *LastInduction = VecInd;
9317   for (unsigned Part = 0; Part < State.UF; ++Part) {
9318     State.set(this, LastInduction, Part);
9319 
9320     if (isa<TruncInst>(EntryVal))
9321       State.addMetadata(LastInduction, EntryVal);
9322 
9323     LastInduction = cast<Instruction>(
9324         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9325     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9326   }
9327 
9328   LastInduction->setName("vec.ind.next");
9329   VecInd->addIncoming(SteppedStart, VectorPH);
9330   // Add induction update using an incorrect block temporarily. The phi node
9331   // will be fixed after VPlan execution. Note that at this point the latch
9332   // block cannot be used, as it does not exist yet.
9333   // TODO: Model increment value in VPlan, by turning the recipe into a
9334   // multi-def and a subclass of VPHeaderPHIRecipe.
9335   VecInd->addIncoming(LastInduction, VectorPH);
9336 }
9337 
9338 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9339   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9340          "Not a pointer induction according to InductionDescriptor!");
9341   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9342          "Unexpected type.");
9343 
9344   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9345   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9346 
9347   if (onlyScalarsGenerated(State.VF)) {
9348     // This is the normalized GEP that starts counting at zero.
9349     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9350         CanonicalIV, IndDesc.getStep()->getType());
9351     // Determine the number of scalars we need to generate for each unroll
9352     // iteration. If the instruction is uniform, we only need to generate the
9353     // first lane. Otherwise, we generate all VF values.
9354     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9355     assert((IsUniform || !State.VF.isScalable()) &&
9356            "Cannot scalarize a scalable VF");
9357     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9358 
9359     for (unsigned Part = 0; Part < State.UF; ++Part) {
9360       Value *PartStart =
9361           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9362 
9363       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9364         Value *Idx = State.Builder.CreateAdd(
9365             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9366         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9367 
9368         Value *Step = CreateStepValue(IndDesc.getStep(), SE,
9369                                       State.CFG.PrevBB->getTerminator());
9370         Value *SclrGep = emitTransformedIndex(
9371             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9372         SclrGep->setName("next.gep");
9373         State.set(this, SclrGep, VPIteration(Part, Lane));
9374       }
9375     }
9376     return;
9377   }
9378 
9379   assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9380          "Induction step not a SCEV constant!");
9381   Type *PhiType = IndDesc.getStep()->getType();
9382 
9383   // Build a pointer phi
9384   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9385   Type *ScStValueType = ScalarStartValue->getType();
9386   PHINode *NewPointerPhi =
9387       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9388 
9389   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9390   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9391 
9392   // A pointer induction, performed by using a gep
9393   const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
9394   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9395 
9396   const SCEV *ScalarStep = IndDesc.getStep();
9397   SCEVExpander Exp(SE, DL, "induction");
9398   Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
9399   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9400   Value *NumUnrolledElems =
9401       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9402   Value *InductionGEP = GetElementPtrInst::Create(
9403       IndDesc.getElementType(), NewPointerPhi,
9404       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9405       InductionLoc);
9406   // Add induction update using an incorrect block temporarily. The phi node
9407   // will be fixed after VPlan execution. Note that at this point the latch
9408   // block cannot be used, as it does not exist yet.
9409   // TODO: Model increment value in VPlan, by turning the recipe into a
9410   // multi-def and a subclass of VPHeaderPHIRecipe.
9411   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9412 
9413   // Create UF many actual address geps that use the pointer
9414   // phi as base and a vectorized version of the step value
9415   // (<step*0, ..., step*N>) as offset.
9416   for (unsigned Part = 0; Part < State.UF; ++Part) {
9417     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9418     Value *StartOffsetScalar =
9419         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9420     Value *StartOffset =
9421         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9422     // Create a vector of consecutive numbers from zero to VF.
9423     StartOffset = State.Builder.CreateAdd(
9424         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9425 
9426     Value *GEP = State.Builder.CreateGEP(
9427         IndDesc.getElementType(), NewPointerPhi,
9428         State.Builder.CreateMul(
9429             StartOffset,
9430             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9431             "vector.gep"));
9432     State.set(this, GEP, Part);
9433   }
9434 }
9435 
9436 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9437   assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9438 
9439   // Fast-math-flags propagate from the original induction instruction.
9440   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9441   if (IndDesc.getInductionBinOp() &&
9442       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9443     State.Builder.setFastMathFlags(
9444         IndDesc.getInductionBinOp()->getFastMathFlags());
9445 
9446   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9447   auto CreateScalarIV = [&](Value *&Step) -> Value * {
9448     Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9449     auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9450     if (!isCanonical() || CanonicalIV->getType() != Ty) {
9451       ScalarIV =
9452           Ty->isIntegerTy()
9453               ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
9454               : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
9455       ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
9456                                       getStartValue()->getLiveInIRValue(), Step,
9457                                       IndDesc);
9458       ScalarIV->setName("offset.idx");
9459     }
9460     if (TruncToTy) {
9461       assert(Step->getType()->isIntegerTy() &&
9462              "Truncation requires an integer step");
9463       ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
9464       Step = State.Builder.CreateTrunc(Step, TruncToTy);
9465     }
9466     return ScalarIV;
9467   };
9468 
9469   Value *ScalarIV = CreateScalarIV(Step);
9470   if (State.VF.isVector()) {
9471     buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
9472     return;
9473   }
9474 
9475   for (unsigned Part = 0; Part < State.UF; ++Part) {
9476     assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
9477     Value *EntryPart;
9478     if (Step->getType()->isFloatingPointTy()) {
9479       Value *StartIdx =
9480           getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
9481       // Floating-point operations inherit FMF via the builder's flags.
9482       Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
9483       EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
9484                                             ScalarIV, MulOp);
9485     } else {
9486       Value *StartIdx =
9487           getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
9488       EntryPart = State.Builder.CreateAdd(
9489           ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
9490     }
9491     State.set(this, EntryPart, Part);
9492   }
9493 }
9494 
9495 void VPInterleaveRecipe::execute(VPTransformState &State) {
9496   assert(!State.Instance && "Interleave group being replicated.");
9497   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9498                                       getStoredValues(), getMask());
9499 }
9500 
9501 void VPReductionRecipe::execute(VPTransformState &State) {
9502   assert(!State.Instance && "Reduction being replicated.");
9503   Value *PrevInChain = State.get(getChainOp(), 0);
9504   RecurKind Kind = RdxDesc->getRecurrenceKind();
9505   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9506   // Propagate the fast-math flags carried by the underlying instruction.
9507   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9508   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9509   for (unsigned Part = 0; Part < State.UF; ++Part) {
9510     Value *NewVecOp = State.get(getVecOp(), Part);
9511     if (VPValue *Cond = getCondOp()) {
9512       Value *NewCond = State.get(Cond, Part);
9513       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9514       Value *Iden = RdxDesc->getRecurrenceIdentity(
9515           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9516       Value *IdenVec =
9517           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9518       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9519       NewVecOp = Select;
9520     }
9521     Value *NewRed;
9522     Value *NextInChain;
9523     if (IsOrdered) {
9524       if (State.VF.isVector())
9525         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9526                                         PrevInChain);
9527       else
9528         NewRed = State.Builder.CreateBinOp(
9529             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9530             NewVecOp);
9531       PrevInChain = NewRed;
9532     } else {
9533       PrevInChain = State.get(getChainOp(), Part);
9534       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9535     }
9536     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9537       NextInChain =
9538           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9539                          NewRed, PrevInChain);
9540     } else if (IsOrdered)
9541       NextInChain = NewRed;
9542     else
9543       NextInChain = State.Builder.CreateBinOp(
9544           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9545           PrevInChain);
9546     State.set(this, NextInChain, Part);
9547   }
9548 }
9549 
9550 void VPReplicateRecipe::execute(VPTransformState &State) {
9551   if (State.Instance) { // Generate a single instance.
9552     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9553     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9554                                     IsPredicated, State);
9555     // Insert scalar instance packing it into a vector.
9556     if (AlsoPack && State.VF.isVector()) {
9557       // If we're constructing lane 0, initialize to start from poison.
9558       if (State.Instance->Lane.isFirstLane()) {
9559         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9560         Value *Poison = PoisonValue::get(
9561             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9562         State.set(this, Poison, State.Instance->Part);
9563       }
9564       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9565     }
9566     return;
9567   }
9568 
9569   if (IsUniform) {
9570     // Uniform within VL means we need to generate lane 0 only for each
9571     // unrolled copy.
9572     for (unsigned Part = 0; Part < State.UF; ++Part)
9573       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9574                                       VPIteration(Part, 0), IsPredicated,
9575                                       State);
9576     return;
9577   }
9578 
9579   // Generate scalar instances for all VF lanes of all UF parts.
9580   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9581   const unsigned EndLane = State.VF.getKnownMinValue();
9582   for (unsigned Part = 0; Part < State.UF; ++Part)
9583     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9584       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9585                                       VPIteration(Part, Lane), IsPredicated,
9586                                       State);
9587 }
9588 
9589 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9590   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9591 
9592   // Attempt to issue a wide load.
9593   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9594   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9595 
9596   assert((LI || SI) && "Invalid Load/Store instruction");
9597   assert((!SI || StoredValue) && "No stored value provided for widened store");
9598   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9599 
9600   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9601 
9602   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9603   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9604   bool CreateGatherScatter = !Consecutive;
9605 
9606   auto &Builder = State.Builder;
9607   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9608   bool isMaskRequired = getMask();
9609   if (isMaskRequired)
9610     for (unsigned Part = 0; Part < State.UF; ++Part)
9611       BlockInMaskParts[Part] = State.get(getMask(), Part);
9612 
9613   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9614     // Calculate the pointer for the specific unroll-part.
9615     GetElementPtrInst *PartPtr = nullptr;
9616 
9617     bool InBounds = false;
9618     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9619       InBounds = gep->isInBounds();
9620     if (Reverse) {
9621       // If the address is consecutive but reversed, then the
9622       // wide store needs to start at the last vector element.
9623       // RunTimeVF =  VScale * VF.getKnownMinValue()
9624       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
9625       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9626       // NumElt = -Part * RunTimeVF
9627       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9628       // LastLane = 1 - RunTimeVF
9629       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9630       PartPtr =
9631           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9632       PartPtr->setIsInBounds(InBounds);
9633       PartPtr = cast<GetElementPtrInst>(
9634           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9635       PartPtr->setIsInBounds(InBounds);
9636       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9637         BlockInMaskParts[Part] =
9638             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9639     } else {
9640       Value *Increment =
9641           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9642       PartPtr = cast<GetElementPtrInst>(
9643           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9644       PartPtr->setIsInBounds(InBounds);
9645     }
9646 
9647     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9648     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9649   };
9650 
9651   // Handle Stores:
9652   if (SI) {
9653     State.setDebugLocFromInst(SI);
9654 
9655     for (unsigned Part = 0; Part < State.UF; ++Part) {
9656       Instruction *NewSI = nullptr;
9657       Value *StoredVal = State.get(StoredValue, Part);
9658       if (CreateGatherScatter) {
9659         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9660         Value *VectorGep = State.get(getAddr(), Part);
9661         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9662                                             MaskPart);
9663       } else {
9664         if (Reverse) {
9665           // If we store to reverse consecutive memory locations, then we need
9666           // to reverse the order of elements in the stored value.
9667           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9668           // We don't want to update the value in the map as it might be used in
9669           // another expression. So don't call resetVectorValue(StoredVal).
9670         }
9671         auto *VecPtr =
9672             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9673         if (isMaskRequired)
9674           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9675                                             BlockInMaskParts[Part]);
9676         else
9677           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9678       }
9679       State.addMetadata(NewSI, SI);
9680     }
9681     return;
9682   }
9683 
9684   // Handle loads.
9685   assert(LI && "Must have a load instruction");
9686   State.setDebugLocFromInst(LI);
9687   for (unsigned Part = 0; Part < State.UF; ++Part) {
9688     Value *NewLI;
9689     if (CreateGatherScatter) {
9690       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9691       Value *VectorGep = State.get(getAddr(), Part);
9692       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9693                                          nullptr, "wide.masked.gather");
9694       State.addMetadata(NewLI, LI);
9695     } else {
9696       auto *VecPtr =
9697           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9698       if (isMaskRequired)
9699         NewLI = Builder.CreateMaskedLoad(
9700             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9701             PoisonValue::get(DataTy), "wide.masked.load");
9702       else
9703         NewLI =
9704             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9705 
9706       // Add metadata to the load, but setVectorValue to the reverse shuffle.
9707       State.addMetadata(NewLI, LI);
9708       if (Reverse)
9709         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9710     }
9711 
9712     State.set(getVPSingleValue(), NewLI, Part);
9713   }
9714 }
9715 
9716 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9717 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9718 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9719 // for predication.
9720 static ScalarEpilogueLowering getScalarEpilogueLowering(
9721     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9722     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9723     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9724     LoopVectorizationLegality &LVL) {
9725   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9726   // don't look at hints or options, and don't request a scalar epilogue.
9727   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9728   // LoopAccessInfo (due to code dependency and not being able to reliably get
9729   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9730   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9731   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9732   // back to the old way and vectorize with versioning when forced. See D81345.)
9733   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9734                                                       PGSOQueryType::IRPass) &&
9735                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9736     return CM_ScalarEpilogueNotAllowedOptSize;
9737 
9738   // 2) If set, obey the directives
9739   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9740     switch (PreferPredicateOverEpilogue) {
9741     case PreferPredicateTy::ScalarEpilogue:
9742       return CM_ScalarEpilogueAllowed;
9743     case PreferPredicateTy::PredicateElseScalarEpilogue:
9744       return CM_ScalarEpilogueNotNeededUsePredicate;
9745     case PreferPredicateTy::PredicateOrDontVectorize:
9746       return CM_ScalarEpilogueNotAllowedUsePredicate;
9747     };
9748   }
9749 
9750   // 3) If set, obey the hints
9751   switch (Hints.getPredicate()) {
9752   case LoopVectorizeHints::FK_Enabled:
9753     return CM_ScalarEpilogueNotNeededUsePredicate;
9754   case LoopVectorizeHints::FK_Disabled:
9755     return CM_ScalarEpilogueAllowed;
9756   };
9757 
9758   // 4) if the TTI hook indicates this is profitable, request predication.
9759   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL))
9760     return CM_ScalarEpilogueNotNeededUsePredicate;
9761 
9762   return CM_ScalarEpilogueAllowed;
9763 }
9764 
9765 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9766   // If Values have been set for this Def return the one relevant for \p Part.
9767   if (hasVectorValue(Def, Part))
9768     return Data.PerPartOutput[Def][Part];
9769 
9770   if (!hasScalarValue(Def, {Part, 0})) {
9771     Value *IRV = Def->getLiveInIRValue();
9772     Value *B = ILV->getBroadcastInstrs(IRV);
9773     set(Def, B, Part);
9774     return B;
9775   }
9776 
9777   Value *ScalarValue = get(Def, {Part, 0});
9778   // If we aren't vectorizing, we can just copy the scalar map values over
9779   // to the vector map.
9780   if (VF.isScalar()) {
9781     set(Def, ScalarValue, Part);
9782     return ScalarValue;
9783   }
9784 
9785   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9786   bool IsUniform = RepR && RepR->isUniform();
9787 
9788   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9789   // Check if there is a scalar value for the selected lane.
9790   if (!hasScalarValue(Def, {Part, LastLane})) {
9791     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
9792     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
9793             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
9794            "unexpected recipe found to be invariant");
9795     IsUniform = true;
9796     LastLane = 0;
9797   }
9798 
9799   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9800   // Set the insert point after the last scalarized instruction or after the
9801   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9802   // will directly follow the scalar definitions.
9803   auto OldIP = Builder.saveIP();
9804   auto NewIP =
9805       isa<PHINode>(LastInst)
9806           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9807           : std::next(BasicBlock::iterator(LastInst));
9808   Builder.SetInsertPoint(&*NewIP);
9809 
9810   // However, if we are vectorizing, we need to construct the vector values.
9811   // If the value is known to be uniform after vectorization, we can just
9812   // broadcast the scalar value corresponding to lane zero for each unroll
9813   // iteration. Otherwise, we construct the vector values using
9814   // insertelement instructions. Since the resulting vectors are stored in
9815   // State, we will only generate the insertelements once.
9816   Value *VectorValue = nullptr;
9817   if (IsUniform) {
9818     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9819     set(Def, VectorValue, Part);
9820   } else {
9821     // Initialize packing with insertelements to start from undef.
9822     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9823     Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9824     set(Def, Undef, Part);
9825     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9826       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9827     VectorValue = get(Def, Part);
9828   }
9829   Builder.restoreIP(OldIP);
9830   return VectorValue;
9831 }
9832 
9833 // Process the loop in the VPlan-native vectorization path. This path builds
9834 // VPlan upfront in the vectorization pipeline, which allows to apply
9835 // VPlan-to-VPlan transformations from the very beginning without modifying the
9836 // input LLVM IR.
9837 static bool processLoopInVPlanNativePath(
9838     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9839     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9840     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9841     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9842     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9843     LoopVectorizationRequirements &Requirements) {
9844 
9845   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9846     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9847     return false;
9848   }
9849   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9850   Function *F = L->getHeader()->getParent();
9851   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9852 
9853   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9854       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9855 
9856   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9857                                 &Hints, IAI);
9858   // Use the planner for outer loop vectorization.
9859   // TODO: CM is not used at this point inside the planner. Turn CM into an
9860   // optional argument if we don't need it in the future.
9861   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
9862 
9863   // Get user vectorization factor.
9864   ElementCount UserVF = Hints.getWidth();
9865 
9866   CM.collectElementTypesForWidening();
9867 
9868   // Plan how to best vectorize, return the best VF and its cost.
9869   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9870 
9871   // If we are stress testing VPlan builds, do not attempt to generate vector
9872   // code. Masked vector code generation support will follow soon.
9873   // Also, do not attempt to vectorize if no vector code will be produced.
9874   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9875     return false;
9876 
9877   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9878 
9879   {
9880     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9881                              F->getParent()->getDataLayout());
9882     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9883                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9884     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9885                       << L->getHeader()->getParent()->getName() << "\"\n");
9886     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9887   }
9888 
9889   // Mark the loop as already vectorized to avoid vectorizing again.
9890   Hints.setAlreadyVectorized();
9891   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9892   return true;
9893 }
9894 
9895 // Emit a remark if there are stores to floats that required a floating point
9896 // extension. If the vectorized loop was generated with floating point there
9897 // will be a performance penalty from the conversion overhead and the change in
9898 // the vector width.
9899 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9900   SmallVector<Instruction *, 4> Worklist;
9901   for (BasicBlock *BB : L->getBlocks()) {
9902     for (Instruction &Inst : *BB) {
9903       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9904         if (S->getValueOperand()->getType()->isFloatTy())
9905           Worklist.push_back(S);
9906       }
9907     }
9908   }
9909 
9910   // Traverse the floating point stores upwards searching, for floating point
9911   // conversions.
9912   SmallPtrSet<const Instruction *, 4> Visited;
9913   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9914   while (!Worklist.empty()) {
9915     auto *I = Worklist.pop_back_val();
9916     if (!L->contains(I))
9917       continue;
9918     if (!Visited.insert(I).second)
9919       continue;
9920 
9921     // Emit a remark if the floating point store required a floating
9922     // point conversion.
9923     // TODO: More work could be done to identify the root cause such as a
9924     // constant or a function return type and point the user to it.
9925     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9926       ORE->emit([&]() {
9927         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9928                                           I->getDebugLoc(), L->getHeader())
9929                << "floating point conversion changes vector width. "
9930                << "Mixed floating point precision requires an up/down "
9931                << "cast that will negatively impact performance.";
9932       });
9933 
9934     for (Use &Op : I->operands())
9935       if (auto *OpI = dyn_cast<Instruction>(Op))
9936         Worklist.push_back(OpI);
9937   }
9938 }
9939 
9940 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9941                                        VectorizationFactor &VF,
9942                                        Optional<unsigned> VScale, Loop *L,
9943                                        ScalarEvolution &SE) {
9944   InstructionCost CheckCost = Checks.getCost();
9945   if (!CheckCost.isValid())
9946     return false;
9947 
9948   // When interleaving only scalar and vector cost will be equal, which in turn
9949   // would lead to a divide by 0. Fall back to hard threshold.
9950   if (VF.Width.isScalar()) {
9951     if (CheckCost > VectorizeMemoryCheckThreshold) {
9952       LLVM_DEBUG(
9953           dbgs()
9954           << "LV: Interleaving only is not profitable due to runtime checks\n");
9955       return false;
9956     }
9957     return true;
9958   }
9959 
9960   // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
9961   double ScalarC = *VF.ScalarCost.getValue();
9962   if (ScalarC == 0)
9963     return true;
9964 
9965   // First, compute the minimum iteration count required so that the vector
9966   // loop outperforms the scalar loop.
9967   //  The total cost of the scalar loop is
9968   //   ScalarC * TC
9969   //  where
9970   //  * TC is the actual trip count of the loop.
9971   //  * ScalarC is the cost of a single scalar iteration.
9972   //
9973   //  The total cost of the vector loop is
9974   //    RtC + VecC * (TC / VF) + EpiC
9975   //  where
9976   //  * RtC is the cost of the generated runtime checks
9977   //  * VecC is the cost of a single vector iteration.
9978   //  * TC is the actual trip count of the loop
9979   //  * VF is the vectorization factor
9980   //  * EpiCost is the cost of the generated epilogue, including the cost
9981   //    of the remaining scalar operations.
9982   //
9983   // Vectorization is profitable once the total vector cost is less than the
9984   // total scalar cost:
9985   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
9986   //
9987   // Now we can compute the minimum required trip count TC as
9988   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9989   //
9990   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9991   // the computations are performed on doubles, not integers and the result
9992   // is rounded up, hence we get an upper estimate of the TC.
9993   unsigned IntVF = VF.Width.getKnownMinValue();
9994   if (VF.Width.isScalable()) {
9995     unsigned AssumedMinimumVscale = 1;
9996     if (VScale)
9997       AssumedMinimumVscale = *VScale;
9998     IntVF *= AssumedMinimumVscale;
9999   }
10000   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
10001   double RtC = *CheckCost.getValue();
10002   double MinTC1 = RtC / (ScalarC - VecCOverVF);
10003 
10004   // Second, compute a minimum iteration count so that the cost of the
10005   // runtime checks is only a fraction of the total scalar loop cost. This
10006   // adds a loop-dependent bound on the overhead incurred if the runtime
10007   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10008   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10009   // cost, compute
10010   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10011   double MinTC2 = RtC * 10 / ScalarC;
10012 
10013   // Now pick the larger minimum. If it is not a multiple of VF, choose the
10014   // next closest multiple of VF. This should partly compensate for ignoring
10015   // the epilogue cost.
10016   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10017   VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
10018 
10019   LLVM_DEBUG(
10020       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10021              << VF.MinProfitableTripCount << "\n");
10022 
10023   // Skip vectorization if the expected trip count is less than the minimum
10024   // required trip count.
10025   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
10026     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10027                                 VF.MinProfitableTripCount)) {
10028       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10029                            "trip count < minimum profitable VF ("
10030                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10031                         << ")\n");
10032 
10033       return false;
10034     }
10035   }
10036   return true;
10037 }
10038 
10039 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10040     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10041                                !EnableLoopInterleaving),
10042       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10043                               !EnableLoopVectorization) {}
10044 
10045 bool LoopVectorizePass::processLoop(Loop *L) {
10046   assert((EnableVPlanNativePath || L->isInnermost()) &&
10047          "VPlan-native path is not enabled. Only process inner loops.");
10048 
10049 #ifndef NDEBUG
10050   const std::string DebugLocStr = getDebugLocString(L);
10051 #endif /* NDEBUG */
10052 
10053   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10054                     << L->getHeader()->getParent()->getName() << "' from "
10055                     << DebugLocStr << "\n");
10056 
10057   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10058 
10059   LLVM_DEBUG(
10060       dbgs() << "LV: Loop hints:"
10061              << " force="
10062              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10063                      ? "disabled"
10064                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10065                             ? "enabled"
10066                             : "?"))
10067              << " width=" << Hints.getWidth()
10068              << " interleave=" << Hints.getInterleave() << "\n");
10069 
10070   // Function containing loop
10071   Function *F = L->getHeader()->getParent();
10072 
10073   // Looking at the diagnostic output is the only way to determine if a loop
10074   // was vectorized (other than looking at the IR or machine code), so it
10075   // is important to generate an optimization remark for each loop. Most of
10076   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10077   // generated as OptimizationRemark and OptimizationRemarkMissed are
10078   // less verbose reporting vectorized loops and unvectorized loops that may
10079   // benefit from vectorization, respectively.
10080 
10081   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10082     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10083     return false;
10084   }
10085 
10086   PredicatedScalarEvolution PSE(*SE, *L);
10087 
10088   // Check if it is legal to vectorize the loop.
10089   LoopVectorizationRequirements Requirements;
10090   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10091                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10092   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10093     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10094     Hints.emitRemarkWithHints();
10095     return false;
10096   }
10097 
10098   // Check the function attributes and profiles to find out if this function
10099   // should be optimized for size.
10100   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10101       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10102 
10103   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10104   // here. They may require CFG and instruction level transformations before
10105   // even evaluating whether vectorization is profitable. Since we cannot modify
10106   // the incoming IR, we need to build VPlan upfront in the vectorization
10107   // pipeline.
10108   if (!L->isInnermost())
10109     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10110                                         ORE, BFI, PSI, Hints, Requirements);
10111 
10112   assert(L->isInnermost() && "Inner loop expected.");
10113 
10114   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10115   // count by optimizing for size, to minimize overheads.
10116   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10117   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10118     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10119                       << "This loop is worth vectorizing only if no scalar "
10120                       << "iteration overheads are incurred.");
10121     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10122       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10123     else {
10124       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10125         LLVM_DEBUG(dbgs() << "\n");
10126         SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10127       } else {
10128         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10129                              "small to consider vectorizing.\n");
10130         reportVectorizationFailure(
10131             "The trip count is below the minial threshold value.",
10132             "loop trip count is too low, avoiding vectorization",
10133             "LowTripCount", ORE, L);
10134         Hints.emitRemarkWithHints();
10135         return false;
10136       }
10137     }
10138   }
10139 
10140   // Check the function attributes to see if implicit floats are allowed.
10141   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10142   // an integer loop and the vector instructions selected are purely integer
10143   // vector instructions?
10144   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10145     reportVectorizationFailure(
10146         "Can't vectorize when the NoImplicitFloat attribute is used",
10147         "loop not vectorized due to NoImplicitFloat attribute",
10148         "NoImplicitFloat", ORE, L);
10149     Hints.emitRemarkWithHints();
10150     return false;
10151   }
10152 
10153   // Check if the target supports potentially unsafe FP vectorization.
10154   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10155   // for the target we're vectorizing for, to make sure none of the
10156   // additional fp-math flags can help.
10157   if (Hints.isPotentiallyUnsafe() &&
10158       TTI->isFPVectorizationPotentiallyUnsafe()) {
10159     reportVectorizationFailure(
10160         "Potentially unsafe FP op prevents vectorization",
10161         "loop not vectorized due to unsafe FP support.",
10162         "UnsafeFP", ORE, L);
10163     Hints.emitRemarkWithHints();
10164     return false;
10165   }
10166 
10167   bool AllowOrderedReductions;
10168   // If the flag is set, use that instead and override the TTI behaviour.
10169   if (ForceOrderedReductions.getNumOccurrences() > 0)
10170     AllowOrderedReductions = ForceOrderedReductions;
10171   else
10172     AllowOrderedReductions = TTI->enableOrderedReductions();
10173   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10174     ORE->emit([&]() {
10175       auto *ExactFPMathInst = Requirements.getExactFPInst();
10176       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10177                                                  ExactFPMathInst->getDebugLoc(),
10178                                                  ExactFPMathInst->getParent())
10179              << "loop not vectorized: cannot prove it is safe to reorder "
10180                 "floating-point operations";
10181     });
10182     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10183                          "reorder floating-point operations\n");
10184     Hints.emitRemarkWithHints();
10185     return false;
10186   }
10187 
10188   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10189   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10190 
10191   // If an override option has been passed in for interleaved accesses, use it.
10192   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10193     UseInterleaved = EnableInterleavedMemAccesses;
10194 
10195   // Analyze interleaved memory accesses.
10196   if (UseInterleaved) {
10197     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10198   }
10199 
10200   // Use the cost model.
10201   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10202                                 F, &Hints, IAI);
10203   CM.collectValuesToIgnore();
10204   CM.collectElementTypesForWidening();
10205 
10206   // Use the planner for vectorization.
10207   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
10208 
10209   // Get user vectorization factor and interleave count.
10210   ElementCount UserVF = Hints.getWidth();
10211   unsigned UserIC = Hints.getInterleave();
10212 
10213   // Plan how to best vectorize, return the best VF and its cost.
10214   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10215 
10216   VectorizationFactor VF = VectorizationFactor::Disabled();
10217   unsigned IC = 1;
10218 
10219   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10220                            F->getParent()->getDataLayout());
10221   if (MaybeVF) {
10222     VF = *MaybeVF;
10223     // Select the interleave count.
10224     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10225 
10226     unsigned SelectedIC = std::max(IC, UserIC);
10227     //  Optimistically generate runtime checks if they are needed. Drop them if
10228     //  they turn out to not be profitable.
10229     if (VF.Width.isVector() || SelectedIC > 1)
10230       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10231 
10232     // Check if it is profitable to vectorize with runtime checks.
10233     bool ForceVectorization =
10234         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10235     if (!ForceVectorization &&
10236         !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
10237                                     *PSE.getSE())) {
10238       ORE->emit([&]() {
10239         return OptimizationRemarkAnalysisAliasing(
10240                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10241                    L->getHeader())
10242                << "loop not vectorized: cannot prove it is safe to reorder "
10243                   "memory operations";
10244       });
10245       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10246       Hints.emitRemarkWithHints();
10247       return false;
10248     }
10249   }
10250 
10251   // Identify the diagnostic messages that should be produced.
10252   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10253   bool VectorizeLoop = true, InterleaveLoop = true;
10254   if (VF.Width.isScalar()) {
10255     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10256     VecDiagMsg = std::make_pair(
10257         "VectorizationNotBeneficial",
10258         "the cost-model indicates that vectorization is not beneficial");
10259     VectorizeLoop = false;
10260   }
10261 
10262   if (!MaybeVF && UserIC > 1) {
10263     // Tell the user interleaving was avoided up-front, despite being explicitly
10264     // requested.
10265     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10266                          "interleaving should be avoided up front\n");
10267     IntDiagMsg = std::make_pair(
10268         "InterleavingAvoided",
10269         "Ignoring UserIC, because interleaving was avoided up front");
10270     InterleaveLoop = false;
10271   } else if (IC == 1 && UserIC <= 1) {
10272     // Tell the user interleaving is not beneficial.
10273     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10274     IntDiagMsg = std::make_pair(
10275         "InterleavingNotBeneficial",
10276         "the cost-model indicates that interleaving is not beneficial");
10277     InterleaveLoop = false;
10278     if (UserIC == 1) {
10279       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10280       IntDiagMsg.second +=
10281           " and is explicitly disabled or interleave count is set to 1";
10282     }
10283   } else if (IC > 1 && UserIC == 1) {
10284     // Tell the user interleaving is beneficial, but it explicitly disabled.
10285     LLVM_DEBUG(
10286         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10287     IntDiagMsg = std::make_pair(
10288         "InterleavingBeneficialButDisabled",
10289         "the cost-model indicates that interleaving is beneficial "
10290         "but is explicitly disabled or interleave count is set to 1");
10291     InterleaveLoop = false;
10292   }
10293 
10294   // Override IC if user provided an interleave count.
10295   IC = UserIC > 0 ? UserIC : IC;
10296 
10297   // Emit diagnostic messages, if any.
10298   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10299   if (!VectorizeLoop && !InterleaveLoop) {
10300     // Do not vectorize or interleaving the loop.
10301     ORE->emit([&]() {
10302       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10303                                       L->getStartLoc(), L->getHeader())
10304              << VecDiagMsg.second;
10305     });
10306     ORE->emit([&]() {
10307       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10308                                       L->getStartLoc(), L->getHeader())
10309              << IntDiagMsg.second;
10310     });
10311     return false;
10312   } else if (!VectorizeLoop && InterleaveLoop) {
10313     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10314     ORE->emit([&]() {
10315       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10316                                         L->getStartLoc(), L->getHeader())
10317              << VecDiagMsg.second;
10318     });
10319   } else if (VectorizeLoop && !InterleaveLoop) {
10320     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10321                       << ") in " << DebugLocStr << '\n');
10322     ORE->emit([&]() {
10323       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10324                                         L->getStartLoc(), L->getHeader())
10325              << IntDiagMsg.second;
10326     });
10327   } else if (VectorizeLoop && InterleaveLoop) {
10328     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10329                       << ") in " << DebugLocStr << '\n');
10330     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10331   }
10332 
10333   bool DisableRuntimeUnroll = false;
10334   MDNode *OrigLoopID = L->getLoopID();
10335   {
10336     using namespace ore;
10337     if (!VectorizeLoop) {
10338       assert(IC > 1 && "interleave count should not be 1 or 0");
10339       // If we decided that it is not legal to vectorize the loop, then
10340       // interleave it.
10341       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10342                                  &CM, BFI, PSI, Checks);
10343 
10344       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10345       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10346 
10347       ORE->emit([&]() {
10348         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10349                                   L->getHeader())
10350                << "interleaved loop (interleaved count: "
10351                << NV("InterleaveCount", IC) << ")";
10352       });
10353     } else {
10354       // If we decided that it is *legal* to vectorize the loop, then do it.
10355 
10356       // Consider vectorizing the epilogue too if it's profitable.
10357       VectorizationFactor EpilogueVF =
10358           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10359       if (EpilogueVF.Width.isVector()) {
10360 
10361         // The first pass vectorizes the main loop and creates a scalar epilogue
10362         // to be vectorized by executing the plan (potentially with a different
10363         // factor) again shortly afterwards.
10364         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10365         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10366                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10367 
10368         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10369         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10370                         DT, true);
10371         ++LoopsVectorized;
10372 
10373         // Second pass vectorizes the epilogue and adjusts the control flow
10374         // edges from the first pass.
10375         EPI.MainLoopVF = EPI.EpilogueVF;
10376         EPI.MainLoopUF = EPI.EpilogueUF;
10377         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10378                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10379                                                  Checks);
10380 
10381         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10382         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10383         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10384         Header->setName("vec.epilog.vector.body");
10385 
10386         // Ensure that the start values for any VPReductionPHIRecipes are
10387         // updated before vectorising the epilogue loop.
10388         for (VPRecipeBase &R : Header->phis()) {
10389           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10390             if (auto *Resume = MainILV.getReductionResumeValue(
10391                     ReductionPhi->getRecurrenceDescriptor())) {
10392               VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
10393               ReductionPhi->setOperand(0, StartVal);
10394             }
10395           }
10396         }
10397 
10398         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10399                         DT, true);
10400         ++LoopsEpilogueVectorized;
10401 
10402         if (!MainILV.areSafetyChecksAdded())
10403           DisableRuntimeUnroll = true;
10404       } else {
10405         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10406                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10407                                PSI, Checks);
10408 
10409         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10410         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10411         ++LoopsVectorized;
10412 
10413         // Add metadata to disable runtime unrolling a scalar loop when there
10414         // are no runtime checks about strides and memory. A scalar loop that is
10415         // rarely used is not worth unrolling.
10416         if (!LB.areSafetyChecksAdded())
10417           DisableRuntimeUnroll = true;
10418       }
10419       // Report the vectorization decision.
10420       ORE->emit([&]() {
10421         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10422                                   L->getHeader())
10423                << "vectorized loop (vectorization width: "
10424                << NV("VectorizationFactor", VF.Width)
10425                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10426       });
10427     }
10428 
10429     if (ORE->allowExtraAnalysis(LV_NAME))
10430       checkMixedPrecision(L, ORE);
10431   }
10432 
10433   Optional<MDNode *> RemainderLoopID =
10434       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10435                                       LLVMLoopVectorizeFollowupEpilogue});
10436   if (RemainderLoopID) {
10437     L->setLoopID(RemainderLoopID.value());
10438   } else {
10439     if (DisableRuntimeUnroll)
10440       AddRuntimeUnrollDisableMetaData(L);
10441 
10442     // Mark the loop as already vectorized to avoid vectorizing again.
10443     Hints.setAlreadyVectorized();
10444   }
10445 
10446   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10447   return true;
10448 }
10449 
10450 LoopVectorizeResult LoopVectorizePass::runImpl(
10451     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10452     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10453     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10454     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10455     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10456   SE = &SE_;
10457   LI = &LI_;
10458   TTI = &TTI_;
10459   DT = &DT_;
10460   BFI = &BFI_;
10461   TLI = TLI_;
10462   AA = &AA_;
10463   AC = &AC_;
10464   GetLAA = &GetLAA_;
10465   DB = &DB_;
10466   ORE = &ORE_;
10467   PSI = PSI_;
10468 
10469   // Don't attempt if
10470   // 1. the target claims to have no vector registers, and
10471   // 2. interleaving won't help ILP.
10472   //
10473   // The second condition is necessary because, even if the target has no
10474   // vector registers, loop vectorization may still enable scalar
10475   // interleaving.
10476   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10477       TTI->getMaxInterleaveFactor(1) < 2)
10478     return LoopVectorizeResult(false, false);
10479 
10480   bool Changed = false, CFGChanged = false;
10481 
10482   // The vectorizer requires loops to be in simplified form.
10483   // Since simplification may add new inner loops, it has to run before the
10484   // legality and profitability checks. This means running the loop vectorizer
10485   // will simplify all loops, regardless of whether anything end up being
10486   // vectorized.
10487   for (auto &L : *LI)
10488     Changed |= CFGChanged |=
10489         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10490 
10491   // Build up a worklist of inner-loops to vectorize. This is necessary as
10492   // the act of vectorizing or partially unrolling a loop creates new loops
10493   // and can invalidate iterators across the loops.
10494   SmallVector<Loop *, 8> Worklist;
10495 
10496   for (Loop *L : *LI)
10497     collectSupportedLoops(*L, LI, ORE, Worklist);
10498 
10499   LoopsAnalyzed += Worklist.size();
10500 
10501   // Now walk the identified inner loops.
10502   while (!Worklist.empty()) {
10503     Loop *L = Worklist.pop_back_val();
10504 
10505     // For the inner loops we actually process, form LCSSA to simplify the
10506     // transform.
10507     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10508 
10509     Changed |= CFGChanged |= processLoop(L);
10510   }
10511 
10512   // Process each loop nest in the function.
10513   return LoopVectorizeResult(Changed, CFGChanged);
10514 }
10515 
10516 PreservedAnalyses LoopVectorizePass::run(Function &F,
10517                                          FunctionAnalysisManager &AM) {
10518     auto &LI = AM.getResult<LoopAnalysis>(F);
10519     // There are no loops in the function. Return before computing other expensive
10520     // analyses.
10521     if (LI.empty())
10522       return PreservedAnalyses::all();
10523     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10524     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10525     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10526     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10527     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10528     auto &AA = AM.getResult<AAManager>(F);
10529     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10530     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10531     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10532 
10533     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10534     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10535         [&](Loop &L) -> const LoopAccessInfo & {
10536       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10537                                         TLI, TTI, nullptr, nullptr, nullptr};
10538       return LAM.getResult<LoopAccessAnalysis>(L, AR);
10539     };
10540     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10541     ProfileSummaryInfo *PSI =
10542         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10543     LoopVectorizeResult Result =
10544         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10545     if (!Result.MadeAnyChange)
10546       return PreservedAnalyses::all();
10547     PreservedAnalyses PA;
10548 
10549     // We currently do not preserve loopinfo/dominator analyses with outer loop
10550     // vectorization. Until this is addressed, mark these analyses as preserved
10551     // only for non-VPlan-native path.
10552     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10553     if (!EnableVPlanNativePath) {
10554       PA.preserve<LoopAnalysis>();
10555       PA.preserve<DominatorTreeAnalysis>();
10556     }
10557 
10558     if (Result.MadeCFGChange) {
10559       // Making CFG changes likely means a loop got vectorized. Indicate that
10560       // extra simplification passes should be run.
10561       // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10562       // be run if runtime checks have been added.
10563       AM.getResult<ShouldRunExtraVectorPasses>(F);
10564       PA.preserve<ShouldRunExtraVectorPasses>();
10565     } else {
10566       PA.preserveSet<CFGAnalyses>();
10567     }
10568     return PA;
10569 }
10570 
10571 void LoopVectorizePass::printPipeline(
10572     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10573   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10574       OS, MapClassName2PassName);
10575 
10576   OS << "<";
10577   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10578   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10579   OS << ">";
10580 }
10581