xref: /freebsd-src/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (revision 0eae32dcef82f6f06de6419a0d623d7def0cc8f6)
1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks with a "
204              "vectorize(enable) pragma."));
205 
206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
207 // that predication is preferred, and this lists all options. I.e., the
208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
209 // and predicate the instructions accordingly. If tail-folding fails, there are
210 // different fallback strategies depending on these values:
211 namespace PreferPredicateTy {
212   enum Option {
213     ScalarEpilogue = 0,
214     PredicateElseScalarEpilogue,
215     PredicateOrDontVectorize
216   };
217 } // namespace PreferPredicateTy
218 
219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220     "prefer-predicate-over-epilogue",
221     cl::init(PreferPredicateTy::ScalarEpilogue),
222     cl::Hidden,
223     cl::desc("Tail-folding and predication preferences over creating a scalar "
224              "epilogue loop."),
225     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226                          "scalar-epilogue",
227                          "Don't tail-predicate loops, create scalar epilogue"),
228               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229                          "predicate-else-scalar-epilogue",
230                          "prefer tail-folding, create scalar epilogue if tail "
231                          "folding fails."),
232               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233                          "predicate-dont-vectorize",
234                          "prefers tail-folding, don't attempt vectorization if "
235                          "tail-folding fails.")));
236 
237 static cl::opt<bool> MaximizeBandwidth(
238     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239     cl::desc("Maximize bandwidth when selecting vectorization factor which "
240              "will be determined by the smallest type in loop."));
241 
242 static cl::opt<bool> EnableInterleavedMemAccesses(
243     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245 
246 /// An interleave-group may need masking if it resides in a block that needs
247 /// predication, or in order to mask away gaps.
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251 
252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254     cl::desc("We don't interleave loops with a estimated constant trip count "
255              "below this number"));
256 
257 static cl::opt<unsigned> ForceTargetNumScalarRegs(
258     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259     cl::desc("A flag that overrides the target's number of scalar registers."));
260 
261 static cl::opt<unsigned> ForceTargetNumVectorRegs(
262     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263     cl::desc("A flag that overrides the target's number of vector registers."));
264 
265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267     cl::desc("A flag that overrides the target's max interleave factor for "
268              "scalar loops."));
269 
270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272     cl::desc("A flag that overrides the target's max interleave factor for "
273              "vectorized loops."));
274 
275 static cl::opt<unsigned> ForceTargetInstructionCost(
276     "force-target-instruction-cost", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's expected cost for "
278              "an instruction to a single constant value. Mostly "
279              "useful for getting consistent testing."));
280 
281 static cl::opt<bool> ForceTargetSupportsScalableVectors(
282     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283     cl::desc(
284         "Pretend that scalable vectors are supported, even if the target does "
285         "not support them. This flag should only be used for testing."));
286 
287 static cl::opt<unsigned> SmallLoopCost(
288     "small-loop-cost", cl::init(20), cl::Hidden,
289     cl::desc(
290         "The cost of a loop that is considered 'small' by the interleaver."));
291 
292 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294     cl::desc("Enable the use of the block frequency analysis to access PGO "
295              "heuristics minimizing code growth in cold regions and being more "
296              "aggressive in hot regions."));
297 
298 // Runtime interleave loops for load/store throughput.
299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301     cl::desc(
302         "Enable runtime interleaving until load/store ports are saturated"));
303 
304 /// Interleave small loops with scalar reductions.
305 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307     cl::desc("Enable interleaving for loops with small iteration counts that "
308              "contain scalar reductions to expose ILP."));
309 
310 /// The number of stores in a loop that are allowed to need predication.
311 static cl::opt<unsigned> NumberOfStoresToPredicate(
312     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313     cl::desc("Max number of stores to be predicated behind an if."));
314 
315 static cl::opt<bool> EnableIndVarRegisterHeur(
316     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317     cl::desc("Count the induction variable only once when interleaving"));
318 
319 static cl::opt<bool> EnableCondStoresVectorization(
320     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
321     cl::desc("Enable if predication of stores during vectorization."));
322 
323 static cl::opt<unsigned> MaxNestedScalarReductionIC(
324     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325     cl::desc("The maximum interleave count to use when interleaving a scalar "
326              "reduction in a nested loop."));
327 
328 static cl::opt<bool>
329     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330                            cl::Hidden,
331                            cl::desc("Prefer in-loop vector reductions, "
332                                     "overriding the targets preference."));
333 
334 static cl::opt<bool> ForceOrderedReductions(
335     "force-ordered-reductions", cl::init(false), cl::Hidden,
336     cl::desc("Enable the vectorisation of loops with in-order (strict) "
337              "FP reductions"));
338 
339 static cl::opt<bool> PreferPredicatedReductionSelect(
340     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341     cl::desc(
342         "Prefer predicating a reduction operation over an after loop select."));
343 
344 cl::opt<bool> EnableVPlanNativePath(
345     "enable-vplan-native-path", cl::init(false), cl::Hidden,
346     cl::desc("Enable VPlan-native vectorization path with "
347              "support for outer loop vectorization."));
348 
349 // FIXME: Remove this switch once we have divergence analysis. Currently we
350 // assume divergent non-backedge branches when this switch is true.
351 cl::opt<bool> EnableVPlanPredication(
352     "enable-vplan-predication", cl::init(false), cl::Hidden,
353     cl::desc("Enable VPlan-native vectorization path predicator with "
354              "support for outer loop vectorization."));
355 
356 // This flag enables the stress testing of the VPlan H-CFG construction in the
357 // VPlan-native vectorization path. It must be used in conjuction with
358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359 // verification of the H-CFGs built.
360 static cl::opt<bool> VPlanBuildStressTest(
361     "vplan-build-stress-test", cl::init(false), cl::Hidden,
362     cl::desc(
363         "Build VPlan for every supported loop nest in the function and bail "
364         "out right after the build (stress test the VPlan H-CFG construction "
365         "in the VPlan-native vectorization path)."));
366 
367 cl::opt<bool> llvm::EnableLoopInterleaving(
368     "interleave-loops", cl::init(true), cl::Hidden,
369     cl::desc("Enable loop interleaving in Loop vectorization passes"));
370 cl::opt<bool> llvm::EnableLoopVectorization(
371     "vectorize-loops", cl::init(true), cl::Hidden,
372     cl::desc("Run the Loop vectorization passes"));
373 
374 cl::opt<bool> PrintVPlansInDotFormat(
375     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376     cl::desc("Use dot format instead of plain text when dumping VPlans"));
377 
378 /// A helper function that returns true if the given type is irregular. The
379 /// type is irregular if its allocated size doesn't equal the store size of an
380 /// element of the corresponding vector type.
381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382   // Determine if an array of N elements of type Ty is "bitcast compatible"
383   // with a <N x Ty> vector.
384   // This is only true if there is no padding between the array elements.
385   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386 }
387 
388 /// A helper function that returns the reciprocal of the block probability of
389 /// predicated blocks. If we return X, we are assuming the predicated block
390 /// will execute once for every X iterations of the loop header.
391 ///
392 /// TODO: We should use actual block probability here, if available. Currently,
393 ///       we always assume predicated blocks have a 50% chance of executing.
394 static unsigned getReciprocalPredBlockProb() { return 2; }
395 
396 /// A helper function that returns an integer or floating-point constant with
397 /// value C.
398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400                            : ConstantFP::get(Ty, C);
401 }
402 
403 /// Returns "best known" trip count for the specified loop \p L as defined by
404 /// the following procedure:
405 ///   1) Returns exact trip count if it is known.
406 ///   2) Returns expected trip count according to profile data if any.
407 ///   3) Returns upper bound estimate if it is known.
408 ///   4) Returns None if all of the above failed.
409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410   // Check if exact trip count is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412     return ExpectedTC;
413 
414   // Check if there is an expected trip count available from profile data.
415   if (LoopVectorizeWithBlockFrequency)
416     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417       return EstimatedTC;
418 
419   // Check if upper bound estimate is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421     return ExpectedTC;
422 
423   return None;
424 }
425 
426 // Forward declare GeneratedRTChecks.
427 class GeneratedRTChecks;
428 
429 namespace llvm {
430 
431 AnalysisKey ShouldRunExtraVectorPasses::Key;
432 
433 /// InnerLoopVectorizer vectorizes loops which contain only one basic
434 /// block to a specified vectorization factor (VF).
435 /// This class performs the widening of scalars into vectors, or multiple
436 /// scalars. This class also implements the following features:
437 /// * It inserts an epilogue loop for handling loops that don't have iteration
438 ///   counts that are known to be a multiple of the vectorization factor.
439 /// * It handles the code generation for reduction variables.
440 /// * Scalarization (implementation using scalars) of un-vectorizable
441 ///   instructions.
442 /// InnerLoopVectorizer does not perform any vectorization-legality
443 /// checks, and relies on the caller to check for the different legality
444 /// aspects. The InnerLoopVectorizer relies on the
445 /// LoopVectorizationLegality class to provide information about the induction
446 /// and reduction variables that were found to a given vectorization factor.
447 class InnerLoopVectorizer {
448 public:
449   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
450                       LoopInfo *LI, DominatorTree *DT,
451                       const TargetLibraryInfo *TLI,
452                       const TargetTransformInfo *TTI, AssumptionCache *AC,
453                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
454                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
455                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
456                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
457       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
458         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
459         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
460         PSI(PSI), RTChecks(RTChecks) {
461     // Query this against the original loop and save it here because the profile
462     // of the original loop header may change as the transformation happens.
463     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
464         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465   }
466 
467   virtual ~InnerLoopVectorizer() = default;
468 
469   /// Create a new empty loop that will contain vectorized instructions later
470   /// on, while the old loop will be used as the scalar remainder. Control flow
471   /// is generated around the vectorized (and scalar epilogue) loops consisting
472   /// of various checks and bypasses. Return the pre-header block of the new
473   /// loop.
474   /// In the case of epilogue vectorization, this function is overriden to
475   /// handle the more complex control flow around the loops.
476   virtual BasicBlock *createVectorizedLoopSkeleton();
477 
478   /// Widen a single call instruction within the innermost loop.
479   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
480                             VPTransformState &State);
481 
482   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
483   void fixVectorizedLoop(VPTransformState &State);
484 
485   // Return true if any runtime check is added.
486   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
487 
488   /// A type for vectorized values in the new loop. Each value from the
489   /// original loop, when vectorized, is represented by UF vector values in the
490   /// new unrolled loop, where UF is the unroll factor.
491   using VectorParts = SmallVector<Value *, 2>;
492 
493   /// Vectorize a single first-order recurrence or pointer induction PHINode in
494   /// a block. This method handles the induction variable canonicalization. It
495   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
496   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
497                            VPTransformState &State);
498 
499   /// A helper function to scalarize a single Instruction in the innermost loop.
500   /// Generates a sequence of scalar instances for each lane between \p MinLane
501   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
502   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
503   /// Instr's operands.
504   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
505                             const VPIteration &Instance, bool IfPredicateInstr,
506                             VPTransformState &State);
507 
508   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
509   /// is provided, the integer induction variable will first be truncated to
510   /// the corresponding type.
511   void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
512                              Value *Start, TruncInst *Trunc, VPValue *Def,
513                              VPTransformState &State);
514 
515   /// Construct the vector value of a scalarized value \p V one lane at a time.
516   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
517                                  VPTransformState &State);
518 
519   /// Try to vectorize interleaved access group \p Group with the base address
520   /// given in \p Addr, optionally masking the vector operations if \p
521   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
522   /// values in the vectorized loop.
523   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
524                                 ArrayRef<VPValue *> VPDefs,
525                                 VPTransformState &State, VPValue *Addr,
526                                 ArrayRef<VPValue *> StoredValues,
527                                 VPValue *BlockInMask = nullptr);
528 
529   /// Set the debug location in the builder \p Ptr using the debug location in
530   /// \p V. If \p Ptr is None then it uses the class member's Builder.
531   void setDebugLocFromInst(const Value *V,
532                            Optional<IRBuilder<> *> CustomBuilder = None);
533 
534   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
535   void fixNonInductionPHIs(VPTransformState &State);
536 
537   /// Returns true if the reordering of FP operations is not allowed, but we are
538   /// able to vectorize with strict in-order reductions for the given RdxDesc.
539   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
540 
541   /// Create a broadcast instruction. This method generates a broadcast
542   /// instruction (shuffle) for loop invariant values and for the induction
543   /// value. If this is the induction variable then we extend it to N, N+1, ...
544   /// this is needed because each iteration in the loop corresponds to a SIMD
545   /// element.
546   virtual Value *getBroadcastInstrs(Value *V);
547 
548   /// Add metadata from one instruction to another.
549   ///
550   /// This includes both the original MDs from \p From and additional ones (\see
551   /// addNewMetadata).  Use this for *newly created* instructions in the vector
552   /// loop.
553   void addMetadata(Instruction *To, Instruction *From);
554 
555   /// Similar to the previous function but it adds the metadata to a
556   /// vector of instructions.
557   void addMetadata(ArrayRef<Value *> To, Instruction *From);
558 
559 protected:
560   friend class LoopVectorizationPlanner;
561 
562   /// A small list of PHINodes.
563   using PhiVector = SmallVector<PHINode *, 4>;
564 
565   /// A type for scalarized values in the new loop. Each value from the
566   /// original loop, when scalarized, is represented by UF x VF scalar values
567   /// in the new unrolled loop, where UF is the unroll factor and VF is the
568   /// vectorization factor.
569   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
570 
571   /// Set up the values of the IVs correctly when exiting the vector loop.
572   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
573                     Value *CountRoundDown, Value *EndValue,
574                     BasicBlock *MiddleBlock);
575 
576   /// Create a new induction variable inside L.
577   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
578                                    Value *Step, Instruction *DL);
579 
580   /// Handle all cross-iteration phis in the header.
581   void fixCrossIterationPHIs(VPTransformState &State);
582 
583   /// Create the exit value of first order recurrences in the middle block and
584   /// update their users.
585   void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
586 
587   /// Create code for the loop exit value of the reduction.
588   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
589 
590   /// Clear NSW/NUW flags from reduction instructions if necessary.
591   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
592                                VPTransformState &State);
593 
594   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
595   /// means we need to add the appropriate incoming value from the middle
596   /// block as exiting edges from the scalar epilogue loop (if present) are
597   /// already in place, and we exit the vector loop exclusively to the middle
598   /// block.
599   void fixLCSSAPHIs(VPTransformState &State);
600 
601   /// Iteratively sink the scalarized operands of a predicated instruction into
602   /// the block that was created for it.
603   void sinkScalarOperands(Instruction *PredInst);
604 
605   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
606   /// represented as.
607   void truncateToMinimalBitwidths(VPTransformState &State);
608 
609   /// This function adds
610   /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
611   /// to each vector element of Val. The sequence starts at StartIndex.
612   /// \p Opcode is relevant for FP induction variable.
613   virtual Value *
614   getStepVector(Value *Val, Value *StartIdx, Value *Step,
615                 Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
616 
617   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
618   /// variable on which to base the steps, \p Step is the size of the step, and
619   /// \p EntryVal is the value from the original loop that maps to the steps.
620   /// Note that \p EntryVal doesn't have to be an induction variable - it
621   /// can also be a truncate instruction.
622   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
623                         const InductionDescriptor &ID, VPValue *Def,
624                         VPTransformState &State);
625 
626   /// Create a vector induction phi node based on an existing scalar one. \p
627   /// EntryVal is the value from the original loop that maps to the vector phi
628   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
629   /// truncate instruction, instead of widening the original IV, we widen a
630   /// version of the IV truncated to \p EntryVal's type.
631   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
632                                        Value *Step, Value *Start,
633                                        Instruction *EntryVal, VPValue *Def,
634                                        VPTransformState &State);
635 
636   /// Returns true if an instruction \p I should be scalarized instead of
637   /// vectorized for the chosen vectorization factor.
638   bool shouldScalarizeInstruction(Instruction *I) const;
639 
640   /// Returns true if we should generate a scalar version of \p IV.
641   bool needsScalarInduction(Instruction *IV) const;
642 
643   /// Generate a shuffle sequence that will reverse the vector Vec.
644   virtual Value *reverseVector(Value *Vec);
645 
646   /// Returns (and creates if needed) the original loop trip count.
647   Value *getOrCreateTripCount(Loop *NewLoop);
648 
649   /// Returns (and creates if needed) the trip count of the widened loop.
650   Value *getOrCreateVectorTripCount(Loop *NewLoop);
651 
652   /// Returns a bitcasted value to the requested vector type.
653   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
654   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
655                                 const DataLayout &DL);
656 
657   /// Emit a bypass check to see if the vector trip count is zero, including if
658   /// it overflows.
659   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
660 
661   /// Emit a bypass check to see if all of the SCEV assumptions we've
662   /// had to make are correct. Returns the block containing the checks or
663   /// nullptr if no checks have been added.
664   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
665 
666   /// Emit bypass checks to check any memory assumptions we may have made.
667   /// Returns the block containing the checks or nullptr if no checks have been
668   /// added.
669   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
670 
671   /// Compute the transformed value of Index at offset StartValue using step
672   /// StepValue.
673   /// For integer induction, returns StartValue + Index * StepValue.
674   /// For pointer induction, returns StartValue[Index * StepValue].
675   /// FIXME: The newly created binary instructions should contain nsw/nuw
676   /// flags, which can be found from the original scalar operations.
677   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
678                               const DataLayout &DL,
679                               const InductionDescriptor &ID,
680                               BasicBlock *VectorHeader) const;
681 
682   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
683   /// vector loop preheader, middle block and scalar preheader. Also
684   /// allocate a loop object for the new vector loop and return it.
685   Loop *createVectorLoopSkeleton(StringRef Prefix);
686 
687   /// Create new phi nodes for the induction variables to resume iteration count
688   /// in the scalar epilogue, from where the vectorized loop left off (given by
689   /// \p VectorTripCount).
690   /// In cases where the loop skeleton is more complicated (eg. epilogue
691   /// vectorization) and the resume values can come from an additional bypass
692   /// block, the \p AdditionalBypass pair provides information about the bypass
693   /// block and the end value on the edge from bypass to this loop.
694   void createInductionResumeValues(
695       Loop *L, Value *VectorTripCount,
696       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
697 
698   /// Complete the loop skeleton by adding debug MDs, creating appropriate
699   /// conditional branches in the middle block, preparing the builder and
700   /// running the verifier. Take in the vector loop \p L as argument, and return
701   /// the preheader of the completed vector loop.
702   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
703 
704   /// Add additional metadata to \p To that was not present on \p Orig.
705   ///
706   /// Currently this is used to add the noalias annotations based on the
707   /// inserted memchecks.  Use this for instructions that are *cloned* into the
708   /// vector loop.
709   void addNewMetadata(Instruction *To, const Instruction *Orig);
710 
711   /// Collect poison-generating recipes that may generate a poison value that is
712   /// used after vectorization, even when their operands are not poison. Those
713   /// recipes meet the following conditions:
714   ///  * Contribute to the address computation of a recipe generating a widen
715   ///    memory load/store (VPWidenMemoryInstructionRecipe or
716   ///    VPInterleaveRecipe).
717   ///  * Such a widen memory load/store has at least one underlying Instruction
718   ///    that is in a basic block that needs predication and after vectorization
719   ///    the generated instruction won't be predicated.
720   void collectPoisonGeneratingRecipes(VPTransformState &State);
721 
722   /// Allow subclasses to override and print debug traces before/after vplan
723   /// execution, when trace information is requested.
724   virtual void printDebugTracesAtStart(){};
725   virtual void printDebugTracesAtEnd(){};
726 
727   /// The original loop.
728   Loop *OrigLoop;
729 
730   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
731   /// dynamic knowledge to simplify SCEV expressions and converts them to a
732   /// more usable form.
733   PredicatedScalarEvolution &PSE;
734 
735   /// Loop Info.
736   LoopInfo *LI;
737 
738   /// Dominator Tree.
739   DominatorTree *DT;
740 
741   /// Alias Analysis.
742   AAResults *AA;
743 
744   /// Target Library Info.
745   const TargetLibraryInfo *TLI;
746 
747   /// Target Transform Info.
748   const TargetTransformInfo *TTI;
749 
750   /// Assumption Cache.
751   AssumptionCache *AC;
752 
753   /// Interface to emit optimization remarks.
754   OptimizationRemarkEmitter *ORE;
755 
756   /// LoopVersioning.  It's only set up (non-null) if memchecks were
757   /// used.
758   ///
759   /// This is currently only used to add no-alias metadata based on the
760   /// memchecks.  The actually versioning is performed manually.
761   std::unique_ptr<LoopVersioning> LVer;
762 
763   /// The vectorization SIMD factor to use. Each vector will have this many
764   /// vector elements.
765   ElementCount VF;
766 
767   /// The vectorization unroll factor to use. Each scalar is vectorized to this
768   /// many different vector instructions.
769   unsigned UF;
770 
771   /// The builder that we use
772   IRBuilder<> Builder;
773 
774   // --- Vectorization state ---
775 
776   /// The vector-loop preheader.
777   BasicBlock *LoopVectorPreHeader;
778 
779   /// The scalar-loop preheader.
780   BasicBlock *LoopScalarPreHeader;
781 
782   /// Middle Block between the vector and the scalar.
783   BasicBlock *LoopMiddleBlock;
784 
785   /// The unique ExitBlock of the scalar loop if one exists.  Note that
786   /// there can be multiple exiting edges reaching this block.
787   BasicBlock *LoopExitBlock;
788 
789   /// The vector loop body.
790   BasicBlock *LoopVectorBody;
791 
792   /// The scalar loop body.
793   BasicBlock *LoopScalarBody;
794 
795   /// A list of all bypass blocks. The first block is the entry of the loop.
796   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
797 
798   /// The new Induction variable which was added to the new block.
799   PHINode *Induction = nullptr;
800 
801   /// The induction variable of the old basic block.
802   PHINode *OldInduction = nullptr;
803 
804   /// Store instructions that were predicated.
805   SmallVector<Instruction *, 4> PredicatedInstructions;
806 
807   /// Trip count of the original loop.
808   Value *TripCount = nullptr;
809 
810   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
811   Value *VectorTripCount = nullptr;
812 
813   /// The legality analysis.
814   LoopVectorizationLegality *Legal;
815 
816   /// The profitablity analysis.
817   LoopVectorizationCostModel *Cost;
818 
819   // Record whether runtime checks are added.
820   bool AddedSafetyChecks = false;
821 
822   // Holds the end values for each induction variable. We save the end values
823   // so we can later fix-up the external users of the induction variables.
824   DenseMap<PHINode *, Value *> IVEndValues;
825 
826   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
827   // fixed up at the end of vector code generation.
828   SmallVector<PHINode *, 8> OrigPHIsToFix;
829 
830   /// BFI and PSI are used to check for profile guided size optimizations.
831   BlockFrequencyInfo *BFI;
832   ProfileSummaryInfo *PSI;
833 
834   // Whether this loop should be optimized for size based on profile guided size
835   // optimizatios.
836   bool OptForSizeBasedOnProfile;
837 
838   /// Structure to hold information about generated runtime checks, responsible
839   /// for cleaning the checks, if vectorization turns out unprofitable.
840   GeneratedRTChecks &RTChecks;
841 };
842 
843 class InnerLoopUnroller : public InnerLoopVectorizer {
844 public:
845   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
846                     LoopInfo *LI, DominatorTree *DT,
847                     const TargetLibraryInfo *TLI,
848                     const TargetTransformInfo *TTI, AssumptionCache *AC,
849                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
850                     LoopVectorizationLegality *LVL,
851                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
852                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
853       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
854                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
855                             BFI, PSI, Check) {}
856 
857 private:
858   Value *getBroadcastInstrs(Value *V) override;
859   Value *getStepVector(
860       Value *Val, Value *StartIdx, Value *Step,
861       Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
862   Value *reverseVector(Value *Vec) override;
863 };
864 
865 /// Encapsulate information regarding vectorization of a loop and its epilogue.
866 /// This information is meant to be updated and used across two stages of
867 /// epilogue vectorization.
868 struct EpilogueLoopVectorizationInfo {
869   ElementCount MainLoopVF = ElementCount::getFixed(0);
870   unsigned MainLoopUF = 0;
871   ElementCount EpilogueVF = ElementCount::getFixed(0);
872   unsigned EpilogueUF = 0;
873   BasicBlock *MainLoopIterationCountCheck = nullptr;
874   BasicBlock *EpilogueIterationCountCheck = nullptr;
875   BasicBlock *SCEVSafetyCheck = nullptr;
876   BasicBlock *MemSafetyCheck = nullptr;
877   Value *TripCount = nullptr;
878   Value *VectorTripCount = nullptr;
879 
880   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
881                                 ElementCount EVF, unsigned EUF)
882       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
883     assert(EUF == 1 &&
884            "A high UF for the epilogue loop is likely not beneficial.");
885   }
886 };
887 
888 /// An extension of the inner loop vectorizer that creates a skeleton for a
889 /// vectorized loop that has its epilogue (residual) also vectorized.
890 /// The idea is to run the vplan on a given loop twice, firstly to setup the
891 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
892 /// from the first step and vectorize the epilogue.  This is achieved by
893 /// deriving two concrete strategy classes from this base class and invoking
894 /// them in succession from the loop vectorizer planner.
895 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
896 public:
897   InnerLoopAndEpilogueVectorizer(
898       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
899       DominatorTree *DT, const TargetLibraryInfo *TLI,
900       const TargetTransformInfo *TTI, AssumptionCache *AC,
901       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
902       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
903       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
904       GeneratedRTChecks &Checks)
905       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
906                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
907                             Checks),
908         EPI(EPI) {}
909 
910   // Override this function to handle the more complex control flow around the
911   // three loops.
912   BasicBlock *createVectorizedLoopSkeleton() final override {
913     return createEpilogueVectorizedLoopSkeleton();
914   }
915 
916   /// The interface for creating a vectorized skeleton using one of two
917   /// different strategies, each corresponding to one execution of the vplan
918   /// as described above.
919   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
920 
921   /// Holds and updates state information required to vectorize the main loop
922   /// and its epilogue in two separate passes. This setup helps us avoid
923   /// regenerating and recomputing runtime safety checks. It also helps us to
924   /// shorten the iteration-count-check path length for the cases where the
925   /// iteration count of the loop is so small that the main vector loop is
926   /// completely skipped.
927   EpilogueLoopVectorizationInfo &EPI;
928 };
929 
930 /// A specialized derived class of inner loop vectorizer that performs
931 /// vectorization of *main* loops in the process of vectorizing loops and their
932 /// epilogues.
933 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
934 public:
935   EpilogueVectorizerMainLoop(
936       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
937       DominatorTree *DT, const TargetLibraryInfo *TLI,
938       const TargetTransformInfo *TTI, AssumptionCache *AC,
939       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
940       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
941       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
942       GeneratedRTChecks &Check)
943       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
944                                        EPI, LVL, CM, BFI, PSI, Check) {}
945   /// Implements the interface for creating a vectorized skeleton using the
946   /// *main loop* strategy (ie the first pass of vplan execution).
947   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
948 
949 protected:
950   /// Emits an iteration count bypass check once for the main loop (when \p
951   /// ForEpilogue is false) and once for the epilogue loop (when \p
952   /// ForEpilogue is true).
953   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
954                                              bool ForEpilogue);
955   void printDebugTracesAtStart() override;
956   void printDebugTracesAtEnd() override;
957 };
958 
959 // A specialized derived class of inner loop vectorizer that performs
960 // vectorization of *epilogue* loops in the process of vectorizing loops and
961 // their epilogues.
962 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
963 public:
964   EpilogueVectorizerEpilogueLoop(
965       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
966       DominatorTree *DT, const TargetLibraryInfo *TLI,
967       const TargetTransformInfo *TTI, AssumptionCache *AC,
968       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
969       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
970       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
971       GeneratedRTChecks &Checks)
972       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
973                                        EPI, LVL, CM, BFI, PSI, Checks) {}
974   /// Implements the interface for creating a vectorized skeleton using the
975   /// *epilogue loop* strategy (ie the second pass of vplan execution).
976   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
977 
978 protected:
979   /// Emits an iteration count bypass check after the main vector loop has
980   /// finished to see if there are any iterations left to execute by either
981   /// the vector epilogue or the scalar epilogue.
982   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
983                                                       BasicBlock *Bypass,
984                                                       BasicBlock *Insert);
985   void printDebugTracesAtStart() override;
986   void printDebugTracesAtEnd() override;
987 };
988 } // end namespace llvm
989 
990 /// Look for a meaningful debug location on the instruction or it's
991 /// operands.
992 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
993   if (!I)
994     return I;
995 
996   DebugLoc Empty;
997   if (I->getDebugLoc() != Empty)
998     return I;
999 
1000   for (Use &Op : I->operands()) {
1001     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1002       if (OpInst->getDebugLoc() != Empty)
1003         return OpInst;
1004   }
1005 
1006   return I;
1007 }
1008 
1009 void InnerLoopVectorizer::setDebugLocFromInst(
1010     const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1011   IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1012   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1013     const DILocation *DIL = Inst->getDebugLoc();
1014 
1015     // When a FSDiscriminator is enabled, we don't need to add the multiply
1016     // factors to the discriminators.
1017     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1018         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1019       // FIXME: For scalable vectors, assume vscale=1.
1020       auto NewDIL =
1021           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1022       if (NewDIL)
1023         B->SetCurrentDebugLocation(NewDIL.getValue());
1024       else
1025         LLVM_DEBUG(dbgs()
1026                    << "Failed to create new discriminator: "
1027                    << DIL->getFilename() << " Line: " << DIL->getLine());
1028     } else
1029       B->SetCurrentDebugLocation(DIL);
1030   } else
1031     B->SetCurrentDebugLocation(DebugLoc());
1032 }
1033 
1034 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1035 /// is passed, the message relates to that particular instruction.
1036 #ifndef NDEBUG
1037 static void debugVectorizationMessage(const StringRef Prefix,
1038                                       const StringRef DebugMsg,
1039                                       Instruction *I) {
1040   dbgs() << "LV: " << Prefix << DebugMsg;
1041   if (I != nullptr)
1042     dbgs() << " " << *I;
1043   else
1044     dbgs() << '.';
1045   dbgs() << '\n';
1046 }
1047 #endif
1048 
1049 /// Create an analysis remark that explains why vectorization failed
1050 ///
1051 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1052 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1053 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1054 /// the location of the remark.  \return the remark object that can be
1055 /// streamed to.
1056 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1057     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1058   Value *CodeRegion = TheLoop->getHeader();
1059   DebugLoc DL = TheLoop->getStartLoc();
1060 
1061   if (I) {
1062     CodeRegion = I->getParent();
1063     // If there is no debug location attached to the instruction, revert back to
1064     // using the loop's.
1065     if (I->getDebugLoc())
1066       DL = I->getDebugLoc();
1067   }
1068 
1069   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1070 }
1071 
1072 /// Return a value for Step multiplied by VF.
1073 static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
1074                               int64_t Step) {
1075   assert(Ty->isIntegerTy() && "Expected an integer step");
1076   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1077   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1078 }
1079 
1080 namespace llvm {
1081 
1082 /// Return the runtime value for VF.
1083 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1084   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1085   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1086 }
1087 
1088 static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
1089   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1090   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1091   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1092   return B.CreateUIToFP(RuntimeVF, FTy);
1093 }
1094 
1095 void reportVectorizationFailure(const StringRef DebugMsg,
1096                                 const StringRef OREMsg, const StringRef ORETag,
1097                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1098                                 Instruction *I) {
1099   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1100   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1101   ORE->emit(
1102       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1103       << "loop not vectorized: " << OREMsg);
1104 }
1105 
1106 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1107                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1108                              Instruction *I) {
1109   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1110   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1111   ORE->emit(
1112       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1113       << Msg);
1114 }
1115 
1116 } // end namespace llvm
1117 
1118 #ifndef NDEBUG
1119 /// \return string containing a file name and a line # for the given loop.
1120 static std::string getDebugLocString(const Loop *L) {
1121   std::string Result;
1122   if (L) {
1123     raw_string_ostream OS(Result);
1124     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1125       LoopDbgLoc.print(OS);
1126     else
1127       // Just print the module name.
1128       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1129     OS.flush();
1130   }
1131   return Result;
1132 }
1133 #endif
1134 
1135 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1136                                          const Instruction *Orig) {
1137   // If the loop was versioned with memchecks, add the corresponding no-alias
1138   // metadata.
1139   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1140     LVer->annotateInstWithNoAlias(To, Orig);
1141 }
1142 
1143 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1144     VPTransformState &State) {
1145 
1146   // Collect recipes in the backward slice of `Root` that may generate a poison
1147   // value that is used after vectorization.
1148   SmallPtrSet<VPRecipeBase *, 16> Visited;
1149   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1150     SmallVector<VPRecipeBase *, 16> Worklist;
1151     Worklist.push_back(Root);
1152 
1153     // Traverse the backward slice of Root through its use-def chain.
1154     while (!Worklist.empty()) {
1155       VPRecipeBase *CurRec = Worklist.back();
1156       Worklist.pop_back();
1157 
1158       if (!Visited.insert(CurRec).second)
1159         continue;
1160 
1161       // Prune search if we find another recipe generating a widen memory
1162       // instruction. Widen memory instructions involved in address computation
1163       // will lead to gather/scatter instructions, which don't need to be
1164       // handled.
1165       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1166           isa<VPInterleaveRecipe>(CurRec))
1167         continue;
1168 
1169       // This recipe contributes to the address computation of a widen
1170       // load/store. Collect recipe if its underlying instruction has
1171       // poison-generating flags.
1172       Instruction *Instr = CurRec->getUnderlyingInstr();
1173       if (Instr && Instr->hasPoisonGeneratingFlags())
1174         State.MayGeneratePoisonRecipes.insert(CurRec);
1175 
1176       // Add new definitions to the worklist.
1177       for (VPValue *operand : CurRec->operands())
1178         if (VPDef *OpDef = operand->getDef())
1179           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1180     }
1181   });
1182 
1183   // Traverse all the recipes in the VPlan and collect the poison-generating
1184   // recipes in the backward slice starting at the address of a VPWidenRecipe or
1185   // VPInterleaveRecipe.
1186   auto Iter = depth_first(
1187       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1188   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1189     for (VPRecipeBase &Recipe : *VPBB) {
1190       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1191         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1192         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1193         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1194             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1195           collectPoisonGeneratingInstrsInBackwardSlice(
1196               cast<VPRecipeBase>(AddrDef));
1197       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1198         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1199         if (AddrDef) {
1200           // Check if any member of the interleave group needs predication.
1201           const InterleaveGroup<Instruction> *InterGroup =
1202               InterleaveRec->getInterleaveGroup();
1203           bool NeedPredication = false;
1204           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1205                I < NumMembers; ++I) {
1206             Instruction *Member = InterGroup->getMember(I);
1207             if (Member)
1208               NeedPredication |=
1209                   Legal->blockNeedsPredication(Member->getParent());
1210           }
1211 
1212           if (NeedPredication)
1213             collectPoisonGeneratingInstrsInBackwardSlice(
1214                 cast<VPRecipeBase>(AddrDef));
1215         }
1216       }
1217     }
1218   }
1219 }
1220 
1221 void InnerLoopVectorizer::addMetadata(Instruction *To,
1222                                       Instruction *From) {
1223   propagateMetadata(To, From);
1224   addNewMetadata(To, From);
1225 }
1226 
1227 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1228                                       Instruction *From) {
1229   for (Value *V : To) {
1230     if (Instruction *I = dyn_cast<Instruction>(V))
1231       addMetadata(I, From);
1232   }
1233 }
1234 
1235 namespace llvm {
1236 
1237 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1238 // lowered.
1239 enum ScalarEpilogueLowering {
1240 
1241   // The default: allowing scalar epilogues.
1242   CM_ScalarEpilogueAllowed,
1243 
1244   // Vectorization with OptForSize: don't allow epilogues.
1245   CM_ScalarEpilogueNotAllowedOptSize,
1246 
1247   // A special case of vectorisation with OptForSize: loops with a very small
1248   // trip count are considered for vectorization under OptForSize, thereby
1249   // making sure the cost of their loop body is dominant, free of runtime
1250   // guards and scalar iteration overheads.
1251   CM_ScalarEpilogueNotAllowedLowTripLoop,
1252 
1253   // Loop hint predicate indicating an epilogue is undesired.
1254   CM_ScalarEpilogueNotNeededUsePredicate,
1255 
1256   // Directive indicating we must either tail fold or not vectorize
1257   CM_ScalarEpilogueNotAllowedUsePredicate
1258 };
1259 
1260 /// ElementCountComparator creates a total ordering for ElementCount
1261 /// for the purposes of using it in a set structure.
1262 struct ElementCountComparator {
1263   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1264     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1265            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1266   }
1267 };
1268 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1269 
1270 /// LoopVectorizationCostModel - estimates the expected speedups due to
1271 /// vectorization.
1272 /// In many cases vectorization is not profitable. This can happen because of
1273 /// a number of reasons. In this class we mainly attempt to predict the
1274 /// expected speedup/slowdowns due to the supported instruction set. We use the
1275 /// TargetTransformInfo to query the different backends for the cost of
1276 /// different operations.
1277 class LoopVectorizationCostModel {
1278 public:
1279   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1280                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1281                              LoopVectorizationLegality *Legal,
1282                              const TargetTransformInfo &TTI,
1283                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1284                              AssumptionCache *AC,
1285                              OptimizationRemarkEmitter *ORE, const Function *F,
1286                              const LoopVectorizeHints *Hints,
1287                              InterleavedAccessInfo &IAI)
1288       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1289         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1290         Hints(Hints), InterleaveInfo(IAI) {}
1291 
1292   /// \return An upper bound for the vectorization factors (both fixed and
1293   /// scalable). If the factors are 0, vectorization and interleaving should be
1294   /// avoided up front.
1295   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1296 
1297   /// \return True if runtime checks are required for vectorization, and false
1298   /// otherwise.
1299   bool runtimeChecksRequired();
1300 
1301   /// \return The most profitable vectorization factor and the cost of that VF.
1302   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1303   /// then this vectorization factor will be selected if vectorization is
1304   /// possible.
1305   VectorizationFactor
1306   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1307 
1308   VectorizationFactor
1309   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1310                                     const LoopVectorizationPlanner &LVP);
1311 
1312   /// Setup cost-based decisions for user vectorization factor.
1313   /// \return true if the UserVF is a feasible VF to be chosen.
1314   bool selectUserVectorizationFactor(ElementCount UserVF) {
1315     collectUniformsAndScalars(UserVF);
1316     collectInstsToScalarize(UserVF);
1317     return expectedCost(UserVF).first.isValid();
1318   }
1319 
1320   /// \return The size (in bits) of the smallest and widest types in the code
1321   /// that needs to be vectorized. We ignore values that remain scalar such as
1322   /// 64 bit loop indices.
1323   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1324 
1325   /// \return The desired interleave count.
1326   /// If interleave count has been specified by metadata it will be returned.
1327   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1328   /// are the selected vectorization factor and the cost of the selected VF.
1329   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1330 
1331   /// Memory access instruction may be vectorized in more than one way.
1332   /// Form of instruction after vectorization depends on cost.
1333   /// This function takes cost-based decisions for Load/Store instructions
1334   /// and collects them in a map. This decisions map is used for building
1335   /// the lists of loop-uniform and loop-scalar instructions.
1336   /// The calculated cost is saved with widening decision in order to
1337   /// avoid redundant calculations.
1338   void setCostBasedWideningDecision(ElementCount VF);
1339 
1340   /// A struct that represents some properties of the register usage
1341   /// of a loop.
1342   struct RegisterUsage {
1343     /// Holds the number of loop invariant values that are used in the loop.
1344     /// The key is ClassID of target-provided register class.
1345     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1346     /// Holds the maximum number of concurrent live intervals in the loop.
1347     /// The key is ClassID of target-provided register class.
1348     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1349   };
1350 
1351   /// \return Returns information about the register usages of the loop for the
1352   /// given vectorization factors.
1353   SmallVector<RegisterUsage, 8>
1354   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1355 
1356   /// Collect values we want to ignore in the cost model.
1357   void collectValuesToIgnore();
1358 
1359   /// Collect all element types in the loop for which widening is needed.
1360   void collectElementTypesForWidening();
1361 
1362   /// Split reductions into those that happen in the loop, and those that happen
1363   /// outside. In loop reductions are collected into InLoopReductionChains.
1364   void collectInLoopReductions();
1365 
1366   /// Returns true if we should use strict in-order reductions for the given
1367   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1368   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1369   /// of FP operations.
1370   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1371     return !Hints->allowReordering() && RdxDesc.isOrdered();
1372   }
1373 
1374   /// \returns The smallest bitwidth each instruction can be represented with.
1375   /// The vector equivalents of these instructions should be truncated to this
1376   /// type.
1377   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1378     return MinBWs;
1379   }
1380 
1381   /// \returns True if it is more profitable to scalarize instruction \p I for
1382   /// vectorization factor \p VF.
1383   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1384     assert(VF.isVector() &&
1385            "Profitable to scalarize relevant only for VF > 1.");
1386 
1387     // Cost model is not run in the VPlan-native path - return conservative
1388     // result until this changes.
1389     if (EnableVPlanNativePath)
1390       return false;
1391 
1392     auto Scalars = InstsToScalarize.find(VF);
1393     assert(Scalars != InstsToScalarize.end() &&
1394            "VF not yet analyzed for scalarization profitability");
1395     return Scalars->second.find(I) != Scalars->second.end();
1396   }
1397 
1398   /// Returns true if \p I is known to be uniform after vectorization.
1399   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1400     if (VF.isScalar())
1401       return true;
1402 
1403     // Cost model is not run in the VPlan-native path - return conservative
1404     // result until this changes.
1405     if (EnableVPlanNativePath)
1406       return false;
1407 
1408     auto UniformsPerVF = Uniforms.find(VF);
1409     assert(UniformsPerVF != Uniforms.end() &&
1410            "VF not yet analyzed for uniformity");
1411     return UniformsPerVF->second.count(I);
1412   }
1413 
1414   /// Returns true if \p I is known to be scalar after vectorization.
1415   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1416     if (VF.isScalar())
1417       return true;
1418 
1419     // Cost model is not run in the VPlan-native path - return conservative
1420     // result until this changes.
1421     if (EnableVPlanNativePath)
1422       return false;
1423 
1424     auto ScalarsPerVF = Scalars.find(VF);
1425     assert(ScalarsPerVF != Scalars.end() &&
1426            "Scalar values are not calculated for VF");
1427     return ScalarsPerVF->second.count(I);
1428   }
1429 
1430   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1431   /// for vectorization factor \p VF.
1432   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1433     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1434            !isProfitableToScalarize(I, VF) &&
1435            !isScalarAfterVectorization(I, VF);
1436   }
1437 
1438   /// Decision that was taken during cost calculation for memory instruction.
1439   enum InstWidening {
1440     CM_Unknown,
1441     CM_Widen,         // For consecutive accesses with stride +1.
1442     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1443     CM_Interleave,
1444     CM_GatherScatter,
1445     CM_Scalarize
1446   };
1447 
1448   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1449   /// instruction \p I and vector width \p VF.
1450   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1451                            InstructionCost Cost) {
1452     assert(VF.isVector() && "Expected VF >=2");
1453     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1454   }
1455 
1456   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1457   /// interleaving group \p Grp and vector width \p VF.
1458   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1459                            ElementCount VF, InstWidening W,
1460                            InstructionCost Cost) {
1461     assert(VF.isVector() && "Expected VF >=2");
1462     /// Broadcast this decicion to all instructions inside the group.
1463     /// But the cost will be assigned to one instruction only.
1464     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1465       if (auto *I = Grp->getMember(i)) {
1466         if (Grp->getInsertPos() == I)
1467           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1468         else
1469           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1470       }
1471     }
1472   }
1473 
1474   /// Return the cost model decision for the given instruction \p I and vector
1475   /// width \p VF. Return CM_Unknown if this instruction did not pass
1476   /// through the cost modeling.
1477   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1478     assert(VF.isVector() && "Expected VF to be a vector VF");
1479     // Cost model is not run in the VPlan-native path - return conservative
1480     // result until this changes.
1481     if (EnableVPlanNativePath)
1482       return CM_GatherScatter;
1483 
1484     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1485     auto Itr = WideningDecisions.find(InstOnVF);
1486     if (Itr == WideningDecisions.end())
1487       return CM_Unknown;
1488     return Itr->second.first;
1489   }
1490 
1491   /// Return the vectorization cost for the given instruction \p I and vector
1492   /// width \p VF.
1493   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1494     assert(VF.isVector() && "Expected VF >=2");
1495     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1496     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1497            "The cost is not calculated");
1498     return WideningDecisions[InstOnVF].second;
1499   }
1500 
1501   /// Return True if instruction \p I is an optimizable truncate whose operand
1502   /// is an induction variable. Such a truncate will be removed by adding a new
1503   /// induction variable with the destination type.
1504   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1505     // If the instruction is not a truncate, return false.
1506     auto *Trunc = dyn_cast<TruncInst>(I);
1507     if (!Trunc)
1508       return false;
1509 
1510     // Get the source and destination types of the truncate.
1511     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1512     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1513 
1514     // If the truncate is free for the given types, return false. Replacing a
1515     // free truncate with an induction variable would add an induction variable
1516     // update instruction to each iteration of the loop. We exclude from this
1517     // check the primary induction variable since it will need an update
1518     // instruction regardless.
1519     Value *Op = Trunc->getOperand(0);
1520     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1521       return false;
1522 
1523     // If the truncated value is not an induction variable, return false.
1524     return Legal->isInductionPhi(Op);
1525   }
1526 
1527   /// Collects the instructions to scalarize for each predicated instruction in
1528   /// the loop.
1529   void collectInstsToScalarize(ElementCount VF);
1530 
1531   /// Collect Uniform and Scalar values for the given \p VF.
1532   /// The sets depend on CM decision for Load/Store instructions
1533   /// that may be vectorized as interleave, gather-scatter or scalarized.
1534   void collectUniformsAndScalars(ElementCount VF) {
1535     // Do the analysis once.
1536     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1537       return;
1538     setCostBasedWideningDecision(VF);
1539     collectLoopUniforms(VF);
1540     collectLoopScalars(VF);
1541   }
1542 
1543   /// Returns true if the target machine supports masked store operation
1544   /// for the given \p DataType and kind of access to \p Ptr.
1545   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1546     return Legal->isConsecutivePtr(DataType, Ptr) &&
1547            TTI.isLegalMaskedStore(DataType, Alignment);
1548   }
1549 
1550   /// Returns true if the target machine supports masked load operation
1551   /// for the given \p DataType and kind of access to \p Ptr.
1552   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1553     return Legal->isConsecutivePtr(DataType, Ptr) &&
1554            TTI.isLegalMaskedLoad(DataType, Alignment);
1555   }
1556 
1557   /// Returns true if the target machine can represent \p V as a masked gather
1558   /// or scatter operation.
1559   bool isLegalGatherOrScatter(Value *V) {
1560     bool LI = isa<LoadInst>(V);
1561     bool SI = isa<StoreInst>(V);
1562     if (!LI && !SI)
1563       return false;
1564     auto *Ty = getLoadStoreType(V);
1565     Align Align = getLoadStoreAlignment(V);
1566     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1567            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1568   }
1569 
1570   /// Returns true if the target machine supports all of the reduction
1571   /// variables found for the given VF.
1572   bool canVectorizeReductions(ElementCount VF) const {
1573     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1574       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1575       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1576     }));
1577   }
1578 
1579   /// Returns true if \p I is an instruction that will be scalarized with
1580   /// predication. Such instructions include conditional stores and
1581   /// instructions that may divide by zero.
1582   /// If a non-zero VF has been calculated, we check if I will be scalarized
1583   /// predication for that VF.
1584   bool isScalarWithPredication(Instruction *I) const;
1585 
1586   // Returns true if \p I is an instruction that will be predicated either
1587   // through scalar predication or masked load/store or masked gather/scatter.
1588   // Superset of instructions that return true for isScalarWithPredication.
1589   bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
1590     // When we know the load is uniform and the original scalar loop was not
1591     // predicated we don't need to mark it as a predicated instruction. Any
1592     // vectorised blocks created when tail-folding are something artificial we
1593     // have introduced and we know there is always at least one active lane.
1594     // That's why we call Legal->blockNeedsPredication here because it doesn't
1595     // query tail-folding.
1596     if (IsKnownUniform && isa<LoadInst>(I) &&
1597         !Legal->blockNeedsPredication(I->getParent()))
1598       return false;
1599     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1600       return false;
1601     // Loads and stores that need some form of masked operation are predicated
1602     // instructions.
1603     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1604       return Legal->isMaskRequired(I);
1605     return isScalarWithPredication(I);
1606   }
1607 
1608   /// Returns true if \p I is a memory instruction with consecutive memory
1609   /// access that can be widened.
1610   bool
1611   memoryInstructionCanBeWidened(Instruction *I,
1612                                 ElementCount VF = ElementCount::getFixed(1));
1613 
1614   /// Returns true if \p I is a memory instruction in an interleaved-group
1615   /// of memory accesses that can be vectorized with wide vector loads/stores
1616   /// and shuffles.
1617   bool
1618   interleavedAccessCanBeWidened(Instruction *I,
1619                                 ElementCount VF = ElementCount::getFixed(1));
1620 
1621   /// Check if \p Instr belongs to any interleaved access group.
1622   bool isAccessInterleaved(Instruction *Instr) {
1623     return InterleaveInfo.isInterleaved(Instr);
1624   }
1625 
1626   /// Get the interleaved access group that \p Instr belongs to.
1627   const InterleaveGroup<Instruction> *
1628   getInterleavedAccessGroup(Instruction *Instr) {
1629     return InterleaveInfo.getInterleaveGroup(Instr);
1630   }
1631 
1632   /// Returns true if we're required to use a scalar epilogue for at least
1633   /// the final iteration of the original loop.
1634   bool requiresScalarEpilogue(ElementCount VF) const {
1635     if (!isScalarEpilogueAllowed())
1636       return false;
1637     // If we might exit from anywhere but the latch, must run the exiting
1638     // iteration in scalar form.
1639     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1640       return true;
1641     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1642   }
1643 
1644   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1645   /// loop hint annotation.
1646   bool isScalarEpilogueAllowed() const {
1647     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1648   }
1649 
1650   /// Returns true if all loop blocks should be masked to fold tail loop.
1651   bool foldTailByMasking() const { return FoldTailByMasking; }
1652 
1653   /// Returns true if the instructions in this block requires predication
1654   /// for any reason, e.g. because tail folding now requires a predicate
1655   /// or because the block in the original loop was predicated.
1656   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1657     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1658   }
1659 
1660   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1661   /// nodes to the chain of instructions representing the reductions. Uses a
1662   /// MapVector to ensure deterministic iteration order.
1663   using ReductionChainMap =
1664       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1665 
1666   /// Return the chain of instructions representing an inloop reduction.
1667   const ReductionChainMap &getInLoopReductionChains() const {
1668     return InLoopReductionChains;
1669   }
1670 
1671   /// Returns true if the Phi is part of an inloop reduction.
1672   bool isInLoopReduction(PHINode *Phi) const {
1673     return InLoopReductionChains.count(Phi);
1674   }
1675 
1676   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1677   /// with factor VF.  Return the cost of the instruction, including
1678   /// scalarization overhead if it's needed.
1679   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1680 
1681   /// Estimate cost of a call instruction CI if it were vectorized with factor
1682   /// VF. Return the cost of the instruction, including scalarization overhead
1683   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1684   /// scalarized -
1685   /// i.e. either vector version isn't available, or is too expensive.
1686   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1687                                     bool &NeedToScalarize) const;
1688 
1689   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1690   /// that of B.
1691   bool isMoreProfitable(const VectorizationFactor &A,
1692                         const VectorizationFactor &B) const;
1693 
1694   /// Invalidates decisions already taken by the cost model.
1695   void invalidateCostModelingDecisions() {
1696     WideningDecisions.clear();
1697     Uniforms.clear();
1698     Scalars.clear();
1699   }
1700 
1701 private:
1702   unsigned NumPredStores = 0;
1703 
1704   /// \return An upper bound for the vectorization factors for both
1705   /// fixed and scalable vectorization, where the minimum-known number of
1706   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1707   /// disabled or unsupported, then the scalable part will be equal to
1708   /// ElementCount::getScalable(0).
1709   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1710                                            ElementCount UserVF,
1711                                            bool FoldTailByMasking);
1712 
1713   /// \return the maximized element count based on the targets vector
1714   /// registers and the loop trip-count, but limited to a maximum safe VF.
1715   /// This is a helper function of computeFeasibleMaxVF.
1716   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1717   /// issue that occurred on one of the buildbots which cannot be reproduced
1718   /// without having access to the properietary compiler (see comments on
1719   /// D98509). The issue is currently under investigation and this workaround
1720   /// will be removed as soon as possible.
1721   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1722                                        unsigned SmallestType,
1723                                        unsigned WidestType,
1724                                        const ElementCount &MaxSafeVF,
1725                                        bool FoldTailByMasking);
1726 
1727   /// \return the maximum legal scalable VF, based on the safe max number
1728   /// of elements.
1729   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1730 
1731   /// The vectorization cost is a combination of the cost itself and a boolean
1732   /// indicating whether any of the contributing operations will actually
1733   /// operate on vector values after type legalization in the backend. If this
1734   /// latter value is false, then all operations will be scalarized (i.e. no
1735   /// vectorization has actually taken place).
1736   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1737 
1738   /// Returns the expected execution cost. The unit of the cost does
1739   /// not matter because we use the 'cost' units to compare different
1740   /// vector widths. The cost that is returned is *not* normalized by
1741   /// the factor width. If \p Invalid is not nullptr, this function
1742   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1743   /// each instruction that has an Invalid cost for the given VF.
1744   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1745   VectorizationCostTy
1746   expectedCost(ElementCount VF,
1747                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1748 
1749   /// Returns the execution time cost of an instruction for a given vector
1750   /// width. Vector width of one means scalar.
1751   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1752 
1753   /// The cost-computation logic from getInstructionCost which provides
1754   /// the vector type as an output parameter.
1755   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1756                                      Type *&VectorTy);
1757 
1758   /// Return the cost of instructions in an inloop reduction pattern, if I is
1759   /// part of that pattern.
1760   Optional<InstructionCost>
1761   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1762                           TTI::TargetCostKind CostKind);
1763 
1764   /// Calculate vectorization cost of memory instruction \p I.
1765   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1766 
1767   /// The cost computation for scalarized memory instruction.
1768   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1769 
1770   /// The cost computation for interleaving group of memory instructions.
1771   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1772 
1773   /// The cost computation for Gather/Scatter instruction.
1774   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1775 
1776   /// The cost computation for widening instruction \p I with consecutive
1777   /// memory access.
1778   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1779 
1780   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1781   /// Load: scalar load + broadcast.
1782   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1783   /// element)
1784   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1785 
1786   /// Estimate the overhead of scalarizing an instruction. This is a
1787   /// convenience wrapper for the type-based getScalarizationOverhead API.
1788   InstructionCost getScalarizationOverhead(Instruction *I,
1789                                            ElementCount VF) const;
1790 
1791   /// Returns whether the instruction is a load or store and will be a emitted
1792   /// as a vector operation.
1793   bool isConsecutiveLoadOrStore(Instruction *I);
1794 
1795   /// Returns true if an artificially high cost for emulated masked memrefs
1796   /// should be used.
1797   bool useEmulatedMaskMemRefHack(Instruction *I);
1798 
1799   /// Map of scalar integer values to the smallest bitwidth they can be legally
1800   /// represented as. The vector equivalents of these values should be truncated
1801   /// to this type.
1802   MapVector<Instruction *, uint64_t> MinBWs;
1803 
1804   /// A type representing the costs for instructions if they were to be
1805   /// scalarized rather than vectorized. The entries are Instruction-Cost
1806   /// pairs.
1807   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1808 
1809   /// A set containing all BasicBlocks that are known to present after
1810   /// vectorization as a predicated block.
1811   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1812 
1813   /// Records whether it is allowed to have the original scalar loop execute at
1814   /// least once. This may be needed as a fallback loop in case runtime
1815   /// aliasing/dependence checks fail, or to handle the tail/remainder
1816   /// iterations when the trip count is unknown or doesn't divide by the VF,
1817   /// or as a peel-loop to handle gaps in interleave-groups.
1818   /// Under optsize and when the trip count is very small we don't allow any
1819   /// iterations to execute in the scalar loop.
1820   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1821 
1822   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1823   bool FoldTailByMasking = false;
1824 
1825   /// A map holding scalar costs for different vectorization factors. The
1826   /// presence of a cost for an instruction in the mapping indicates that the
1827   /// instruction will be scalarized when vectorizing with the associated
1828   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1829   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1830 
1831   /// Holds the instructions known to be uniform after vectorization.
1832   /// The data is collected per VF.
1833   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1834 
1835   /// Holds the instructions known to be scalar after vectorization.
1836   /// The data is collected per VF.
1837   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1838 
1839   /// Holds the instructions (address computations) that are forced to be
1840   /// scalarized.
1841   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1842 
1843   /// PHINodes of the reductions that should be expanded in-loop along with
1844   /// their associated chains of reduction operations, in program order from top
1845   /// (PHI) to bottom
1846   ReductionChainMap InLoopReductionChains;
1847 
1848   /// A Map of inloop reduction operations and their immediate chain operand.
1849   /// FIXME: This can be removed once reductions can be costed correctly in
1850   /// vplan. This was added to allow quick lookup to the inloop operations,
1851   /// without having to loop through InLoopReductionChains.
1852   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1853 
1854   /// Returns the expected difference in cost from scalarizing the expression
1855   /// feeding a predicated instruction \p PredInst. The instructions to
1856   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1857   /// non-negative return value implies the expression will be scalarized.
1858   /// Currently, only single-use chains are considered for scalarization.
1859   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1860                               ElementCount VF);
1861 
1862   /// Collect the instructions that are uniform after vectorization. An
1863   /// instruction is uniform if we represent it with a single scalar value in
1864   /// the vectorized loop corresponding to each vector iteration. Examples of
1865   /// uniform instructions include pointer operands of consecutive or
1866   /// interleaved memory accesses. Note that although uniformity implies an
1867   /// instruction will be scalar, the reverse is not true. In general, a
1868   /// scalarized instruction will be represented by VF scalar values in the
1869   /// vectorized loop, each corresponding to an iteration of the original
1870   /// scalar loop.
1871   void collectLoopUniforms(ElementCount VF);
1872 
1873   /// Collect the instructions that are scalar after vectorization. An
1874   /// instruction is scalar if it is known to be uniform or will be scalarized
1875   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1876   /// to the list if they are used by a load/store instruction that is marked as
1877   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1878   /// VF values in the vectorized loop, each corresponding to an iteration of
1879   /// the original scalar loop.
1880   void collectLoopScalars(ElementCount VF);
1881 
1882   /// Keeps cost model vectorization decision and cost for instructions.
1883   /// Right now it is used for memory instructions only.
1884   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1885                                 std::pair<InstWidening, InstructionCost>>;
1886 
1887   DecisionList WideningDecisions;
1888 
1889   /// Returns true if \p V is expected to be vectorized and it needs to be
1890   /// extracted.
1891   bool needsExtract(Value *V, ElementCount VF) const {
1892     Instruction *I = dyn_cast<Instruction>(V);
1893     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1894         TheLoop->isLoopInvariant(I))
1895       return false;
1896 
1897     // Assume we can vectorize V (and hence we need extraction) if the
1898     // scalars are not computed yet. This can happen, because it is called
1899     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1900     // the scalars are collected. That should be a safe assumption in most
1901     // cases, because we check if the operands have vectorizable types
1902     // beforehand in LoopVectorizationLegality.
1903     return Scalars.find(VF) == Scalars.end() ||
1904            !isScalarAfterVectorization(I, VF);
1905   };
1906 
1907   /// Returns a range containing only operands needing to be extracted.
1908   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1909                                                    ElementCount VF) const {
1910     return SmallVector<Value *, 4>(make_filter_range(
1911         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1912   }
1913 
1914   /// Determines if we have the infrastructure to vectorize loop \p L and its
1915   /// epilogue, assuming the main loop is vectorized by \p VF.
1916   bool isCandidateForEpilogueVectorization(const Loop &L,
1917                                            const ElementCount VF) const;
1918 
1919   /// Returns true if epilogue vectorization is considered profitable, and
1920   /// false otherwise.
1921   /// \p VF is the vectorization factor chosen for the original loop.
1922   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1923 
1924 public:
1925   /// The loop that we evaluate.
1926   Loop *TheLoop;
1927 
1928   /// Predicated scalar evolution analysis.
1929   PredicatedScalarEvolution &PSE;
1930 
1931   /// Loop Info analysis.
1932   LoopInfo *LI;
1933 
1934   /// Vectorization legality.
1935   LoopVectorizationLegality *Legal;
1936 
1937   /// Vector target information.
1938   const TargetTransformInfo &TTI;
1939 
1940   /// Target Library Info.
1941   const TargetLibraryInfo *TLI;
1942 
1943   /// Demanded bits analysis.
1944   DemandedBits *DB;
1945 
1946   /// Assumption cache.
1947   AssumptionCache *AC;
1948 
1949   /// Interface to emit optimization remarks.
1950   OptimizationRemarkEmitter *ORE;
1951 
1952   const Function *TheFunction;
1953 
1954   /// Loop Vectorize Hint.
1955   const LoopVectorizeHints *Hints;
1956 
1957   /// The interleave access information contains groups of interleaved accesses
1958   /// with the same stride and close to each other.
1959   InterleavedAccessInfo &InterleaveInfo;
1960 
1961   /// Values to ignore in the cost model.
1962   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1963 
1964   /// Values to ignore in the cost model when VF > 1.
1965   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1966 
1967   /// All element types found in the loop.
1968   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1969 
1970   /// Profitable vector factors.
1971   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1972 };
1973 } // end namespace llvm
1974 
1975 /// Helper struct to manage generating runtime checks for vectorization.
1976 ///
1977 /// The runtime checks are created up-front in temporary blocks to allow better
1978 /// estimating the cost and un-linked from the existing IR. After deciding to
1979 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1980 /// temporary blocks are completely removed.
1981 class GeneratedRTChecks {
1982   /// Basic block which contains the generated SCEV checks, if any.
1983   BasicBlock *SCEVCheckBlock = nullptr;
1984 
1985   /// The value representing the result of the generated SCEV checks. If it is
1986   /// nullptr, either no SCEV checks have been generated or they have been used.
1987   Value *SCEVCheckCond = nullptr;
1988 
1989   /// Basic block which contains the generated memory runtime checks, if any.
1990   BasicBlock *MemCheckBlock = nullptr;
1991 
1992   /// The value representing the result of the generated memory runtime checks.
1993   /// If it is nullptr, either no memory runtime checks have been generated or
1994   /// they have been used.
1995   Value *MemRuntimeCheckCond = nullptr;
1996 
1997   DominatorTree *DT;
1998   LoopInfo *LI;
1999 
2000   SCEVExpander SCEVExp;
2001   SCEVExpander MemCheckExp;
2002 
2003 public:
2004   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
2005                     const DataLayout &DL)
2006       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
2007         MemCheckExp(SE, DL, "scev.check") {}
2008 
2009   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
2010   /// accurately estimate the cost of the runtime checks. The blocks are
2011   /// un-linked from the IR and is added back during vector code generation. If
2012   /// there is no vector code generation, the check blocks are removed
2013   /// completely.
2014   void Create(Loop *L, const LoopAccessInfo &LAI,
2015               const SCEVUnionPredicate &UnionPred) {
2016 
2017     BasicBlock *LoopHeader = L->getHeader();
2018     BasicBlock *Preheader = L->getLoopPreheader();
2019 
2020     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2021     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2022     // may be used by SCEVExpander. The blocks will be un-linked from their
2023     // predecessors and removed from LI & DT at the end of the function.
2024     if (!UnionPred.isAlwaysTrue()) {
2025       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2026                                   nullptr, "vector.scevcheck");
2027 
2028       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2029           &UnionPred, SCEVCheckBlock->getTerminator());
2030     }
2031 
2032     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2033     if (RtPtrChecking.Need) {
2034       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2035       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2036                                  "vector.memcheck");
2037 
2038       MemRuntimeCheckCond =
2039           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2040                            RtPtrChecking.getChecks(), MemCheckExp);
2041       assert(MemRuntimeCheckCond &&
2042              "no RT checks generated although RtPtrChecking "
2043              "claimed checks are required");
2044     }
2045 
2046     if (!MemCheckBlock && !SCEVCheckBlock)
2047       return;
2048 
2049     // Unhook the temporary block with the checks, update various places
2050     // accordingly.
2051     if (SCEVCheckBlock)
2052       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2053     if (MemCheckBlock)
2054       MemCheckBlock->replaceAllUsesWith(Preheader);
2055 
2056     if (SCEVCheckBlock) {
2057       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2058       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2059       Preheader->getTerminator()->eraseFromParent();
2060     }
2061     if (MemCheckBlock) {
2062       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2063       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2064       Preheader->getTerminator()->eraseFromParent();
2065     }
2066 
2067     DT->changeImmediateDominator(LoopHeader, Preheader);
2068     if (MemCheckBlock) {
2069       DT->eraseNode(MemCheckBlock);
2070       LI->removeBlock(MemCheckBlock);
2071     }
2072     if (SCEVCheckBlock) {
2073       DT->eraseNode(SCEVCheckBlock);
2074       LI->removeBlock(SCEVCheckBlock);
2075     }
2076   }
2077 
2078   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2079   /// unused.
2080   ~GeneratedRTChecks() {
2081     SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2082     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2083     if (!SCEVCheckCond)
2084       SCEVCleaner.markResultUsed();
2085 
2086     if (!MemRuntimeCheckCond)
2087       MemCheckCleaner.markResultUsed();
2088 
2089     if (MemRuntimeCheckCond) {
2090       auto &SE = *MemCheckExp.getSE();
2091       // Memory runtime check generation creates compares that use expanded
2092       // values. Remove them before running the SCEVExpanderCleaners.
2093       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2094         if (MemCheckExp.isInsertedInstruction(&I))
2095           continue;
2096         SE.forgetValue(&I);
2097         I.eraseFromParent();
2098       }
2099     }
2100     MemCheckCleaner.cleanup();
2101     SCEVCleaner.cleanup();
2102 
2103     if (SCEVCheckCond)
2104       SCEVCheckBlock->eraseFromParent();
2105     if (MemRuntimeCheckCond)
2106       MemCheckBlock->eraseFromParent();
2107   }
2108 
2109   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2110   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2111   /// depending on the generated condition.
2112   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2113                              BasicBlock *LoopVectorPreHeader,
2114                              BasicBlock *LoopExitBlock) {
2115     if (!SCEVCheckCond)
2116       return nullptr;
2117     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2118       if (C->isZero())
2119         return nullptr;
2120 
2121     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2122 
2123     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2124     // Create new preheader for vector loop.
2125     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2126       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2127 
2128     SCEVCheckBlock->getTerminator()->eraseFromParent();
2129     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2130     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2131                                                 SCEVCheckBlock);
2132 
2133     DT->addNewBlock(SCEVCheckBlock, Pred);
2134     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2135 
2136     ReplaceInstWithInst(
2137         SCEVCheckBlock->getTerminator(),
2138         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2139     // Mark the check as used, to prevent it from being removed during cleanup.
2140     SCEVCheckCond = nullptr;
2141     return SCEVCheckBlock;
2142   }
2143 
2144   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2145   /// the branches to branch to the vector preheader or \p Bypass, depending on
2146   /// the generated condition.
2147   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2148                                    BasicBlock *LoopVectorPreHeader) {
2149     // Check if we generated code that checks in runtime if arrays overlap.
2150     if (!MemRuntimeCheckCond)
2151       return nullptr;
2152 
2153     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2154     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2155                                                 MemCheckBlock);
2156 
2157     DT->addNewBlock(MemCheckBlock, Pred);
2158     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2159     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2160 
2161     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2162       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2163 
2164     ReplaceInstWithInst(
2165         MemCheckBlock->getTerminator(),
2166         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2167     MemCheckBlock->getTerminator()->setDebugLoc(
2168         Pred->getTerminator()->getDebugLoc());
2169 
2170     // Mark the check as used, to prevent it from being removed during cleanup.
2171     MemRuntimeCheckCond = nullptr;
2172     return MemCheckBlock;
2173   }
2174 };
2175 
2176 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2177 // vectorization. The loop needs to be annotated with #pragma omp simd
2178 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2179 // vector length information is not provided, vectorization is not considered
2180 // explicit. Interleave hints are not allowed either. These limitations will be
2181 // relaxed in the future.
2182 // Please, note that we are currently forced to abuse the pragma 'clang
2183 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2184 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2185 // provides *explicit vectorization hints* (LV can bypass legal checks and
2186 // assume that vectorization is legal). However, both hints are implemented
2187 // using the same metadata (llvm.loop.vectorize, processed by
2188 // LoopVectorizeHints). This will be fixed in the future when the native IR
2189 // representation for pragma 'omp simd' is introduced.
2190 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2191                                    OptimizationRemarkEmitter *ORE) {
2192   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2193   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2194 
2195   // Only outer loops with an explicit vectorization hint are supported.
2196   // Unannotated outer loops are ignored.
2197   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2198     return false;
2199 
2200   Function *Fn = OuterLp->getHeader()->getParent();
2201   if (!Hints.allowVectorization(Fn, OuterLp,
2202                                 true /*VectorizeOnlyWhenForced*/)) {
2203     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2204     return false;
2205   }
2206 
2207   if (Hints.getInterleave() > 1) {
2208     // TODO: Interleave support is future work.
2209     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2210                          "outer loops.\n");
2211     Hints.emitRemarkWithHints();
2212     return false;
2213   }
2214 
2215   return true;
2216 }
2217 
2218 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2219                                   OptimizationRemarkEmitter *ORE,
2220                                   SmallVectorImpl<Loop *> &V) {
2221   // Collect inner loops and outer loops without irreducible control flow. For
2222   // now, only collect outer loops that have explicit vectorization hints. If we
2223   // are stress testing the VPlan H-CFG construction, we collect the outermost
2224   // loop of every loop nest.
2225   if (L.isInnermost() || VPlanBuildStressTest ||
2226       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2227     LoopBlocksRPO RPOT(&L);
2228     RPOT.perform(LI);
2229     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2230       V.push_back(&L);
2231       // TODO: Collect inner loops inside marked outer loops in case
2232       // vectorization fails for the outer loop. Do not invoke
2233       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2234       // already known to be reducible. We can use an inherited attribute for
2235       // that.
2236       return;
2237     }
2238   }
2239   for (Loop *InnerL : L)
2240     collectSupportedLoops(*InnerL, LI, ORE, V);
2241 }
2242 
2243 namespace {
2244 
2245 /// The LoopVectorize Pass.
2246 struct LoopVectorize : public FunctionPass {
2247   /// Pass identification, replacement for typeid
2248   static char ID;
2249 
2250   LoopVectorizePass Impl;
2251 
2252   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2253                          bool VectorizeOnlyWhenForced = false)
2254       : FunctionPass(ID),
2255         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2256     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2257   }
2258 
2259   bool runOnFunction(Function &F) override {
2260     if (skipFunction(F))
2261       return false;
2262 
2263     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2264     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2265     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2266     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2267     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2268     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2269     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2270     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2271     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2272     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2273     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2274     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2275     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2276 
2277     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2278         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2279 
2280     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2281                         GetLAA, *ORE, PSI).MadeAnyChange;
2282   }
2283 
2284   void getAnalysisUsage(AnalysisUsage &AU) const override {
2285     AU.addRequired<AssumptionCacheTracker>();
2286     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2287     AU.addRequired<DominatorTreeWrapperPass>();
2288     AU.addRequired<LoopInfoWrapperPass>();
2289     AU.addRequired<ScalarEvolutionWrapperPass>();
2290     AU.addRequired<TargetTransformInfoWrapperPass>();
2291     AU.addRequired<AAResultsWrapperPass>();
2292     AU.addRequired<LoopAccessLegacyAnalysis>();
2293     AU.addRequired<DemandedBitsWrapperPass>();
2294     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2295     AU.addRequired<InjectTLIMappingsLegacy>();
2296 
2297     // We currently do not preserve loopinfo/dominator analyses with outer loop
2298     // vectorization. Until this is addressed, mark these analyses as preserved
2299     // only for non-VPlan-native path.
2300     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2301     if (!EnableVPlanNativePath) {
2302       AU.addPreserved<LoopInfoWrapperPass>();
2303       AU.addPreserved<DominatorTreeWrapperPass>();
2304     }
2305 
2306     AU.addPreserved<BasicAAWrapperPass>();
2307     AU.addPreserved<GlobalsAAWrapperPass>();
2308     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2309   }
2310 };
2311 
2312 } // end anonymous namespace
2313 
2314 //===----------------------------------------------------------------------===//
2315 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2316 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2317 //===----------------------------------------------------------------------===//
2318 
2319 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2320   // We need to place the broadcast of invariant variables outside the loop,
2321   // but only if it's proven safe to do so. Else, broadcast will be inside
2322   // vector loop body.
2323   Instruction *Instr = dyn_cast<Instruction>(V);
2324   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2325                      (!Instr ||
2326                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2327   // Place the code for broadcasting invariant variables in the new preheader.
2328   IRBuilder<>::InsertPointGuard Guard(Builder);
2329   if (SafeToHoist)
2330     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2331 
2332   // Broadcast the scalar into all locations in the vector.
2333   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2334 
2335   return Shuf;
2336 }
2337 
2338 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2339     const InductionDescriptor &II, Value *Step, Value *Start,
2340     Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
2341   IRBuilder<> &Builder = State.Builder;
2342   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2343          "Expected either an induction phi-node or a truncate of it!");
2344 
2345   // Construct the initial value of the vector IV in the vector loop preheader
2346   auto CurrIP = Builder.saveIP();
2347   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2348   if (isa<TruncInst>(EntryVal)) {
2349     assert(Start->getType()->isIntegerTy() &&
2350            "Truncation requires an integer type");
2351     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2352     Step = Builder.CreateTrunc(Step, TruncType);
2353     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2354   }
2355 
2356   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
2357   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
2358   Value *SteppedStart =
2359       getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());
2360 
2361   // We create vector phi nodes for both integer and floating-point induction
2362   // variables. Here, we determine the kind of arithmetic we will perform.
2363   Instruction::BinaryOps AddOp;
2364   Instruction::BinaryOps MulOp;
2365   if (Step->getType()->isIntegerTy()) {
2366     AddOp = Instruction::Add;
2367     MulOp = Instruction::Mul;
2368   } else {
2369     AddOp = II.getInductionOpcode();
2370     MulOp = Instruction::FMul;
2371   }
2372 
2373   // Multiply the vectorization factor by the step using integer or
2374   // floating-point arithmetic as appropriate.
2375   Type *StepType = Step->getType();
2376   Value *RuntimeVF;
2377   if (Step->getType()->isFloatingPointTy())
2378     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
2379   else
2380     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
2381   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2382 
2383   // Create a vector splat to use in the induction update.
2384   //
2385   // FIXME: If the step is non-constant, we create the vector splat with
2386   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2387   //        handle a constant vector splat.
2388   Value *SplatVF = isa<Constant>(Mul)
2389                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
2390                        : Builder.CreateVectorSplat(State.VF, Mul);
2391   Builder.restoreIP(CurrIP);
2392 
2393   // We may need to add the step a number of times, depending on the unroll
2394   // factor. The last of those goes into the PHI.
2395   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2396                                     &*LoopVectorBody->getFirstInsertionPt());
2397   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2398   Instruction *LastInduction = VecInd;
2399   for (unsigned Part = 0; Part < UF; ++Part) {
2400     State.set(Def, LastInduction, Part);
2401 
2402     if (isa<TruncInst>(EntryVal))
2403       addMetadata(LastInduction, EntryVal);
2404 
2405     LastInduction = cast<Instruction>(
2406         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2407     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2408   }
2409 
2410   // Move the last step to the end of the latch block. This ensures consistent
2411   // placement of all induction updates.
2412   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2413   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2414   auto *ICmp = cast<Instruction>(Br->getCondition());
2415   LastInduction->moveBefore(ICmp);
2416   LastInduction->setName("vec.ind.next");
2417 
2418   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2419   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2420 }
2421 
2422 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2423   return Cost->isScalarAfterVectorization(I, VF) ||
2424          Cost->isProfitableToScalarize(I, VF);
2425 }
2426 
2427 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2428   if (shouldScalarizeInstruction(IV))
2429     return true;
2430   auto isScalarInst = [&](User *U) -> bool {
2431     auto *I = cast<Instruction>(U);
2432     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2433   };
2434   return llvm::any_of(IV->users(), isScalarInst);
2435 }
2436 
2437 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
2438                                                 const InductionDescriptor &ID,
2439                                                 Value *Start, TruncInst *Trunc,
2440                                                 VPValue *Def,
2441                                                 VPTransformState &State) {
2442   IRBuilder<> &Builder = State.Builder;
2443   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2444          "Primary induction variable must have an integer type");
2445   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2446 
2447   // The value from the original loop to which we are mapping the new induction
2448   // variable.
2449   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2450 
2451   auto &DL = EntryVal->getModule()->getDataLayout();
2452 
2453   // Generate code for the induction step. Note that induction steps are
2454   // required to be loop-invariant
2455   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2456     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2457            "Induction step should be loop invariant");
2458     if (PSE.getSE()->isSCEVable(IV->getType())) {
2459       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2460       return Exp.expandCodeFor(Step, Step->getType(),
2461                                State.CFG.VectorPreHeader->getTerminator());
2462     }
2463     return cast<SCEVUnknown>(Step)->getValue();
2464   };
2465 
2466   // The scalar value to broadcast. This is derived from the canonical
2467   // induction variable. If a truncation type is given, truncate the canonical
2468   // induction variable and step. Otherwise, derive these values from the
2469   // induction descriptor.
2470   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2471     Value *ScalarIV = Induction;
2472     if (IV != OldInduction) {
2473       ScalarIV = IV->getType()->isIntegerTy()
2474                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2475                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2476                                           IV->getType());
2477       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
2478                                       State.CFG.PrevBB);
2479       ScalarIV->setName("offset.idx");
2480     }
2481     if (Trunc) {
2482       auto *TruncType = cast<IntegerType>(Trunc->getType());
2483       assert(Step->getType()->isIntegerTy() &&
2484              "Truncation requires an integer step");
2485       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2486       Step = Builder.CreateTrunc(Step, TruncType);
2487     }
2488     return ScalarIV;
2489   };
2490 
2491   // Create the vector values from the scalar IV, in the absence of creating a
2492   // vector IV.
2493   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2494     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2495     for (unsigned Part = 0; Part < UF; ++Part) {
2496       assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
2497       Value *StartIdx;
2498       if (Step->getType()->isFloatingPointTy())
2499         StartIdx =
2500             getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
2501       else
2502         StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
2503 
2504       Value *EntryPart =
2505           getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
2506       State.set(Def, EntryPart, Part);
2507       if (Trunc)
2508         addMetadata(EntryPart, Trunc);
2509     }
2510   };
2511 
2512   // Fast-math-flags propagate from the original induction instruction.
2513   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2514   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2515     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2516 
2517   // Now do the actual transformations, and start with creating the step value.
2518   Value *Step = CreateStepValue(ID.getStep());
2519   if (State.VF.isZero() || State.VF.isScalar()) {
2520     Value *ScalarIV = CreateScalarIV(Step);
2521     CreateSplatIV(ScalarIV, Step);
2522     return;
2523   }
2524 
2525   // Determine if we want a scalar version of the induction variable. This is
2526   // true if the induction variable itself is not widened, or if it has at
2527   // least one user in the loop that is not widened.
2528   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2529   if (!NeedsScalarIV) {
2530     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2531     return;
2532   }
2533 
2534   // Try to create a new independent vector induction variable. If we can't
2535   // create the phi node, we will splat the scalar induction variable in each
2536   // loop iteration.
2537   if (!shouldScalarizeInstruction(EntryVal)) {
2538     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2539     Value *ScalarIV = CreateScalarIV(Step);
2540     // Create scalar steps that can be used by instructions we will later
2541     // scalarize. Note that the addition of the scalar steps will not increase
2542     // the number of instructions in the loop in the common case prior to
2543     // InstCombine. We will be trading one vector extract for each scalar step.
2544     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2545     return;
2546   }
2547 
2548   // All IV users are scalar instructions, so only emit a scalar IV, not a
2549   // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2550   // predicate used by the masked loads/stores.
2551   Value *ScalarIV = CreateScalarIV(Step);
2552   if (!Cost->isScalarEpilogueAllowed())
2553     CreateSplatIV(ScalarIV, Step);
2554   buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2555 }
2556 
2557 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
2558                                           Value *Step,
2559                                           Instruction::BinaryOps BinOp) {
2560   // Create and check the types.
2561   auto *ValVTy = cast<VectorType>(Val->getType());
2562   ElementCount VLen = ValVTy->getElementCount();
2563 
2564   Type *STy = Val->getType()->getScalarType();
2565   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2566          "Induction Step must be an integer or FP");
2567   assert(Step->getType() == STy && "Step has wrong type");
2568 
2569   SmallVector<Constant *, 8> Indices;
2570 
2571   // Create a vector of consecutive numbers from zero to VF.
2572   VectorType *InitVecValVTy = ValVTy;
2573   Type *InitVecValSTy = STy;
2574   if (STy->isFloatingPointTy()) {
2575     InitVecValSTy =
2576         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2577     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2578   }
2579   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2580 
2581   // Splat the StartIdx
2582   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2583 
2584   if (STy->isIntegerTy()) {
2585     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2586     Step = Builder.CreateVectorSplat(VLen, Step);
2587     assert(Step->getType() == Val->getType() && "Invalid step vec");
2588     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2589     // which can be found from the original scalar operations.
2590     Step = Builder.CreateMul(InitVec, Step);
2591     return Builder.CreateAdd(Val, Step, "induction");
2592   }
2593 
2594   // Floating point induction.
2595   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2596          "Binary Opcode should be specified for FP induction");
2597   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2598   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2599 
2600   Step = Builder.CreateVectorSplat(VLen, Step);
2601   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2602   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2603 }
2604 
2605 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2606                                            Instruction *EntryVal,
2607                                            const InductionDescriptor &ID,
2608                                            VPValue *Def,
2609                                            VPTransformState &State) {
2610   IRBuilder<> &Builder = State.Builder;
2611   // We shouldn't have to build scalar steps if we aren't vectorizing.
2612   assert(State.VF.isVector() && "VF should be greater than one");
2613   // Get the value type and ensure it and the step have the same integer type.
2614   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2615   assert(ScalarIVTy == Step->getType() &&
2616          "Val and Step should have the same type");
2617 
2618   // We build scalar steps for both integer and floating-point induction
2619   // variables. Here, we determine the kind of arithmetic we will perform.
2620   Instruction::BinaryOps AddOp;
2621   Instruction::BinaryOps MulOp;
2622   if (ScalarIVTy->isIntegerTy()) {
2623     AddOp = Instruction::Add;
2624     MulOp = Instruction::Mul;
2625   } else {
2626     AddOp = ID.getInductionOpcode();
2627     MulOp = Instruction::FMul;
2628   }
2629 
2630   // Determine the number of scalars we need to generate for each unroll
2631   // iteration. If EntryVal is uniform, we only need to generate the first
2632   // lane. Otherwise, we generate all VF values.
2633   bool IsUniform =
2634       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
2635   unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
2636   // Compute the scalar steps and save the results in State.
2637   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2638                                      ScalarIVTy->getScalarSizeInBits());
2639   Type *VecIVTy = nullptr;
2640   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2641   if (!IsUniform && State.VF.isScalable()) {
2642     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2643     UnitStepVec =
2644         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2645     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2646     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2647   }
2648 
2649   for (unsigned Part = 0; Part < State.UF; ++Part) {
2650     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2651 
2652     if (!IsUniform && State.VF.isScalable()) {
2653       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2654       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2655       if (ScalarIVTy->isFloatingPointTy())
2656         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2657       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2658       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2659       State.set(Def, Add, Part);
2660       // It's useful to record the lane values too for the known minimum number
2661       // of elements so we do those below. This improves the code quality when
2662       // trying to extract the first element, for example.
2663     }
2664 
2665     if (ScalarIVTy->isFloatingPointTy())
2666       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2667 
2668     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2669       Value *StartIdx = Builder.CreateBinOp(
2670           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2671       // The step returned by `createStepForVF` is a runtime-evaluated value
2672       // when VF is scalable. Otherwise, it should be folded into a Constant.
2673       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2674              "Expected StartIdx to be folded to a constant when VF is not "
2675              "scalable");
2676       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2677       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2678       State.set(Def, Add, VPIteration(Part, Lane));
2679     }
2680   }
2681 }
2682 
2683 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2684                                                     const VPIteration &Instance,
2685                                                     VPTransformState &State) {
2686   Value *ScalarInst = State.get(Def, Instance);
2687   Value *VectorValue = State.get(Def, Instance.Part);
2688   VectorValue = Builder.CreateInsertElement(
2689       VectorValue, ScalarInst,
2690       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2691   State.set(Def, VectorValue, Instance.Part);
2692 }
2693 
2694 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2695   assert(Vec->getType()->isVectorTy() && "Invalid type");
2696   return Builder.CreateVectorReverse(Vec, "reverse");
2697 }
2698 
2699 // Return whether we allow using masked interleave-groups (for dealing with
2700 // strided loads/stores that reside in predicated blocks, or for dealing
2701 // with gaps).
2702 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2703   // If an override option has been passed in for interleaved accesses, use it.
2704   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2705     return EnableMaskedInterleavedMemAccesses;
2706 
2707   return TTI.enableMaskedInterleavedAccessVectorization();
2708 }
2709 
2710 // Try to vectorize the interleave group that \p Instr belongs to.
2711 //
2712 // E.g. Translate following interleaved load group (factor = 3):
2713 //   for (i = 0; i < N; i+=3) {
2714 //     R = Pic[i];             // Member of index 0
2715 //     G = Pic[i+1];           // Member of index 1
2716 //     B = Pic[i+2];           // Member of index 2
2717 //     ... // do something to R, G, B
2718 //   }
2719 // To:
2720 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2721 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2722 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2723 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2724 //
2725 // Or translate following interleaved store group (factor = 3):
2726 //   for (i = 0; i < N; i+=3) {
2727 //     ... do something to R, G, B
2728 //     Pic[i]   = R;           // Member of index 0
2729 //     Pic[i+1] = G;           // Member of index 1
2730 //     Pic[i+2] = B;           // Member of index 2
2731 //   }
2732 // To:
2733 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2734 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2735 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2736 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2737 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2738 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2739     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2740     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2741     VPValue *BlockInMask) {
2742   Instruction *Instr = Group->getInsertPos();
2743   const DataLayout &DL = Instr->getModule()->getDataLayout();
2744 
2745   // Prepare for the vector type of the interleaved load/store.
2746   Type *ScalarTy = getLoadStoreType(Instr);
2747   unsigned InterleaveFactor = Group->getFactor();
2748   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2749   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2750 
2751   // Prepare for the new pointers.
2752   SmallVector<Value *, 2> AddrParts;
2753   unsigned Index = Group->getIndex(Instr);
2754 
2755   // TODO: extend the masked interleaved-group support to reversed access.
2756   assert((!BlockInMask || !Group->isReverse()) &&
2757          "Reversed masked interleave-group not supported.");
2758 
2759   // If the group is reverse, adjust the index to refer to the last vector lane
2760   // instead of the first. We adjust the index from the first vector lane,
2761   // rather than directly getting the pointer for lane VF - 1, because the
2762   // pointer operand of the interleaved access is supposed to be uniform. For
2763   // uniform instructions, we're only required to generate a value for the
2764   // first vector lane in each unroll iteration.
2765   if (Group->isReverse())
2766     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2767 
2768   for (unsigned Part = 0; Part < UF; Part++) {
2769     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2770     setDebugLocFromInst(AddrPart);
2771 
2772     // Notice current instruction could be any index. Need to adjust the address
2773     // to the member of index 0.
2774     //
2775     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2776     //       b = A[i];       // Member of index 0
2777     // Current pointer is pointed to A[i+1], adjust it to A[i].
2778     //
2779     // E.g.  A[i+1] = a;     // Member of index 1
2780     //       A[i]   = b;     // Member of index 0
2781     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2782     // Current pointer is pointed to A[i+2], adjust it to A[i].
2783 
2784     bool InBounds = false;
2785     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2786       InBounds = gep->isInBounds();
2787     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2788     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2789 
2790     // Cast to the vector pointer type.
2791     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2792     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2793     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2794   }
2795 
2796   setDebugLocFromInst(Instr);
2797   Value *PoisonVec = PoisonValue::get(VecTy);
2798 
2799   Value *MaskForGaps = nullptr;
2800   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2801     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2802     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2803   }
2804 
2805   // Vectorize the interleaved load group.
2806   if (isa<LoadInst>(Instr)) {
2807     // For each unroll part, create a wide load for the group.
2808     SmallVector<Value *, 2> NewLoads;
2809     for (unsigned Part = 0; Part < UF; Part++) {
2810       Instruction *NewLoad;
2811       if (BlockInMask || MaskForGaps) {
2812         assert(useMaskedInterleavedAccesses(*TTI) &&
2813                "masked interleaved groups are not allowed.");
2814         Value *GroupMask = MaskForGaps;
2815         if (BlockInMask) {
2816           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2817           Value *ShuffledMask = Builder.CreateShuffleVector(
2818               BlockInMaskPart,
2819               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2820               "interleaved.mask");
2821           GroupMask = MaskForGaps
2822                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2823                                                 MaskForGaps)
2824                           : ShuffledMask;
2825         }
2826         NewLoad =
2827             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2828                                      GroupMask, PoisonVec, "wide.masked.vec");
2829       }
2830       else
2831         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2832                                             Group->getAlign(), "wide.vec");
2833       Group->addMetadata(NewLoad);
2834       NewLoads.push_back(NewLoad);
2835     }
2836 
2837     // For each member in the group, shuffle out the appropriate data from the
2838     // wide loads.
2839     unsigned J = 0;
2840     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2841       Instruction *Member = Group->getMember(I);
2842 
2843       // Skip the gaps in the group.
2844       if (!Member)
2845         continue;
2846 
2847       auto StrideMask =
2848           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2849       for (unsigned Part = 0; Part < UF; Part++) {
2850         Value *StridedVec = Builder.CreateShuffleVector(
2851             NewLoads[Part], StrideMask, "strided.vec");
2852 
2853         // If this member has different type, cast the result type.
2854         if (Member->getType() != ScalarTy) {
2855           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2856           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2857           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2858         }
2859 
2860         if (Group->isReverse())
2861           StridedVec = reverseVector(StridedVec);
2862 
2863         State.set(VPDefs[J], StridedVec, Part);
2864       }
2865       ++J;
2866     }
2867     return;
2868   }
2869 
2870   // The sub vector type for current instruction.
2871   auto *SubVT = VectorType::get(ScalarTy, VF);
2872 
2873   // Vectorize the interleaved store group.
2874   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2875   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2876          "masked interleaved groups are not allowed.");
2877   assert((!MaskForGaps || !VF.isScalable()) &&
2878          "masking gaps for scalable vectors is not yet supported.");
2879   for (unsigned Part = 0; Part < UF; Part++) {
2880     // Collect the stored vector from each member.
2881     SmallVector<Value *, 4> StoredVecs;
2882     for (unsigned i = 0; i < InterleaveFactor; i++) {
2883       assert((Group->getMember(i) || MaskForGaps) &&
2884              "Fail to get a member from an interleaved store group");
2885       Instruction *Member = Group->getMember(i);
2886 
2887       // Skip the gaps in the group.
2888       if (!Member) {
2889         Value *Undef = PoisonValue::get(SubVT);
2890         StoredVecs.push_back(Undef);
2891         continue;
2892       }
2893 
2894       Value *StoredVec = State.get(StoredValues[i], Part);
2895 
2896       if (Group->isReverse())
2897         StoredVec = reverseVector(StoredVec);
2898 
2899       // If this member has different type, cast it to a unified type.
2900 
2901       if (StoredVec->getType() != SubVT)
2902         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2903 
2904       StoredVecs.push_back(StoredVec);
2905     }
2906 
2907     // Concatenate all vectors into a wide vector.
2908     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2909 
2910     // Interleave the elements in the wide vector.
2911     Value *IVec = Builder.CreateShuffleVector(
2912         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2913         "interleaved.vec");
2914 
2915     Instruction *NewStoreInstr;
2916     if (BlockInMask || MaskForGaps) {
2917       Value *GroupMask = MaskForGaps;
2918       if (BlockInMask) {
2919         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2920         Value *ShuffledMask = Builder.CreateShuffleVector(
2921             BlockInMaskPart,
2922             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2923             "interleaved.mask");
2924         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2925                                                       ShuffledMask, MaskForGaps)
2926                                 : ShuffledMask;
2927       }
2928       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2929                                                 Group->getAlign(), GroupMask);
2930     } else
2931       NewStoreInstr =
2932           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2933 
2934     Group->addMetadata(NewStoreInstr);
2935   }
2936 }
2937 
2938 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2939                                                VPReplicateRecipe *RepRecipe,
2940                                                const VPIteration &Instance,
2941                                                bool IfPredicateInstr,
2942                                                VPTransformState &State) {
2943   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2944 
2945   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2946   // the first lane and part.
2947   if (isa<NoAliasScopeDeclInst>(Instr))
2948     if (!Instance.isFirstIteration())
2949       return;
2950 
2951   setDebugLocFromInst(Instr);
2952 
2953   // Does this instruction return a value ?
2954   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2955 
2956   Instruction *Cloned = Instr->clone();
2957   if (!IsVoidRetTy)
2958     Cloned->setName(Instr->getName() + ".cloned");
2959 
2960   // If the scalarized instruction contributes to the address computation of a
2961   // widen masked load/store which was in a basic block that needed predication
2962   // and is not predicated after vectorization, we can't propagate
2963   // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2964   // instruction could feed a poison value to the base address of the widen
2965   // load/store.
2966   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2967     Cloned->dropPoisonGeneratingFlags();
2968 
2969   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2970                                Builder.GetInsertPoint());
2971   // Replace the operands of the cloned instructions with their scalar
2972   // equivalents in the new loop.
2973   for (auto &I : enumerate(RepRecipe->operands())) {
2974     auto InputInstance = Instance;
2975     VPValue *Operand = I.value();
2976     if (State.Plan->isUniformAfterVectorization(Operand))
2977       InputInstance.Lane = VPLane::getFirstLane();
2978     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2979   }
2980   addNewMetadata(Cloned, Instr);
2981 
2982   // Place the cloned scalar in the new loop.
2983   Builder.Insert(Cloned);
2984 
2985   State.set(RepRecipe, Cloned, Instance);
2986 
2987   // If we just cloned a new assumption, add it the assumption cache.
2988   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2989     AC->registerAssumption(II);
2990 
2991   // End if-block.
2992   if (IfPredicateInstr)
2993     PredicatedInstructions.push_back(Cloned);
2994 }
2995 
2996 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2997                                                       Value *End, Value *Step,
2998                                                       Instruction *DL) {
2999   BasicBlock *Header = L->getHeader();
3000   BasicBlock *Latch = L->getLoopLatch();
3001   // As we're just creating this loop, it's possible no latch exists
3002   // yet. If so, use the header as this will be a single block loop.
3003   if (!Latch)
3004     Latch = Header;
3005 
3006   IRBuilder<> B(&*Header->getFirstInsertionPt());
3007   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3008   setDebugLocFromInst(OldInst, &B);
3009   auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
3010 
3011   B.SetInsertPoint(Latch->getTerminator());
3012   setDebugLocFromInst(OldInst, &B);
3013 
3014   // Create i+1 and fill the PHINode.
3015   //
3016   // If the tail is not folded, we know that End - Start >= Step (either
3017   // statically or through the minimum iteration checks). We also know that both
3018   // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3019   // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3020   // overflows and we can mark the induction increment as NUW.
3021   Value *Next = B.CreateAdd(Induction, Step, "index.next",
3022                             /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
3023   Induction->addIncoming(Start, L->getLoopPreheader());
3024   Induction->addIncoming(Next, Latch);
3025   // Create the compare.
3026   Value *ICmp = B.CreateICmpEQ(Next, End);
3027   B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3028 
3029   // Now we have two terminators. Remove the old one from the block.
3030   Latch->getTerminator()->eraseFromParent();
3031 
3032   return Induction;
3033 }
3034 
3035 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3036   if (TripCount)
3037     return TripCount;
3038 
3039   assert(L && "Create Trip Count for null loop.");
3040   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3041   // Find the loop boundaries.
3042   ScalarEvolution *SE = PSE.getSE();
3043   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3044   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3045          "Invalid loop count");
3046 
3047   Type *IdxTy = Legal->getWidestInductionType();
3048   assert(IdxTy && "No type for induction");
3049 
3050   // The exit count might have the type of i64 while the phi is i32. This can
3051   // happen if we have an induction variable that is sign extended before the
3052   // compare. The only way that we get a backedge taken count is that the
3053   // induction variable was signed and as such will not overflow. In such a case
3054   // truncation is legal.
3055   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3056       IdxTy->getPrimitiveSizeInBits())
3057     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3058   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3059 
3060   // Get the total trip count from the count by adding 1.
3061   const SCEV *ExitCount = SE->getAddExpr(
3062       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3063 
3064   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3065 
3066   // Expand the trip count and place the new instructions in the preheader.
3067   // Notice that the pre-header does not change, only the loop body.
3068   SCEVExpander Exp(*SE, DL, "induction");
3069 
3070   // Count holds the overall loop count (N).
3071   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3072                                 L->getLoopPreheader()->getTerminator());
3073 
3074   if (TripCount->getType()->isPointerTy())
3075     TripCount =
3076         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3077                                     L->getLoopPreheader()->getTerminator());
3078 
3079   return TripCount;
3080 }
3081 
3082 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3083   if (VectorTripCount)
3084     return VectorTripCount;
3085 
3086   Value *TC = getOrCreateTripCount(L);
3087   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3088 
3089   Type *Ty = TC->getType();
3090   // This is where we can make the step a runtime constant.
3091   Value *Step = createStepForVF(Builder, Ty, VF, UF);
3092 
3093   // If the tail is to be folded by masking, round the number of iterations N
3094   // up to a multiple of Step instead of rounding down. This is done by first
3095   // adding Step-1 and then rounding down. Note that it's ok if this addition
3096   // overflows: the vector induction variable will eventually wrap to zero given
3097   // that it starts at zero and its Step is a power of two; the loop will then
3098   // exit, with the last early-exit vector comparison also producing all-true.
3099   if (Cost->foldTailByMasking()) {
3100     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3101            "VF*UF must be a power of 2 when folding tail by masking");
3102     assert(!VF.isScalable() &&
3103            "Tail folding not yet supported for scalable vectors");
3104     TC = Builder.CreateAdd(
3105         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3106   }
3107 
3108   // Now we need to generate the expression for the part of the loop that the
3109   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3110   // iterations are not required for correctness, or N - Step, otherwise. Step
3111   // is equal to the vectorization factor (number of SIMD elements) times the
3112   // unroll factor (number of SIMD instructions).
3113   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3114 
3115   // There are cases where we *must* run at least one iteration in the remainder
3116   // loop.  See the cost model for when this can happen.  If the step evenly
3117   // divides the trip count, we set the remainder to be equal to the step. If
3118   // the step does not evenly divide the trip count, no adjustment is necessary
3119   // since there will already be scalar iterations. Note that the minimum
3120   // iterations check ensures that N >= Step.
3121   if (Cost->requiresScalarEpilogue(VF)) {
3122     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3123     R = Builder.CreateSelect(IsZero, Step, R);
3124   }
3125 
3126   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3127 
3128   return VectorTripCount;
3129 }
3130 
3131 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3132                                                    const DataLayout &DL) {
3133   // Verify that V is a vector type with same number of elements as DstVTy.
3134   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3135   unsigned VF = DstFVTy->getNumElements();
3136   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3137   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3138   Type *SrcElemTy = SrcVecTy->getElementType();
3139   Type *DstElemTy = DstFVTy->getElementType();
3140   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3141          "Vector elements must have same size");
3142 
3143   // Do a direct cast if element types are castable.
3144   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3145     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3146   }
3147   // V cannot be directly casted to desired vector type.
3148   // May happen when V is a floating point vector but DstVTy is a vector of
3149   // pointers or vice-versa. Handle this using a two-step bitcast using an
3150   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3151   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3152          "Only one type should be a pointer type");
3153   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3154          "Only one type should be a floating point type");
3155   Type *IntTy =
3156       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3157   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3158   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3159   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3160 }
3161 
3162 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3163                                                          BasicBlock *Bypass) {
3164   Value *Count = getOrCreateTripCount(L);
3165   // Reuse existing vector loop preheader for TC checks.
3166   // Note that new preheader block is generated for vector loop.
3167   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3168   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3169 
3170   // Generate code to check if the loop's trip count is less than VF * UF, or
3171   // equal to it in case a scalar epilogue is required; this implies that the
3172   // vector trip count is zero. This check also covers the case where adding one
3173   // to the backedge-taken count overflowed leading to an incorrect trip count
3174   // of zero. In this case we will also jump to the scalar loop.
3175   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3176                                             : ICmpInst::ICMP_ULT;
3177 
3178   // If tail is to be folded, vector loop takes care of all iterations.
3179   Value *CheckMinIters = Builder.getFalse();
3180   if (!Cost->foldTailByMasking()) {
3181     Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3182     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3183   }
3184   // Create new preheader for vector loop.
3185   LoopVectorPreHeader =
3186       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3187                  "vector.ph");
3188 
3189   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3190                                DT->getNode(Bypass)->getIDom()) &&
3191          "TC check is expected to dominate Bypass");
3192 
3193   // Update dominator for Bypass & LoopExit (if needed).
3194   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3195   if (!Cost->requiresScalarEpilogue(VF))
3196     // If there is an epilogue which must run, there's no edge from the
3197     // middle block to exit blocks  and thus no need to update the immediate
3198     // dominator of the exit blocks.
3199     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3200 
3201   ReplaceInstWithInst(
3202       TCCheckBlock->getTerminator(),
3203       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3204   LoopBypassBlocks.push_back(TCCheckBlock);
3205 }
3206 
3207 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3208 
3209   BasicBlock *const SCEVCheckBlock =
3210       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3211   if (!SCEVCheckBlock)
3212     return nullptr;
3213 
3214   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3215            (OptForSizeBasedOnProfile &&
3216             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3217          "Cannot SCEV check stride or overflow when optimizing for size");
3218 
3219 
3220   // Update dominator only if this is first RT check.
3221   if (LoopBypassBlocks.empty()) {
3222     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3223     if (!Cost->requiresScalarEpilogue(VF))
3224       // If there is an epilogue which must run, there's no edge from the
3225       // middle block to exit blocks  and thus no need to update the immediate
3226       // dominator of the exit blocks.
3227       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3228   }
3229 
3230   LoopBypassBlocks.push_back(SCEVCheckBlock);
3231   AddedSafetyChecks = true;
3232   return SCEVCheckBlock;
3233 }
3234 
3235 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3236                                                       BasicBlock *Bypass) {
3237   // VPlan-native path does not do any analysis for runtime checks currently.
3238   if (EnableVPlanNativePath)
3239     return nullptr;
3240 
3241   BasicBlock *const MemCheckBlock =
3242       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3243 
3244   // Check if we generated code that checks in runtime if arrays overlap. We put
3245   // the checks into a separate block to make the more common case of few
3246   // elements faster.
3247   if (!MemCheckBlock)
3248     return nullptr;
3249 
3250   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3251     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3252            "Cannot emit memory checks when optimizing for size, unless forced "
3253            "to vectorize.");
3254     ORE->emit([&]() {
3255       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3256                                         L->getStartLoc(), L->getHeader())
3257              << "Code-size may be reduced by not forcing "
3258                 "vectorization, or by source-code modifications "
3259                 "eliminating the need for runtime checks "
3260                 "(e.g., adding 'restrict').";
3261     });
3262   }
3263 
3264   LoopBypassBlocks.push_back(MemCheckBlock);
3265 
3266   AddedSafetyChecks = true;
3267 
3268   // We currently don't use LoopVersioning for the actual loop cloning but we
3269   // still use it to add the noalias metadata.
3270   LVer = std::make_unique<LoopVersioning>(
3271       *Legal->getLAI(),
3272       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3273       DT, PSE.getSE());
3274   LVer->prepareNoAliasMetadata();
3275   return MemCheckBlock;
3276 }
3277 
3278 Value *InnerLoopVectorizer::emitTransformedIndex(
3279     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3280     const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
3281 
3282   SCEVExpander Exp(*SE, DL, "induction");
3283   auto Step = ID.getStep();
3284   auto StartValue = ID.getStartValue();
3285   assert(Index->getType()->getScalarType() == Step->getType() &&
3286          "Index scalar type does not match StepValue type");
3287 
3288   // Note: the IR at this point is broken. We cannot use SE to create any new
3289   // SCEV and then expand it, hoping that SCEV's simplification will give us
3290   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
3291   // lead to various SCEV crashes. So all we can do is to use builder and rely
3292   // on InstCombine for future simplifications. Here we handle some trivial
3293   // cases only.
3294   auto CreateAdd = [&B](Value *X, Value *Y) {
3295     assert(X->getType() == Y->getType() && "Types don't match!");
3296     if (auto *CX = dyn_cast<ConstantInt>(X))
3297       if (CX->isZero())
3298         return Y;
3299     if (auto *CY = dyn_cast<ConstantInt>(Y))
3300       if (CY->isZero())
3301         return X;
3302     return B.CreateAdd(X, Y);
3303   };
3304 
3305   // We allow X to be a vector type, in which case Y will potentially be
3306   // splatted into a vector with the same element count.
3307   auto CreateMul = [&B](Value *X, Value *Y) {
3308     assert(X->getType()->getScalarType() == Y->getType() &&
3309            "Types don't match!");
3310     if (auto *CX = dyn_cast<ConstantInt>(X))
3311       if (CX->isOne())
3312         return Y;
3313     if (auto *CY = dyn_cast<ConstantInt>(Y))
3314       if (CY->isOne())
3315         return X;
3316     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3317     if (XVTy && !isa<VectorType>(Y->getType()))
3318       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3319     return B.CreateMul(X, Y);
3320   };
3321 
3322   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3323   // loop, choose the end of the vector loop header (=VectorHeader), because
3324   // the DomTree is not kept up-to-date for additional blocks generated in the
3325   // vector loop. By using the header as insertion point, we guarantee that the
3326   // expanded instructions dominate all their uses.
3327   auto GetInsertPoint = [this, &B, VectorHeader]() {
3328     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3329     if (InsertBB != LoopVectorBody &&
3330         LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
3331       return VectorHeader->getTerminator();
3332     return &*B.GetInsertPoint();
3333   };
3334 
3335   switch (ID.getKind()) {
3336   case InductionDescriptor::IK_IntInduction: {
3337     assert(!isa<VectorType>(Index->getType()) &&
3338            "Vector indices not supported for integer inductions yet");
3339     assert(Index->getType() == StartValue->getType() &&
3340            "Index type does not match StartValue type");
3341     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3342       return B.CreateSub(StartValue, Index);
3343     auto *Offset = CreateMul(
3344         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3345     return CreateAdd(StartValue, Offset);
3346   }
3347   case InductionDescriptor::IK_PtrInduction: {
3348     assert(isa<SCEVConstant>(Step) &&
3349            "Expected constant step for pointer induction");
3350     return B.CreateGEP(
3351         ID.getElementType(), StartValue,
3352         CreateMul(Index,
3353                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3354                                     GetInsertPoint())));
3355   }
3356   case InductionDescriptor::IK_FpInduction: {
3357     assert(!isa<VectorType>(Index->getType()) &&
3358            "Vector indices not supported for FP inductions yet");
3359     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3360     auto InductionBinOp = ID.getInductionBinOp();
3361     assert(InductionBinOp &&
3362            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3363             InductionBinOp->getOpcode() == Instruction::FSub) &&
3364            "Original bin op should be defined for FP induction");
3365 
3366     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3367     Value *MulExp = B.CreateFMul(StepValue, Index);
3368     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3369                          "induction");
3370   }
3371   case InductionDescriptor::IK_NoInduction:
3372     return nullptr;
3373   }
3374   llvm_unreachable("invalid enum");
3375 }
3376 
3377 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3378   LoopScalarBody = OrigLoop->getHeader();
3379   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3380   assert(LoopVectorPreHeader && "Invalid loop structure");
3381   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3382   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3383          "multiple exit loop without required epilogue?");
3384 
3385   LoopMiddleBlock =
3386       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3387                  LI, nullptr, Twine(Prefix) + "middle.block");
3388   LoopScalarPreHeader =
3389       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3390                  nullptr, Twine(Prefix) + "scalar.ph");
3391 
3392   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3393 
3394   // Set up the middle block terminator.  Two cases:
3395   // 1) If we know that we must execute the scalar epilogue, emit an
3396   //    unconditional branch.
3397   // 2) Otherwise, we must have a single unique exit block (due to how we
3398   //    implement the multiple exit case).  In this case, set up a conditonal
3399   //    branch from the middle block to the loop scalar preheader, and the
3400   //    exit block.  completeLoopSkeleton will update the condition to use an
3401   //    iteration check, if required to decide whether to execute the remainder.
3402   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3403     BranchInst::Create(LoopScalarPreHeader) :
3404     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3405                        Builder.getTrue());
3406   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3407   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3408 
3409   // We intentionally don't let SplitBlock to update LoopInfo since
3410   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
3411   // LoopVectorBody is explicitly added to the correct place few lines later.
3412   LoopVectorBody =
3413       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3414                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3415 
3416   // Update dominator for loop exit.
3417   if (!Cost->requiresScalarEpilogue(VF))
3418     // If there is an epilogue which must run, there's no edge from the
3419     // middle block to exit blocks  and thus no need to update the immediate
3420     // dominator of the exit blocks.
3421     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3422 
3423   // Create and register the new vector loop.
3424   Loop *Lp = LI->AllocateLoop();
3425   Loop *ParentLoop = OrigLoop->getParentLoop();
3426 
3427   // Insert the new loop into the loop nest and register the new basic blocks
3428   // before calling any utilities such as SCEV that require valid LoopInfo.
3429   if (ParentLoop) {
3430     ParentLoop->addChildLoop(Lp);
3431   } else {
3432     LI->addTopLevelLoop(Lp);
3433   }
3434   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3435   return Lp;
3436 }
3437 
3438 void InnerLoopVectorizer::createInductionResumeValues(
3439     Loop *L, Value *VectorTripCount,
3440     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3441   assert(VectorTripCount && L && "Expected valid arguments");
3442   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3443           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3444          "Inconsistent information about additional bypass.");
3445   // We are going to resume the execution of the scalar loop.
3446   // Go over all of the induction variables that we found and fix the
3447   // PHIs that are left in the scalar version of the loop.
3448   // The starting values of PHI nodes depend on the counter of the last
3449   // iteration in the vectorized loop.
3450   // If we come from a bypass edge then we need to start from the original
3451   // start value.
3452   for (auto &InductionEntry : Legal->getInductionVars()) {
3453     PHINode *OrigPhi = InductionEntry.first;
3454     InductionDescriptor II = InductionEntry.second;
3455 
3456     // Create phi nodes to merge from the  backedge-taken check block.
3457     PHINode *BCResumeVal =
3458         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3459                         LoopScalarPreHeader->getTerminator());
3460     // Copy original phi DL over to the new one.
3461     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3462     Value *&EndValue = IVEndValues[OrigPhi];
3463     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3464     if (OrigPhi == OldInduction) {
3465       // We know what the end value is.
3466       EndValue = VectorTripCount;
3467     } else {
3468       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3469 
3470       // Fast-math-flags propagate from the original induction instruction.
3471       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3472         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3473 
3474       Type *StepType = II.getStep()->getType();
3475       Instruction::CastOps CastOp =
3476           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3477       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3478       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3479       EndValue =
3480           emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3481       EndValue->setName("ind.end");
3482 
3483       // Compute the end value for the additional bypass (if applicable).
3484       if (AdditionalBypass.first) {
3485         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3486         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3487                                          StepType, true);
3488         CRD =
3489             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3490         EndValueFromAdditionalBypass =
3491             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3492         EndValueFromAdditionalBypass->setName("ind.end");
3493       }
3494     }
3495     // The new PHI merges the original incoming value, in case of a bypass,
3496     // or the value at the end of the vectorized loop.
3497     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3498 
3499     // Fix the scalar body counter (PHI node).
3500     // The old induction's phi node in the scalar body needs the truncated
3501     // value.
3502     for (BasicBlock *BB : LoopBypassBlocks)
3503       BCResumeVal->addIncoming(II.getStartValue(), BB);
3504 
3505     if (AdditionalBypass.first)
3506       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3507                                             EndValueFromAdditionalBypass);
3508 
3509     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3510   }
3511 }
3512 
3513 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3514                                                       MDNode *OrigLoopID) {
3515   assert(L && "Expected valid loop.");
3516 
3517   // The trip counts should be cached by now.
3518   Value *Count = getOrCreateTripCount(L);
3519   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3520 
3521   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3522 
3523   // Add a check in the middle block to see if we have completed
3524   // all of the iterations in the first vector loop.  Three cases:
3525   // 1) If we require a scalar epilogue, there is no conditional branch as
3526   //    we unconditionally branch to the scalar preheader.  Do nothing.
3527   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3528   //    Thus if tail is to be folded, we know we don't need to run the
3529   //    remainder and we can use the previous value for the condition (true).
3530   // 3) Otherwise, construct a runtime check.
3531   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3532     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3533                                         Count, VectorTripCount, "cmp.n",
3534                                         LoopMiddleBlock->getTerminator());
3535 
3536     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3537     // of the corresponding compare because they may have ended up with
3538     // different line numbers and we want to avoid awkward line stepping while
3539     // debugging. Eg. if the compare has got a line number inside the loop.
3540     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3541     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3542   }
3543 
3544   // Get ready to start creating new instructions into the vectorized body.
3545   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3546          "Inconsistent vector loop preheader");
3547   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3548 
3549   Optional<MDNode *> VectorizedLoopID =
3550       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3551                                       LLVMLoopVectorizeFollowupVectorized});
3552   if (VectorizedLoopID.hasValue()) {
3553     L->setLoopID(VectorizedLoopID.getValue());
3554 
3555     // Do not setAlreadyVectorized if loop attributes have been defined
3556     // explicitly.
3557     return LoopVectorPreHeader;
3558   }
3559 
3560   // Keep all loop hints from the original loop on the vector loop (we'll
3561   // replace the vectorizer-specific hints below).
3562   if (MDNode *LID = OrigLoop->getLoopID())
3563     L->setLoopID(LID);
3564 
3565   LoopVectorizeHints Hints(L, true, *ORE, TTI);
3566   Hints.setAlreadyVectorized();
3567 
3568 #ifdef EXPENSIVE_CHECKS
3569   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3570   LI->verify(*DT);
3571 #endif
3572 
3573   return LoopVectorPreHeader;
3574 }
3575 
3576 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3577   /*
3578    In this function we generate a new loop. The new loop will contain
3579    the vectorized instructions while the old loop will continue to run the
3580    scalar remainder.
3581 
3582        [ ] <-- loop iteration number check.
3583     /   |
3584    /    v
3585   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3586   |  /  |
3587   | /   v
3588   ||   [ ]     <-- vector pre header.
3589   |/    |
3590   |     v
3591   |    [  ] \
3592   |    [  ]_|   <-- vector loop.
3593   |     |
3594   |     v
3595   \   -[ ]   <--- middle-block.
3596    \/   |
3597    /\   v
3598    | ->[ ]     <--- new preheader.
3599    |    |
3600  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3601    |   [ ] \
3602    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3603     \   |
3604      \  v
3605       >[ ]     <-- exit block(s).
3606    ...
3607    */
3608 
3609   // Get the metadata of the original loop before it gets modified.
3610   MDNode *OrigLoopID = OrigLoop->getLoopID();
3611 
3612   // Workaround!  Compute the trip count of the original loop and cache it
3613   // before we start modifying the CFG.  This code has a systemic problem
3614   // wherein it tries to run analysis over partially constructed IR; this is
3615   // wrong, and not simply for SCEV.  The trip count of the original loop
3616   // simply happens to be prone to hitting this in practice.  In theory, we
3617   // can hit the same issue for any SCEV, or ValueTracking query done during
3618   // mutation.  See PR49900.
3619   getOrCreateTripCount(OrigLoop);
3620 
3621   // Create an empty vector loop, and prepare basic blocks for the runtime
3622   // checks.
3623   Loop *Lp = createVectorLoopSkeleton("");
3624 
3625   // Now, compare the new count to zero. If it is zero skip the vector loop and
3626   // jump to the scalar loop. This check also covers the case where the
3627   // backedge-taken count is uint##_max: adding one to it will overflow leading
3628   // to an incorrect trip count of zero. In this (rare) case we will also jump
3629   // to the scalar loop.
3630   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3631 
3632   // Generate the code to check any assumptions that we've made for SCEV
3633   // expressions.
3634   emitSCEVChecks(Lp, LoopScalarPreHeader);
3635 
3636   // Generate the code that checks in runtime if arrays overlap. We put the
3637   // checks into a separate block to make the more common case of few elements
3638   // faster.
3639   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3640 
3641   // Some loops have a single integer induction variable, while other loops
3642   // don't. One example is c++ iterators that often have multiple pointer
3643   // induction variables. In the code below we also support a case where we
3644   // don't have a single induction variable.
3645   //
3646   // We try to obtain an induction variable from the original loop as hard
3647   // as possible. However if we don't find one that:
3648   //   - is an integer
3649   //   - counts from zero, stepping by one
3650   //   - is the size of the widest induction variable type
3651   // then we create a new one.
3652   OldInduction = Legal->getPrimaryInduction();
3653   Type *IdxTy = Legal->getWidestInductionType();
3654   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3655   // The loop step is equal to the vectorization factor (num of SIMD elements)
3656   // times the unroll factor (num of SIMD instructions).
3657   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3658   Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
3659   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3660   Induction =
3661       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3662                               getDebugLocFromInstOrOperands(OldInduction));
3663 
3664   // Emit phis for the new starting index of the scalar loop.
3665   createInductionResumeValues(Lp, CountRoundDown);
3666 
3667   return completeLoopSkeleton(Lp, OrigLoopID);
3668 }
3669 
3670 // Fix up external users of the induction variable. At this point, we are
3671 // in LCSSA form, with all external PHIs that use the IV having one input value,
3672 // coming from the remainder loop. We need those PHIs to also have a correct
3673 // value for the IV when arriving directly from the middle block.
3674 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3675                                        const InductionDescriptor &II,
3676                                        Value *CountRoundDown, Value *EndValue,
3677                                        BasicBlock *MiddleBlock) {
3678   // There are two kinds of external IV usages - those that use the value
3679   // computed in the last iteration (the PHI) and those that use the penultimate
3680   // value (the value that feeds into the phi from the loop latch).
3681   // We allow both, but they, obviously, have different values.
3682 
3683   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3684 
3685   DenseMap<Value *, Value *> MissingVals;
3686 
3687   // An external user of the last iteration's value should see the value that
3688   // the remainder loop uses to initialize its own IV.
3689   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3690   for (User *U : PostInc->users()) {
3691     Instruction *UI = cast<Instruction>(U);
3692     if (!OrigLoop->contains(UI)) {
3693       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3694       MissingVals[UI] = EndValue;
3695     }
3696   }
3697 
3698   // An external user of the penultimate value need to see EndValue - Step.
3699   // The simplest way to get this is to recompute it from the constituent SCEVs,
3700   // that is Start + (Step * (CRD - 1)).
3701   for (User *U : OrigPhi->users()) {
3702     auto *UI = cast<Instruction>(U);
3703     if (!OrigLoop->contains(UI)) {
3704       const DataLayout &DL =
3705           OrigLoop->getHeader()->getModule()->getDataLayout();
3706       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3707 
3708       IRBuilder<> B(MiddleBlock->getTerminator());
3709 
3710       // Fast-math-flags propagate from the original induction instruction.
3711       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3712         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3713 
3714       Value *CountMinusOne = B.CreateSub(
3715           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3716       Value *CMO =
3717           !II.getStep()->getType()->isIntegerTy()
3718               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3719                              II.getStep()->getType())
3720               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3721       CMO->setName("cast.cmo");
3722       Value *Escape =
3723           emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
3724       Escape->setName("ind.escape");
3725       MissingVals[UI] = Escape;
3726     }
3727   }
3728 
3729   for (auto &I : MissingVals) {
3730     PHINode *PHI = cast<PHINode>(I.first);
3731     // One corner case we have to handle is two IVs "chasing" each-other,
3732     // that is %IV2 = phi [...], [ %IV1, %latch ]
3733     // In this case, if IV1 has an external use, we need to avoid adding both
3734     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3735     // don't already have an incoming value for the middle block.
3736     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3737       PHI->addIncoming(I.second, MiddleBlock);
3738   }
3739 }
3740 
3741 namespace {
3742 
3743 struct CSEDenseMapInfo {
3744   static bool canHandle(const Instruction *I) {
3745     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3746            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3747   }
3748 
3749   static inline Instruction *getEmptyKey() {
3750     return DenseMapInfo<Instruction *>::getEmptyKey();
3751   }
3752 
3753   static inline Instruction *getTombstoneKey() {
3754     return DenseMapInfo<Instruction *>::getTombstoneKey();
3755   }
3756 
3757   static unsigned getHashValue(const Instruction *I) {
3758     assert(canHandle(I) && "Unknown instruction!");
3759     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3760                                                            I->value_op_end()));
3761   }
3762 
3763   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3764     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3765         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3766       return LHS == RHS;
3767     return LHS->isIdenticalTo(RHS);
3768   }
3769 };
3770 
3771 } // end anonymous namespace
3772 
3773 ///Perform cse of induction variable instructions.
3774 static void cse(BasicBlock *BB) {
3775   // Perform simple cse.
3776   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3777   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3778     if (!CSEDenseMapInfo::canHandle(&In))
3779       continue;
3780 
3781     // Check if we can replace this instruction with any of the
3782     // visited instructions.
3783     if (Instruction *V = CSEMap.lookup(&In)) {
3784       In.replaceAllUsesWith(V);
3785       In.eraseFromParent();
3786       continue;
3787     }
3788 
3789     CSEMap[&In] = &In;
3790   }
3791 }
3792 
3793 InstructionCost
3794 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3795                                               bool &NeedToScalarize) const {
3796   Function *F = CI->getCalledFunction();
3797   Type *ScalarRetTy = CI->getType();
3798   SmallVector<Type *, 4> Tys, ScalarTys;
3799   for (auto &ArgOp : CI->args())
3800     ScalarTys.push_back(ArgOp->getType());
3801 
3802   // Estimate cost of scalarized vector call. The source operands are assumed
3803   // to be vectors, so we need to extract individual elements from there,
3804   // execute VF scalar calls, and then gather the result into the vector return
3805   // value.
3806   InstructionCost ScalarCallCost =
3807       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3808   if (VF.isScalar())
3809     return ScalarCallCost;
3810 
3811   // Compute corresponding vector type for return value and arguments.
3812   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3813   for (Type *ScalarTy : ScalarTys)
3814     Tys.push_back(ToVectorTy(ScalarTy, VF));
3815 
3816   // Compute costs of unpacking argument values for the scalar calls and
3817   // packing the return values to a vector.
3818   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3819 
3820   InstructionCost Cost =
3821       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3822 
3823   // If we can't emit a vector call for this function, then the currently found
3824   // cost is the cost we need to return.
3825   NeedToScalarize = true;
3826   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3827   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3828 
3829   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3830     return Cost;
3831 
3832   // If the corresponding vector cost is cheaper, return its cost.
3833   InstructionCost VectorCallCost =
3834       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3835   if (VectorCallCost < Cost) {
3836     NeedToScalarize = false;
3837     Cost = VectorCallCost;
3838   }
3839   return Cost;
3840 }
3841 
3842 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3843   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3844     return Elt;
3845   return VectorType::get(Elt, VF);
3846 }
3847 
3848 InstructionCost
3849 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3850                                                    ElementCount VF) const {
3851   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3852   assert(ID && "Expected intrinsic call!");
3853   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3854   FastMathFlags FMF;
3855   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3856     FMF = FPMO->getFastMathFlags();
3857 
3858   SmallVector<const Value *> Arguments(CI->args());
3859   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3860   SmallVector<Type *> ParamTys;
3861   std::transform(FTy->param_begin(), FTy->param_end(),
3862                  std::back_inserter(ParamTys),
3863                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3864 
3865   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3866                                     dyn_cast<IntrinsicInst>(CI));
3867   return TTI.getIntrinsicInstrCost(CostAttrs,
3868                                    TargetTransformInfo::TCK_RecipThroughput);
3869 }
3870 
3871 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3872   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3873   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3874   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3875 }
3876 
3877 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3878   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3879   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3880   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3881 }
3882 
3883 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3884   // For every instruction `I` in MinBWs, truncate the operands, create a
3885   // truncated version of `I` and reextend its result. InstCombine runs
3886   // later and will remove any ext/trunc pairs.
3887   SmallPtrSet<Value *, 4> Erased;
3888   for (const auto &KV : Cost->getMinimalBitwidths()) {
3889     // If the value wasn't vectorized, we must maintain the original scalar
3890     // type. The absence of the value from State indicates that it
3891     // wasn't vectorized.
3892     // FIXME: Should not rely on getVPValue at this point.
3893     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3894     if (!State.hasAnyVectorValue(Def))
3895       continue;
3896     for (unsigned Part = 0; Part < UF; ++Part) {
3897       Value *I = State.get(Def, Part);
3898       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3899         continue;
3900       Type *OriginalTy = I->getType();
3901       Type *ScalarTruncatedTy =
3902           IntegerType::get(OriginalTy->getContext(), KV.second);
3903       auto *TruncatedTy = VectorType::get(
3904           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3905       if (TruncatedTy == OriginalTy)
3906         continue;
3907 
3908       IRBuilder<> B(cast<Instruction>(I));
3909       auto ShrinkOperand = [&](Value *V) -> Value * {
3910         if (auto *ZI = dyn_cast<ZExtInst>(V))
3911           if (ZI->getSrcTy() == TruncatedTy)
3912             return ZI->getOperand(0);
3913         return B.CreateZExtOrTrunc(V, TruncatedTy);
3914       };
3915 
3916       // The actual instruction modification depends on the instruction type,
3917       // unfortunately.
3918       Value *NewI = nullptr;
3919       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3920         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3921                              ShrinkOperand(BO->getOperand(1)));
3922 
3923         // Any wrapping introduced by shrinking this operation shouldn't be
3924         // considered undefined behavior. So, we can't unconditionally copy
3925         // arithmetic wrapping flags to NewI.
3926         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3927       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3928         NewI =
3929             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3930                          ShrinkOperand(CI->getOperand(1)));
3931       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3932         NewI = B.CreateSelect(SI->getCondition(),
3933                               ShrinkOperand(SI->getTrueValue()),
3934                               ShrinkOperand(SI->getFalseValue()));
3935       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3936         switch (CI->getOpcode()) {
3937         default:
3938           llvm_unreachable("Unhandled cast!");
3939         case Instruction::Trunc:
3940           NewI = ShrinkOperand(CI->getOperand(0));
3941           break;
3942         case Instruction::SExt:
3943           NewI = B.CreateSExtOrTrunc(
3944               CI->getOperand(0),
3945               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3946           break;
3947         case Instruction::ZExt:
3948           NewI = B.CreateZExtOrTrunc(
3949               CI->getOperand(0),
3950               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3951           break;
3952         }
3953       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3954         auto Elements0 =
3955             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3956         auto *O0 = B.CreateZExtOrTrunc(
3957             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3958         auto Elements1 =
3959             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3960         auto *O1 = B.CreateZExtOrTrunc(
3961             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3962 
3963         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3964       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3965         // Don't do anything with the operands, just extend the result.
3966         continue;
3967       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3968         auto Elements =
3969             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3970         auto *O0 = B.CreateZExtOrTrunc(
3971             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3972         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3973         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3974       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3975         auto Elements =
3976             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3977         auto *O0 = B.CreateZExtOrTrunc(
3978             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3979         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3980       } else {
3981         // If we don't know what to do, be conservative and don't do anything.
3982         continue;
3983       }
3984 
3985       // Lastly, extend the result.
3986       NewI->takeName(cast<Instruction>(I));
3987       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3988       I->replaceAllUsesWith(Res);
3989       cast<Instruction>(I)->eraseFromParent();
3990       Erased.insert(I);
3991       State.reset(Def, Res, Part);
3992     }
3993   }
3994 
3995   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3996   for (const auto &KV : Cost->getMinimalBitwidths()) {
3997     // If the value wasn't vectorized, we must maintain the original scalar
3998     // type. The absence of the value from State indicates that it
3999     // wasn't vectorized.
4000     // FIXME: Should not rely on getVPValue at this point.
4001     VPValue *Def = State.Plan->getVPValue(KV.first, true);
4002     if (!State.hasAnyVectorValue(Def))
4003       continue;
4004     for (unsigned Part = 0; Part < UF; ++Part) {
4005       Value *I = State.get(Def, Part);
4006       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4007       if (Inst && Inst->use_empty()) {
4008         Value *NewI = Inst->getOperand(0);
4009         Inst->eraseFromParent();
4010         State.reset(Def, NewI, Part);
4011       }
4012     }
4013   }
4014 }
4015 
4016 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4017   // Insert truncates and extends for any truncated instructions as hints to
4018   // InstCombine.
4019   if (VF.isVector())
4020     truncateToMinimalBitwidths(State);
4021 
4022   // Fix widened non-induction PHIs by setting up the PHI operands.
4023   if (OrigPHIsToFix.size()) {
4024     assert(EnableVPlanNativePath &&
4025            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4026     fixNonInductionPHIs(State);
4027   }
4028 
4029   // At this point every instruction in the original loop is widened to a
4030   // vector form. Now we need to fix the recurrences in the loop. These PHI
4031   // nodes are currently empty because we did not want to introduce cycles.
4032   // This is the second stage of vectorizing recurrences.
4033   fixCrossIterationPHIs(State);
4034 
4035   // Forget the original basic block.
4036   PSE.getSE()->forgetLoop(OrigLoop);
4037 
4038   // If we inserted an edge from the middle block to the unique exit block,
4039   // update uses outside the loop (phis) to account for the newly inserted
4040   // edge.
4041   if (!Cost->requiresScalarEpilogue(VF)) {
4042     // Fix-up external users of the induction variables.
4043     for (auto &Entry : Legal->getInductionVars())
4044       fixupIVUsers(Entry.first, Entry.second,
4045                    getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4046                    IVEndValues[Entry.first], LoopMiddleBlock);
4047 
4048     fixLCSSAPHIs(State);
4049   }
4050 
4051   for (Instruction *PI : PredicatedInstructions)
4052     sinkScalarOperands(&*PI);
4053 
4054   // Remove redundant induction instructions.
4055   cse(LoopVectorBody);
4056 
4057   // Set/update profile weights for the vector and remainder loops as original
4058   // loop iterations are now distributed among them. Note that original loop
4059   // represented by LoopScalarBody becomes remainder loop after vectorization.
4060   //
4061   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
4062   // end up getting slightly roughened result but that should be OK since
4063   // profile is not inherently precise anyway. Note also possible bypass of
4064   // vector code caused by legality checks is ignored, assigning all the weight
4065   // to the vector loop, optimistically.
4066   //
4067   // For scalable vectorization we can't know at compile time how many iterations
4068   // of the loop are handled in one vector iteration, so instead assume a pessimistic
4069   // vscale of '1'.
4070   setProfileInfoAfterUnrolling(
4071       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4072       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4073 }
4074 
4075 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4076   // In order to support recurrences we need to be able to vectorize Phi nodes.
4077   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4078   // stage #2: We now need to fix the recurrences by adding incoming edges to
4079   // the currently empty PHI nodes. At this point every instruction in the
4080   // original loop is widened to a vector form so we can use them to construct
4081   // the incoming edges.
4082   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4083   for (VPRecipeBase &R : Header->phis()) {
4084     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4085       fixReduction(ReductionPhi, State);
4086     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4087       fixFirstOrderRecurrence(FOR, State);
4088   }
4089 }
4090 
4091 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4092                                                   VPTransformState &State) {
4093   // This is the second phase of vectorizing first-order recurrences. An
4094   // overview of the transformation is described below. Suppose we have the
4095   // following loop.
4096   //
4097   //   for (int i = 0; i < n; ++i)
4098   //     b[i] = a[i] - a[i - 1];
4099   //
4100   // There is a first-order recurrence on "a". For this loop, the shorthand
4101   // scalar IR looks like:
4102   //
4103   //   scalar.ph:
4104   //     s_init = a[-1]
4105   //     br scalar.body
4106   //
4107   //   scalar.body:
4108   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4109   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4110   //     s2 = a[i]
4111   //     b[i] = s2 - s1
4112   //     br cond, scalar.body, ...
4113   //
4114   // In this example, s1 is a recurrence because it's value depends on the
4115   // previous iteration. In the first phase of vectorization, we created a
4116   // vector phi v1 for s1. We now complete the vectorization and produce the
4117   // shorthand vector IR shown below (for VF = 4, UF = 1).
4118   //
4119   //   vector.ph:
4120   //     v_init = vector(..., ..., ..., a[-1])
4121   //     br vector.body
4122   //
4123   //   vector.body
4124   //     i = phi [0, vector.ph], [i+4, vector.body]
4125   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4126   //     v2 = a[i, i+1, i+2, i+3];
4127   //     v3 = vector(v1(3), v2(0, 1, 2))
4128   //     b[i, i+1, i+2, i+3] = v2 - v3
4129   //     br cond, vector.body, middle.block
4130   //
4131   //   middle.block:
4132   //     x = v2(3)
4133   //     br scalar.ph
4134   //
4135   //   scalar.ph:
4136   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4137   //     br scalar.body
4138   //
4139   // After execution completes the vector loop, we extract the next value of
4140   // the recurrence (x) to use as the initial value in the scalar loop.
4141 
4142   // Extract the last vector element in the middle block. This will be the
4143   // initial value for the recurrence when jumping to the scalar loop.
4144   VPValue *PreviousDef = PhiR->getBackedgeValue();
4145   Value *Incoming = State.get(PreviousDef, UF - 1);
4146   auto *ExtractForScalar = Incoming;
4147   auto *IdxTy = Builder.getInt32Ty();
4148   if (VF.isVector()) {
4149     auto *One = ConstantInt::get(IdxTy, 1);
4150     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4151     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4152     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4153     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4154                                                     "vector.recur.extract");
4155   }
4156   // Extract the second last element in the middle block if the
4157   // Phi is used outside the loop. We need to extract the phi itself
4158   // and not the last element (the phi update in the current iteration). This
4159   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4160   // when the scalar loop is not run at all.
4161   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4162   if (VF.isVector()) {
4163     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4164     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4165     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4166         Incoming, Idx, "vector.recur.extract.for.phi");
4167   } else if (UF > 1)
4168     // When loop is unrolled without vectorizing, initialize
4169     // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
4170     // of `Incoming`. This is analogous to the vectorized case above: extracting
4171     // the second last element when VF > 1.
4172     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4173 
4174   // Fix the initial value of the original recurrence in the scalar loop.
4175   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4176   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4177   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4178   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4179   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4180     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4181     Start->addIncoming(Incoming, BB);
4182   }
4183 
4184   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4185   Phi->setName("scalar.recur");
4186 
4187   // Finally, fix users of the recurrence outside the loop. The users will need
4188   // either the last value of the scalar recurrence or the last value of the
4189   // vector recurrence we extracted in the middle block. Since the loop is in
4190   // LCSSA form, we just need to find all the phi nodes for the original scalar
4191   // recurrence in the exit block, and then add an edge for the middle block.
4192   // Note that LCSSA does not imply single entry when the original scalar loop
4193   // had multiple exiting edges (as we always run the last iteration in the
4194   // scalar epilogue); in that case, there is no edge from middle to exit and
4195   // and thus no phis which needed updated.
4196   if (!Cost->requiresScalarEpilogue(VF))
4197     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4198       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4199         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4200 }
4201 
4202 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4203                                        VPTransformState &State) {
4204   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4205   // Get it's reduction variable descriptor.
4206   assert(Legal->isReductionVariable(OrigPhi) &&
4207          "Unable to find the reduction variable");
4208   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4209 
4210   RecurKind RK = RdxDesc.getRecurrenceKind();
4211   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4212   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4213   setDebugLocFromInst(ReductionStartValue);
4214 
4215   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4216   // This is the vector-clone of the value that leaves the loop.
4217   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4218 
4219   // Wrap flags are in general invalid after vectorization, clear them.
4220   clearReductionWrapFlags(RdxDesc, State);
4221 
4222   // Before each round, move the insertion point right between
4223   // the PHIs and the values we are going to write.
4224   // This allows us to write both PHINodes and the extractelement
4225   // instructions.
4226   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4227 
4228   setDebugLocFromInst(LoopExitInst);
4229 
4230   Type *PhiTy = OrigPhi->getType();
4231   // If tail is folded by masking, the vector value to leave the loop should be
4232   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4233   // instead of the former. For an inloop reduction the reduction will already
4234   // be predicated, and does not need to be handled here.
4235   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4236     for (unsigned Part = 0; Part < UF; ++Part) {
4237       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4238       Value *Sel = nullptr;
4239       for (User *U : VecLoopExitInst->users()) {
4240         if (isa<SelectInst>(U)) {
4241           assert(!Sel && "Reduction exit feeding two selects");
4242           Sel = U;
4243         } else
4244           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4245       }
4246       assert(Sel && "Reduction exit feeds no select");
4247       State.reset(LoopExitInstDef, Sel, Part);
4248 
4249       // If the target can create a predicated operator for the reduction at no
4250       // extra cost in the loop (for example a predicated vadd), it can be
4251       // cheaper for the select to remain in the loop than be sunk out of it,
4252       // and so use the select value for the phi instead of the old
4253       // LoopExitValue.
4254       if (PreferPredicatedReductionSelect ||
4255           TTI->preferPredicatedReductionSelect(
4256               RdxDesc.getOpcode(), PhiTy,
4257               TargetTransformInfo::ReductionFlags())) {
4258         auto *VecRdxPhi =
4259             cast<PHINode>(State.get(PhiR, Part));
4260         VecRdxPhi->setIncomingValueForBlock(
4261             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4262       }
4263     }
4264   }
4265 
4266   // If the vector reduction can be performed in a smaller type, we truncate
4267   // then extend the loop exit value to enable InstCombine to evaluate the
4268   // entire expression in the smaller type.
4269   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4270     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4271     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4272     Builder.SetInsertPoint(
4273         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4274     VectorParts RdxParts(UF);
4275     for (unsigned Part = 0; Part < UF; ++Part) {
4276       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4277       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4278       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4279                                         : Builder.CreateZExt(Trunc, VecTy);
4280       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
4281         if (U != Trunc) {
4282           U->replaceUsesOfWith(RdxParts[Part], Extnd);
4283           RdxParts[Part] = Extnd;
4284         }
4285     }
4286     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4287     for (unsigned Part = 0; Part < UF; ++Part) {
4288       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4289       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4290     }
4291   }
4292 
4293   // Reduce all of the unrolled parts into a single vector.
4294   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4295   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4296 
4297   // The middle block terminator has already been assigned a DebugLoc here (the
4298   // OrigLoop's single latch terminator). We want the whole middle block to
4299   // appear to execute on this line because: (a) it is all compiler generated,
4300   // (b) these instructions are always executed after evaluating the latch
4301   // conditional branch, and (c) other passes may add new predecessors which
4302   // terminate on this line. This is the easiest way to ensure we don't
4303   // accidentally cause an extra step back into the loop while debugging.
4304   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4305   if (PhiR->isOrdered())
4306     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4307   else {
4308     // Floating-point operations should have some FMF to enable the reduction.
4309     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4310     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4311     for (unsigned Part = 1; Part < UF; ++Part) {
4312       Value *RdxPart = State.get(LoopExitInstDef, Part);
4313       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4314         ReducedPartRdx = Builder.CreateBinOp(
4315             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4316       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4317         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4318                                            ReducedPartRdx, RdxPart);
4319       else
4320         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4321     }
4322   }
4323 
4324   // Create the reduction after the loop. Note that inloop reductions create the
4325   // target reduction in the loop using a Reduction recipe.
4326   if (VF.isVector() && !PhiR->isInLoop()) {
4327     ReducedPartRdx =
4328         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4329     // If the reduction can be performed in a smaller type, we need to extend
4330     // the reduction to the wider type before we branch to the original loop.
4331     if (PhiTy != RdxDesc.getRecurrenceType())
4332       ReducedPartRdx = RdxDesc.isSigned()
4333                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4334                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4335   }
4336 
4337   // Create a phi node that merges control-flow from the backedge-taken check
4338   // block and the middle block.
4339   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4340                                         LoopScalarPreHeader->getTerminator());
4341   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4342     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4343   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4344 
4345   // Now, we need to fix the users of the reduction variable
4346   // inside and outside of the scalar remainder loop.
4347 
4348   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4349   // in the exit blocks.  See comment on analogous loop in
4350   // fixFirstOrderRecurrence for a more complete explaination of the logic.
4351   if (!Cost->requiresScalarEpilogue(VF))
4352     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4353       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4354         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4355 
4356   // Fix the scalar loop reduction variable with the incoming reduction sum
4357   // from the vector body and from the backedge value.
4358   int IncomingEdgeBlockIdx =
4359       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4360   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4361   // Pick the other block.
4362   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4363   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4364   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4365 }
4366 
4367 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4368                                                   VPTransformState &State) {
4369   RecurKind RK = RdxDesc.getRecurrenceKind();
4370   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4371     return;
4372 
4373   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4374   assert(LoopExitInstr && "null loop exit instruction");
4375   SmallVector<Instruction *, 8> Worklist;
4376   SmallPtrSet<Instruction *, 8> Visited;
4377   Worklist.push_back(LoopExitInstr);
4378   Visited.insert(LoopExitInstr);
4379 
4380   while (!Worklist.empty()) {
4381     Instruction *Cur = Worklist.pop_back_val();
4382     if (isa<OverflowingBinaryOperator>(Cur))
4383       for (unsigned Part = 0; Part < UF; ++Part) {
4384         // FIXME: Should not rely on getVPValue at this point.
4385         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4386         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4387       }
4388 
4389     for (User *U : Cur->users()) {
4390       Instruction *UI = cast<Instruction>(U);
4391       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4392           Visited.insert(UI).second)
4393         Worklist.push_back(UI);
4394     }
4395   }
4396 }
4397 
4398 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4399   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4400     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4401       // Some phis were already hand updated by the reduction and recurrence
4402       // code above, leave them alone.
4403       continue;
4404 
4405     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4406     // Non-instruction incoming values will have only one value.
4407 
4408     VPLane Lane = VPLane::getFirstLane();
4409     if (isa<Instruction>(IncomingValue) &&
4410         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4411                                            VF))
4412       Lane = VPLane::getLastLaneForVF(VF);
4413 
4414     // Can be a loop invariant incoming value or the last scalar value to be
4415     // extracted from the vectorized loop.
4416     // FIXME: Should not rely on getVPValue at this point.
4417     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4418     Value *lastIncomingValue =
4419         OrigLoop->isLoopInvariant(IncomingValue)
4420             ? IncomingValue
4421             : State.get(State.Plan->getVPValue(IncomingValue, true),
4422                         VPIteration(UF - 1, Lane));
4423     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4424   }
4425 }
4426 
4427 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4428   // The basic block and loop containing the predicated instruction.
4429   auto *PredBB = PredInst->getParent();
4430   auto *VectorLoop = LI->getLoopFor(PredBB);
4431 
4432   // Initialize a worklist with the operands of the predicated instruction.
4433   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4434 
4435   // Holds instructions that we need to analyze again. An instruction may be
4436   // reanalyzed if we don't yet know if we can sink it or not.
4437   SmallVector<Instruction *, 8> InstsToReanalyze;
4438 
4439   // Returns true if a given use occurs in the predicated block. Phi nodes use
4440   // their operands in their corresponding predecessor blocks.
4441   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4442     auto *I = cast<Instruction>(U.getUser());
4443     BasicBlock *BB = I->getParent();
4444     if (auto *Phi = dyn_cast<PHINode>(I))
4445       BB = Phi->getIncomingBlock(
4446           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4447     return BB == PredBB;
4448   };
4449 
4450   // Iteratively sink the scalarized operands of the predicated instruction
4451   // into the block we created for it. When an instruction is sunk, it's
4452   // operands are then added to the worklist. The algorithm ends after one pass
4453   // through the worklist doesn't sink a single instruction.
4454   bool Changed;
4455   do {
4456     // Add the instructions that need to be reanalyzed to the worklist, and
4457     // reset the changed indicator.
4458     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4459     InstsToReanalyze.clear();
4460     Changed = false;
4461 
4462     while (!Worklist.empty()) {
4463       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4464 
4465       // We can't sink an instruction if it is a phi node, is not in the loop,
4466       // or may have side effects.
4467       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4468           I->mayHaveSideEffects())
4469         continue;
4470 
4471       // If the instruction is already in PredBB, check if we can sink its
4472       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4473       // sinking the scalar instruction I, hence it appears in PredBB; but it
4474       // may have failed to sink I's operands (recursively), which we try
4475       // (again) here.
4476       if (I->getParent() == PredBB) {
4477         Worklist.insert(I->op_begin(), I->op_end());
4478         continue;
4479       }
4480 
4481       // It's legal to sink the instruction if all its uses occur in the
4482       // predicated block. Otherwise, there's nothing to do yet, and we may
4483       // need to reanalyze the instruction.
4484       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4485         InstsToReanalyze.push_back(I);
4486         continue;
4487       }
4488 
4489       // Move the instruction to the beginning of the predicated block, and add
4490       // it's operands to the worklist.
4491       I->moveBefore(&*PredBB->getFirstInsertionPt());
4492       Worklist.insert(I->op_begin(), I->op_end());
4493 
4494       // The sinking may have enabled other instructions to be sunk, so we will
4495       // need to iterate.
4496       Changed = true;
4497     }
4498   } while (Changed);
4499 }
4500 
4501 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4502   for (PHINode *OrigPhi : OrigPHIsToFix) {
4503     VPWidenPHIRecipe *VPPhi =
4504         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4505     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4506     // Make sure the builder has a valid insert point.
4507     Builder.SetInsertPoint(NewPhi);
4508     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4509       VPValue *Inc = VPPhi->getIncomingValue(i);
4510       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4511       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4512     }
4513   }
4514 }
4515 
4516 bool InnerLoopVectorizer::useOrderedReductions(
4517     const RecurrenceDescriptor &RdxDesc) {
4518   return Cost->useOrderedReductions(RdxDesc);
4519 }
4520 
4521 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4522                                               VPWidenPHIRecipe *PhiR,
4523                                               VPTransformState &State) {
4524   PHINode *P = cast<PHINode>(PN);
4525   if (EnableVPlanNativePath) {
4526     // Currently we enter here in the VPlan-native path for non-induction
4527     // PHIs where all control flow is uniform. We simply widen these PHIs.
4528     // Create a vector phi with no operands - the vector phi operands will be
4529     // set at the end of vector code generation.
4530     Type *VecTy = (State.VF.isScalar())
4531                       ? PN->getType()
4532                       : VectorType::get(PN->getType(), State.VF);
4533     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4534     State.set(PhiR, VecPhi, 0);
4535     OrigPHIsToFix.push_back(P);
4536 
4537     return;
4538   }
4539 
4540   assert(PN->getParent() == OrigLoop->getHeader() &&
4541          "Non-header phis should have been handled elsewhere");
4542 
4543   // In order to support recurrences we need to be able to vectorize Phi nodes.
4544   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4545   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4546   // this value when we vectorize all of the instructions that use the PHI.
4547 
4548   assert(!Legal->isReductionVariable(P) &&
4549          "reductions should be handled elsewhere");
4550 
4551   setDebugLocFromInst(P);
4552 
4553   // This PHINode must be an induction variable.
4554   // Make sure that we know about it.
4555   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4556 
4557   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4558   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4559 
4560   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4561   // which can be found from the original scalar operations.
4562   switch (II.getKind()) {
4563   case InductionDescriptor::IK_NoInduction:
4564     llvm_unreachable("Unknown induction");
4565   case InductionDescriptor::IK_IntInduction:
4566   case InductionDescriptor::IK_FpInduction:
4567     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4568   case InductionDescriptor::IK_PtrInduction: {
4569     // Handle the pointer induction variable case.
4570     assert(P->getType()->isPointerTy() && "Unexpected type.");
4571 
4572     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4573       // This is the normalized GEP that starts counting at zero.
4574       Value *PtrInd =
4575           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4576       // Determine the number of scalars we need to generate for each unroll
4577       // iteration. If the instruction is uniform, we only need to generate the
4578       // first lane. Otherwise, we generate all VF values.
4579       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4580       assert((IsUniform || !State.VF.isScalable()) &&
4581              "Cannot scalarize a scalable VF");
4582       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4583 
4584       for (unsigned Part = 0; Part < UF; ++Part) {
4585         Value *PartStart =
4586             createStepForVF(Builder, PtrInd->getType(), VF, Part);
4587 
4588         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4589           Value *Idx = Builder.CreateAdd(
4590               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4591           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4592           Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
4593                                                 DL, II, State.CFG.PrevBB);
4594           SclrGep->setName("next.gep");
4595           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4596         }
4597       }
4598       return;
4599     }
4600     assert(isa<SCEVConstant>(II.getStep()) &&
4601            "Induction step not a SCEV constant!");
4602     Type *PhiType = II.getStep()->getType();
4603 
4604     // Build a pointer phi
4605     Value *ScalarStartValue = II.getStartValue();
4606     Type *ScStValueType = ScalarStartValue->getType();
4607     PHINode *NewPointerPhi =
4608         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4609     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4610 
4611     // A pointer induction, performed by using a gep
4612     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4613     Instruction *InductionLoc = LoopLatch->getTerminator();
4614     const SCEV *ScalarStep = II.getStep();
4615     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4616     Value *ScalarStepValue =
4617         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4618     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4619     Value *NumUnrolledElems =
4620         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4621     Value *InductionGEP = GetElementPtrInst::Create(
4622         II.getElementType(), NewPointerPhi,
4623         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4624         InductionLoc);
4625     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4626 
4627     // Create UF many actual address geps that use the pointer
4628     // phi as base and a vectorized version of the step value
4629     // (<step*0, ..., step*N>) as offset.
4630     for (unsigned Part = 0; Part < State.UF; ++Part) {
4631       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4632       Value *StartOffsetScalar =
4633           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4634       Value *StartOffset =
4635           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4636       // Create a vector of consecutive numbers from zero to VF.
4637       StartOffset =
4638           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4639 
4640       Value *GEP = Builder.CreateGEP(
4641           II.getElementType(), NewPointerPhi,
4642           Builder.CreateMul(
4643               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4644               "vector.gep"));
4645       State.set(PhiR, GEP, Part);
4646     }
4647   }
4648   }
4649 }
4650 
4651 /// A helper function for checking whether an integer division-related
4652 /// instruction may divide by zero (in which case it must be predicated if
4653 /// executed conditionally in the scalar code).
4654 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4655 /// Non-zero divisors that are non compile-time constants will not be
4656 /// converted into multiplication, so we will still end up scalarizing
4657 /// the division, but can do so w/o predication.
4658 static bool mayDivideByZero(Instruction &I) {
4659   assert((I.getOpcode() == Instruction::UDiv ||
4660           I.getOpcode() == Instruction::SDiv ||
4661           I.getOpcode() == Instruction::URem ||
4662           I.getOpcode() == Instruction::SRem) &&
4663          "Unexpected instruction");
4664   Value *Divisor = I.getOperand(1);
4665   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4666   return !CInt || CInt->isZero();
4667 }
4668 
4669 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4670                                                VPUser &ArgOperands,
4671                                                VPTransformState &State) {
4672   assert(!isa<DbgInfoIntrinsic>(I) &&
4673          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4674   setDebugLocFromInst(&I);
4675 
4676   Module *M = I.getParent()->getParent()->getParent();
4677   auto *CI = cast<CallInst>(&I);
4678 
4679   SmallVector<Type *, 4> Tys;
4680   for (Value *ArgOperand : CI->args())
4681     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4682 
4683   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4684 
4685   // The flag shows whether we use Intrinsic or a usual Call for vectorized
4686   // version of the instruction.
4687   // Is it beneficial to perform intrinsic call compared to lib call?
4688   bool NeedToScalarize = false;
4689   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4690   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4691   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4692   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4693          "Instruction should be scalarized elsewhere.");
4694   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4695          "Either the intrinsic cost or vector call cost must be valid");
4696 
4697   for (unsigned Part = 0; Part < UF; ++Part) {
4698     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4699     SmallVector<Value *, 4> Args;
4700     for (auto &I : enumerate(ArgOperands.operands())) {
4701       // Some intrinsics have a scalar argument - don't replace it with a
4702       // vector.
4703       Value *Arg;
4704       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4705         Arg = State.get(I.value(), Part);
4706       else {
4707         Arg = State.get(I.value(), VPIteration(0, 0));
4708         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4709           TysForDecl.push_back(Arg->getType());
4710       }
4711       Args.push_back(Arg);
4712     }
4713 
4714     Function *VectorF;
4715     if (UseVectorIntrinsic) {
4716       // Use vector version of the intrinsic.
4717       if (VF.isVector())
4718         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4719       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4720       assert(VectorF && "Can't retrieve vector intrinsic.");
4721     } else {
4722       // Use vector version of the function call.
4723       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4724 #ifndef NDEBUG
4725       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4726              "Can't create vector function.");
4727 #endif
4728         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4729     }
4730       SmallVector<OperandBundleDef, 1> OpBundles;
4731       CI->getOperandBundlesAsDefs(OpBundles);
4732       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4733 
4734       if (isa<FPMathOperator>(V))
4735         V->copyFastMathFlags(CI);
4736 
4737       State.set(Def, V, Part);
4738       addMetadata(V, &I);
4739   }
4740 }
4741 
4742 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4743   // We should not collect Scalars more than once per VF. Right now, this
4744   // function is called from collectUniformsAndScalars(), which already does
4745   // this check. Collecting Scalars for VF=1 does not make any sense.
4746   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4747          "This function should not be visited twice for the same VF");
4748 
4749   SmallSetVector<Instruction *, 8> Worklist;
4750 
4751   // These sets are used to seed the analysis with pointers used by memory
4752   // accesses that will remain scalar.
4753   SmallSetVector<Instruction *, 8> ScalarPtrs;
4754   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4755   auto *Latch = TheLoop->getLoopLatch();
4756 
4757   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4758   // The pointer operands of loads and stores will be scalar as long as the
4759   // memory access is not a gather or scatter operation. The value operand of a
4760   // store will remain scalar if the store is scalarized.
4761   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4762     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4763     assert(WideningDecision != CM_Unknown &&
4764            "Widening decision should be ready at this moment");
4765     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4766       if (Ptr == Store->getValueOperand())
4767         return WideningDecision == CM_Scalarize;
4768     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4769            "Ptr is neither a value or pointer operand");
4770     return WideningDecision != CM_GatherScatter;
4771   };
4772 
4773   // A helper that returns true if the given value is a bitcast or
4774   // getelementptr instruction contained in the loop.
4775   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4776     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4777             isa<GetElementPtrInst>(V)) &&
4778            !TheLoop->isLoopInvariant(V);
4779   };
4780 
4781   // A helper that evaluates a memory access's use of a pointer. If the use will
4782   // be a scalar use and the pointer is only used by memory accesses, we place
4783   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4784   // PossibleNonScalarPtrs.
4785   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4786     // We only care about bitcast and getelementptr instructions contained in
4787     // the loop.
4788     if (!isLoopVaryingBitCastOrGEP(Ptr))
4789       return;
4790 
4791     // If the pointer has already been identified as scalar (e.g., if it was
4792     // also identified as uniform), there's nothing to do.
4793     auto *I = cast<Instruction>(Ptr);
4794     if (Worklist.count(I))
4795       return;
4796 
4797     // If the use of the pointer will be a scalar use, and all users of the
4798     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4799     // place the pointer in PossibleNonScalarPtrs.
4800     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4801           return isa<LoadInst>(U) || isa<StoreInst>(U);
4802         }))
4803       ScalarPtrs.insert(I);
4804     else
4805       PossibleNonScalarPtrs.insert(I);
4806   };
4807 
4808   // We seed the scalars analysis with three classes of instructions: (1)
4809   // instructions marked uniform-after-vectorization and (2) bitcast,
4810   // getelementptr and (pointer) phi instructions used by memory accesses
4811   // requiring a scalar use.
4812   //
4813   // (1) Add to the worklist all instructions that have been identified as
4814   // uniform-after-vectorization.
4815   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4816 
4817   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4818   // memory accesses requiring a scalar use. The pointer operands of loads and
4819   // stores will be scalar as long as the memory accesses is not a gather or
4820   // scatter operation. The value operand of a store will remain scalar if the
4821   // store is scalarized.
4822   for (auto *BB : TheLoop->blocks())
4823     for (auto &I : *BB) {
4824       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4825         evaluatePtrUse(Load, Load->getPointerOperand());
4826       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4827         evaluatePtrUse(Store, Store->getPointerOperand());
4828         evaluatePtrUse(Store, Store->getValueOperand());
4829       }
4830     }
4831   for (auto *I : ScalarPtrs)
4832     if (!PossibleNonScalarPtrs.count(I)) {
4833       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4834       Worklist.insert(I);
4835     }
4836 
4837   // Insert the forced scalars.
4838   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4839   // induction variable when the PHI user is scalarized.
4840   auto ForcedScalar = ForcedScalars.find(VF);
4841   if (ForcedScalar != ForcedScalars.end())
4842     for (auto *I : ForcedScalar->second)
4843       Worklist.insert(I);
4844 
4845   // Expand the worklist by looking through any bitcasts and getelementptr
4846   // instructions we've already identified as scalar. This is similar to the
4847   // expansion step in collectLoopUniforms(); however, here we're only
4848   // expanding to include additional bitcasts and getelementptr instructions.
4849   unsigned Idx = 0;
4850   while (Idx != Worklist.size()) {
4851     Instruction *Dst = Worklist[Idx++];
4852     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4853       continue;
4854     auto *Src = cast<Instruction>(Dst->getOperand(0));
4855     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4856           auto *J = cast<Instruction>(U);
4857           return !TheLoop->contains(J) || Worklist.count(J) ||
4858                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4859                   isScalarUse(J, Src));
4860         })) {
4861       Worklist.insert(Src);
4862       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4863     }
4864   }
4865 
4866   // An induction variable will remain scalar if all users of the induction
4867   // variable and induction variable update remain scalar.
4868   for (auto &Induction : Legal->getInductionVars()) {
4869     auto *Ind = Induction.first;
4870     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4871 
4872     // If tail-folding is applied, the primary induction variable will be used
4873     // to feed a vector compare.
4874     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4875       continue;
4876 
4877     // Returns true if \p Indvar is a pointer induction that is used directly by
4878     // load/store instruction \p I.
4879     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4880                                               Instruction *I) {
4881       return Induction.second.getKind() ==
4882                  InductionDescriptor::IK_PtrInduction &&
4883              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4884              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4885     };
4886 
4887     // Determine if all users of the induction variable are scalar after
4888     // vectorization.
4889     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4890       auto *I = cast<Instruction>(U);
4891       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4892              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4893     });
4894     if (!ScalarInd)
4895       continue;
4896 
4897     // Determine if all users of the induction variable update instruction are
4898     // scalar after vectorization.
4899     auto ScalarIndUpdate =
4900         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4901           auto *I = cast<Instruction>(U);
4902           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4903                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4904         });
4905     if (!ScalarIndUpdate)
4906       continue;
4907 
4908     // The induction variable and its update instruction will remain scalar.
4909     Worklist.insert(Ind);
4910     Worklist.insert(IndUpdate);
4911     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4912     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4913                       << "\n");
4914   }
4915 
4916   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4917 }
4918 
4919 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
4920   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4921     return false;
4922   switch(I->getOpcode()) {
4923   default:
4924     break;
4925   case Instruction::Load:
4926   case Instruction::Store: {
4927     if (!Legal->isMaskRequired(I))
4928       return false;
4929     auto *Ptr = getLoadStorePointerOperand(I);
4930     auto *Ty = getLoadStoreType(I);
4931     const Align Alignment = getLoadStoreAlignment(I);
4932     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4933                                 TTI.isLegalMaskedGather(Ty, Alignment))
4934                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4935                                 TTI.isLegalMaskedScatter(Ty, Alignment));
4936   }
4937   case Instruction::UDiv:
4938   case Instruction::SDiv:
4939   case Instruction::SRem:
4940   case Instruction::URem:
4941     return mayDivideByZero(*I);
4942   }
4943   return false;
4944 }
4945 
4946 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4947     Instruction *I, ElementCount VF) {
4948   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4949   assert(getWideningDecision(I, VF) == CM_Unknown &&
4950          "Decision should not be set yet.");
4951   auto *Group = getInterleavedAccessGroup(I);
4952   assert(Group && "Must have a group.");
4953 
4954   // If the instruction's allocated size doesn't equal it's type size, it
4955   // requires padding and will be scalarized.
4956   auto &DL = I->getModule()->getDataLayout();
4957   auto *ScalarTy = getLoadStoreType(I);
4958   if (hasIrregularType(ScalarTy, DL))
4959     return false;
4960 
4961   // Check if masking is required.
4962   // A Group may need masking for one of two reasons: it resides in a block that
4963   // needs predication, or it was decided to use masking to deal with gaps
4964   // (either a gap at the end of a load-access that may result in a speculative
4965   // load, or any gaps in a store-access).
4966   bool PredicatedAccessRequiresMasking =
4967       blockNeedsPredicationForAnyReason(I->getParent()) &&
4968       Legal->isMaskRequired(I);
4969   bool LoadAccessWithGapsRequiresEpilogMasking =
4970       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4971       !isScalarEpilogueAllowed();
4972   bool StoreAccessWithGapsRequiresMasking =
4973       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4974   if (!PredicatedAccessRequiresMasking &&
4975       !LoadAccessWithGapsRequiresEpilogMasking &&
4976       !StoreAccessWithGapsRequiresMasking)
4977     return true;
4978 
4979   // If masked interleaving is required, we expect that the user/target had
4980   // enabled it, because otherwise it either wouldn't have been created or
4981   // it should have been invalidated by the CostModel.
4982   assert(useMaskedInterleavedAccesses(TTI) &&
4983          "Masked interleave-groups for predicated accesses are not enabled.");
4984 
4985   if (Group->isReverse())
4986     return false;
4987 
4988   auto *Ty = getLoadStoreType(I);
4989   const Align Alignment = getLoadStoreAlignment(I);
4990   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4991                           : TTI.isLegalMaskedStore(Ty, Alignment);
4992 }
4993 
4994 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4995     Instruction *I, ElementCount VF) {
4996   // Get and ensure we have a valid memory instruction.
4997   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4998 
4999   auto *Ptr = getLoadStorePointerOperand(I);
5000   auto *ScalarTy = getLoadStoreType(I);
5001 
5002   // In order to be widened, the pointer should be consecutive, first of all.
5003   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
5004     return false;
5005 
5006   // If the instruction is a store located in a predicated block, it will be
5007   // scalarized.
5008   if (isScalarWithPredication(I))
5009     return false;
5010 
5011   // If the instruction's allocated size doesn't equal it's type size, it
5012   // requires padding and will be scalarized.
5013   auto &DL = I->getModule()->getDataLayout();
5014   if (hasIrregularType(ScalarTy, DL))
5015     return false;
5016 
5017   return true;
5018 }
5019 
5020 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5021   // We should not collect Uniforms more than once per VF. Right now,
5022   // this function is called from collectUniformsAndScalars(), which
5023   // already does this check. Collecting Uniforms for VF=1 does not make any
5024   // sense.
5025 
5026   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5027          "This function should not be visited twice for the same VF");
5028 
5029   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
5030   // not analyze again.  Uniforms.count(VF) will return 1.
5031   Uniforms[VF].clear();
5032 
5033   // We now know that the loop is vectorizable!
5034   // Collect instructions inside the loop that will remain uniform after
5035   // vectorization.
5036 
5037   // Global values, params and instructions outside of current loop are out of
5038   // scope.
5039   auto isOutOfScope = [&](Value *V) -> bool {
5040     Instruction *I = dyn_cast<Instruction>(V);
5041     return (!I || !TheLoop->contains(I));
5042   };
5043 
5044   // Worklist containing uniform instructions demanding lane 0.
5045   SetVector<Instruction *> Worklist;
5046   BasicBlock *Latch = TheLoop->getLoopLatch();
5047 
5048   // Add uniform instructions demanding lane 0 to the worklist. Instructions
5049   // that are scalar with predication must not be considered uniform after
5050   // vectorization, because that would create an erroneous replicating region
5051   // where only a single instance out of VF should be formed.
5052   // TODO: optimize such seldom cases if found important, see PR40816.
5053   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5054     if (isOutOfScope(I)) {
5055       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5056                         << *I << "\n");
5057       return;
5058     }
5059     if (isScalarWithPredication(I)) {
5060       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5061                         << *I << "\n");
5062       return;
5063     }
5064     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5065     Worklist.insert(I);
5066   };
5067 
5068   // Start with the conditional branch. If the branch condition is an
5069   // instruction contained in the loop that is only used by the branch, it is
5070   // uniform.
5071   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5072   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5073     addToWorklistIfAllowed(Cmp);
5074 
5075   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5076     InstWidening WideningDecision = getWideningDecision(I, VF);
5077     assert(WideningDecision != CM_Unknown &&
5078            "Widening decision should be ready at this moment");
5079 
5080     // A uniform memory op is itself uniform.  We exclude uniform stores
5081     // here as they demand the last lane, not the first one.
5082     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5083       assert(WideningDecision == CM_Scalarize);
5084       return true;
5085     }
5086 
5087     return (WideningDecision == CM_Widen ||
5088             WideningDecision == CM_Widen_Reverse ||
5089             WideningDecision == CM_Interleave);
5090   };
5091 
5092 
5093   // Returns true if Ptr is the pointer operand of a memory access instruction
5094   // I, and I is known to not require scalarization.
5095   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5096     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5097   };
5098 
5099   // Holds a list of values which are known to have at least one uniform use.
5100   // Note that there may be other uses which aren't uniform.  A "uniform use"
5101   // here is something which only demands lane 0 of the unrolled iterations;
5102   // it does not imply that all lanes produce the same value (e.g. this is not
5103   // the usual meaning of uniform)
5104   SetVector<Value *> HasUniformUse;
5105 
5106   // Scan the loop for instructions which are either a) known to have only
5107   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5108   for (auto *BB : TheLoop->blocks())
5109     for (auto &I : *BB) {
5110       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5111         switch (II->getIntrinsicID()) {
5112         case Intrinsic::sideeffect:
5113         case Intrinsic::experimental_noalias_scope_decl:
5114         case Intrinsic::assume:
5115         case Intrinsic::lifetime_start:
5116         case Intrinsic::lifetime_end:
5117           if (TheLoop->hasLoopInvariantOperands(&I))
5118             addToWorklistIfAllowed(&I);
5119           break;
5120         default:
5121           break;
5122         }
5123       }
5124 
5125       // ExtractValue instructions must be uniform, because the operands are
5126       // known to be loop-invariant.
5127       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5128         assert(isOutOfScope(EVI->getAggregateOperand()) &&
5129                "Expected aggregate value to be loop invariant");
5130         addToWorklistIfAllowed(EVI);
5131         continue;
5132       }
5133 
5134       // If there's no pointer operand, there's nothing to do.
5135       auto *Ptr = getLoadStorePointerOperand(&I);
5136       if (!Ptr)
5137         continue;
5138 
5139       // A uniform memory op is itself uniform.  We exclude uniform stores
5140       // here as they demand the last lane, not the first one.
5141       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5142         addToWorklistIfAllowed(&I);
5143 
5144       if (isUniformDecision(&I, VF)) {
5145         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5146         HasUniformUse.insert(Ptr);
5147       }
5148     }
5149 
5150   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5151   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5152   // disallows uses outside the loop as well.
5153   for (auto *V : HasUniformUse) {
5154     if (isOutOfScope(V))
5155       continue;
5156     auto *I = cast<Instruction>(V);
5157     auto UsersAreMemAccesses =
5158       llvm::all_of(I->users(), [&](User *U) -> bool {
5159         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5160       });
5161     if (UsersAreMemAccesses)
5162       addToWorklistIfAllowed(I);
5163   }
5164 
5165   // Expand Worklist in topological order: whenever a new instruction
5166   // is added , its users should be already inside Worklist.  It ensures
5167   // a uniform instruction will only be used by uniform instructions.
5168   unsigned idx = 0;
5169   while (idx != Worklist.size()) {
5170     Instruction *I = Worklist[idx++];
5171 
5172     for (auto OV : I->operand_values()) {
5173       // isOutOfScope operands cannot be uniform instructions.
5174       if (isOutOfScope(OV))
5175         continue;
5176       // First order recurrence Phi's should typically be considered
5177       // non-uniform.
5178       auto *OP = dyn_cast<PHINode>(OV);
5179       if (OP && Legal->isFirstOrderRecurrence(OP))
5180         continue;
5181       // If all the users of the operand are uniform, then add the
5182       // operand into the uniform worklist.
5183       auto *OI = cast<Instruction>(OV);
5184       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5185             auto *J = cast<Instruction>(U);
5186             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5187           }))
5188         addToWorklistIfAllowed(OI);
5189     }
5190   }
5191 
5192   // For an instruction to be added into Worklist above, all its users inside
5193   // the loop should also be in Worklist. However, this condition cannot be
5194   // true for phi nodes that form a cyclic dependence. We must process phi
5195   // nodes separately. An induction variable will remain uniform if all users
5196   // of the induction variable and induction variable update remain uniform.
5197   // The code below handles both pointer and non-pointer induction variables.
5198   for (auto &Induction : Legal->getInductionVars()) {
5199     auto *Ind = Induction.first;
5200     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5201 
5202     // Determine if all users of the induction variable are uniform after
5203     // vectorization.
5204     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5205       auto *I = cast<Instruction>(U);
5206       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5207              isVectorizedMemAccessUse(I, Ind);
5208     });
5209     if (!UniformInd)
5210       continue;
5211 
5212     // Determine if all users of the induction variable update instruction are
5213     // uniform after vectorization.
5214     auto UniformIndUpdate =
5215         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5216           auto *I = cast<Instruction>(U);
5217           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5218                  isVectorizedMemAccessUse(I, IndUpdate);
5219         });
5220     if (!UniformIndUpdate)
5221       continue;
5222 
5223     // The induction variable and its update instruction will remain uniform.
5224     addToWorklistIfAllowed(Ind);
5225     addToWorklistIfAllowed(IndUpdate);
5226   }
5227 
5228   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5229 }
5230 
5231 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5232   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5233 
5234   if (Legal->getRuntimePointerChecking()->Need) {
5235     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5236         "runtime pointer checks needed. Enable vectorization of this "
5237         "loop with '#pragma clang loop vectorize(enable)' when "
5238         "compiling with -Os/-Oz",
5239         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5240     return true;
5241   }
5242 
5243   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5244     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5245         "runtime SCEV checks needed. Enable vectorization of this "
5246         "loop with '#pragma clang loop vectorize(enable)' when "
5247         "compiling with -Os/-Oz",
5248         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5249     return true;
5250   }
5251 
5252   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5253   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5254     reportVectorizationFailure("Runtime stride check for small trip count",
5255         "runtime stride == 1 checks needed. Enable vectorization of "
5256         "this loop without such check by compiling with -Os/-Oz",
5257         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5258     return true;
5259   }
5260 
5261   return false;
5262 }
5263 
5264 ElementCount
5265 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5266   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5267     return ElementCount::getScalable(0);
5268 
5269   if (Hints->isScalableVectorizationDisabled()) {
5270     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5271                             "ScalableVectorizationDisabled", ORE, TheLoop);
5272     return ElementCount::getScalable(0);
5273   }
5274 
5275   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5276 
5277   auto MaxScalableVF = ElementCount::getScalable(
5278       std::numeric_limits<ElementCount::ScalarTy>::max());
5279 
5280   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5281   // FIXME: While for scalable vectors this is currently sufficient, this should
5282   // be replaced by a more detailed mechanism that filters out specific VFs,
5283   // instead of invalidating vectorization for a whole set of VFs based on the
5284   // MaxVF.
5285 
5286   // Disable scalable vectorization if the loop contains unsupported reductions.
5287   if (!canVectorizeReductions(MaxScalableVF)) {
5288     reportVectorizationInfo(
5289         "Scalable vectorization not supported for the reduction "
5290         "operations found in this loop.",
5291         "ScalableVFUnfeasible", ORE, TheLoop);
5292     return ElementCount::getScalable(0);
5293   }
5294 
5295   // Disable scalable vectorization if the loop contains any instructions
5296   // with element types not supported for scalable vectors.
5297   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5298         return !Ty->isVoidTy() &&
5299                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5300       })) {
5301     reportVectorizationInfo("Scalable vectorization is not supported "
5302                             "for all element types found in this loop.",
5303                             "ScalableVFUnfeasible", ORE, TheLoop);
5304     return ElementCount::getScalable(0);
5305   }
5306 
5307   if (Legal->isSafeForAnyVectorWidth())
5308     return MaxScalableVF;
5309 
5310   // Limit MaxScalableVF by the maximum safe dependence distance.
5311   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5312   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5313     MaxVScale =
5314         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5315   MaxScalableVF = ElementCount::getScalable(
5316       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5317   if (!MaxScalableVF)
5318     reportVectorizationInfo(
5319         "Max legal vector width too small, scalable vectorization "
5320         "unfeasible.",
5321         "ScalableVFUnfeasible", ORE, TheLoop);
5322 
5323   return MaxScalableVF;
5324 }
5325 
5326 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5327     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5328   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5329   unsigned SmallestType, WidestType;
5330   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5331 
5332   // Get the maximum safe dependence distance in bits computed by LAA.
5333   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5334   // the memory accesses that is most restrictive (involved in the smallest
5335   // dependence distance).
5336   unsigned MaxSafeElements =
5337       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
5338 
5339   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5340   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5341 
5342   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5343                     << ".\n");
5344   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5345                     << ".\n");
5346 
5347   // First analyze the UserVF, fall back if the UserVF should be ignored.
5348   if (UserVF) {
5349     auto MaxSafeUserVF =
5350         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5351 
5352     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5353       // If `VF=vscale x N` is safe, then so is `VF=N`
5354       if (UserVF.isScalable())
5355         return FixedScalableVFPair(
5356             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5357       else
5358         return UserVF;
5359     }
5360 
5361     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5362 
5363     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5364     // is better to ignore the hint and let the compiler choose a suitable VF.
5365     if (!UserVF.isScalable()) {
5366       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5367                         << " is unsafe, clamping to max safe VF="
5368                         << MaxSafeFixedVF << ".\n");
5369       ORE->emit([&]() {
5370         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5371                                           TheLoop->getStartLoc(),
5372                                           TheLoop->getHeader())
5373                << "User-specified vectorization factor "
5374                << ore::NV("UserVectorizationFactor", UserVF)
5375                << " is unsafe, clamping to maximum safe vectorization factor "
5376                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5377       });
5378       return MaxSafeFixedVF;
5379     }
5380 
5381     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5382       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5383                         << " is ignored because scalable vectors are not "
5384                            "available.\n");
5385       ORE->emit([&]() {
5386         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5387                                           TheLoop->getStartLoc(),
5388                                           TheLoop->getHeader())
5389                << "User-specified vectorization factor "
5390                << ore::NV("UserVectorizationFactor", UserVF)
5391                << " is ignored because the target does not support scalable "
5392                   "vectors. The compiler will pick a more suitable value.";
5393       });
5394     } else {
5395       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5396                         << " is unsafe. Ignoring scalable UserVF.\n");
5397       ORE->emit([&]() {
5398         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5399                                           TheLoop->getStartLoc(),
5400                                           TheLoop->getHeader())
5401                << "User-specified vectorization factor "
5402                << ore::NV("UserVectorizationFactor", UserVF)
5403                << " is unsafe. Ignoring the hint to let the compiler pick a "
5404                   "more suitable value.";
5405       });
5406     }
5407   }
5408 
5409   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5410                     << " / " << WidestType << " bits.\n");
5411 
5412   FixedScalableVFPair Result(ElementCount::getFixed(1),
5413                              ElementCount::getScalable(0));
5414   if (auto MaxVF =
5415           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5416                                   MaxSafeFixedVF, FoldTailByMasking))
5417     Result.FixedVF = MaxVF;
5418 
5419   if (auto MaxVF =
5420           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5421                                   MaxSafeScalableVF, FoldTailByMasking))
5422     if (MaxVF.isScalable()) {
5423       Result.ScalableVF = MaxVF;
5424       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5425                         << "\n");
5426     }
5427 
5428   return Result;
5429 }
5430 
5431 FixedScalableVFPair
5432 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5433   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5434     // TODO: It may by useful to do since it's still likely to be dynamically
5435     // uniform if the target can skip.
5436     reportVectorizationFailure(
5437         "Not inserting runtime ptr check for divergent target",
5438         "runtime pointer checks needed. Not enabled for divergent target",
5439         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5440     return FixedScalableVFPair::getNone();
5441   }
5442 
5443   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5444   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5445   if (TC == 1) {
5446     reportVectorizationFailure("Single iteration (non) loop",
5447         "loop trip count is one, irrelevant for vectorization",
5448         "SingleIterationLoop", ORE, TheLoop);
5449     return FixedScalableVFPair::getNone();
5450   }
5451 
5452   switch (ScalarEpilogueStatus) {
5453   case CM_ScalarEpilogueAllowed:
5454     return computeFeasibleMaxVF(TC, UserVF, false);
5455   case CM_ScalarEpilogueNotAllowedUsePredicate:
5456     LLVM_FALLTHROUGH;
5457   case CM_ScalarEpilogueNotNeededUsePredicate:
5458     LLVM_DEBUG(
5459         dbgs() << "LV: vector predicate hint/switch found.\n"
5460                << "LV: Not allowing scalar epilogue, creating predicated "
5461                << "vector loop.\n");
5462     break;
5463   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5464     // fallthrough as a special case of OptForSize
5465   case CM_ScalarEpilogueNotAllowedOptSize:
5466     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5467       LLVM_DEBUG(
5468           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5469     else
5470       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5471                         << "count.\n");
5472 
5473     // Bail if runtime checks are required, which are not good when optimising
5474     // for size.
5475     if (runtimeChecksRequired())
5476       return FixedScalableVFPair::getNone();
5477 
5478     break;
5479   }
5480 
5481   // The only loops we can vectorize without a scalar epilogue, are loops with
5482   // a bottom-test and a single exiting block. We'd have to handle the fact
5483   // that not every instruction executes on the last iteration.  This will
5484   // require a lane mask which varies through the vector loop body.  (TODO)
5485   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5486     // If there was a tail-folding hint/switch, but we can't fold the tail by
5487     // masking, fallback to a vectorization with a scalar epilogue.
5488     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5489       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5490                            "scalar epilogue instead.\n");
5491       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5492       return computeFeasibleMaxVF(TC, UserVF, false);
5493     }
5494     return FixedScalableVFPair::getNone();
5495   }
5496 
5497   // Now try the tail folding
5498 
5499   // Invalidate interleave groups that require an epilogue if we can't mask
5500   // the interleave-group.
5501   if (!useMaskedInterleavedAccesses(TTI)) {
5502     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5503            "No decisions should have been taken at this point");
5504     // Note: There is no need to invalidate any cost modeling decisions here, as
5505     // non where taken so far.
5506     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5507   }
5508 
5509   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5510   // Avoid tail folding if the trip count is known to be a multiple of any VF
5511   // we chose.
5512   // FIXME: The condition below pessimises the case for fixed-width vectors,
5513   // when scalable VFs are also candidates for vectorization.
5514   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5515     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5516     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5517            "MaxFixedVF must be a power of 2");
5518     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5519                                    : MaxFixedVF.getFixedValue();
5520     ScalarEvolution *SE = PSE.getSE();
5521     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5522     const SCEV *ExitCount = SE->getAddExpr(
5523         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5524     const SCEV *Rem = SE->getURemExpr(
5525         SE->applyLoopGuards(ExitCount, TheLoop),
5526         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5527     if (Rem->isZero()) {
5528       // Accept MaxFixedVF if we do not have a tail.
5529       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5530       return MaxFactors;
5531     }
5532   }
5533 
5534   // For scalable vectors, don't use tail folding as this is currently not yet
5535   // supported. The code is likely to have ended up here if the tripcount is
5536   // low, in which case it makes sense not to use scalable vectors.
5537   if (MaxFactors.ScalableVF.isVector())
5538     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5539 
5540   // If we don't know the precise trip count, or if the trip count that we
5541   // found modulo the vectorization factor is not zero, try to fold the tail
5542   // by masking.
5543   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5544   if (Legal->prepareToFoldTailByMasking()) {
5545     FoldTailByMasking = true;
5546     return MaxFactors;
5547   }
5548 
5549   // If there was a tail-folding hint/switch, but we can't fold the tail by
5550   // masking, fallback to a vectorization with a scalar epilogue.
5551   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5552     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5553                          "scalar epilogue instead.\n");
5554     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5555     return MaxFactors;
5556   }
5557 
5558   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5559     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5560     return FixedScalableVFPair::getNone();
5561   }
5562 
5563   if (TC == 0) {
5564     reportVectorizationFailure(
5565         "Unable to calculate the loop count due to complex control flow",
5566         "unable to calculate the loop count due to complex control flow",
5567         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5568     return FixedScalableVFPair::getNone();
5569   }
5570 
5571   reportVectorizationFailure(
5572       "Cannot optimize for size and vectorize at the same time.",
5573       "cannot optimize for size and vectorize at the same time. "
5574       "Enable vectorization of this loop with '#pragma clang loop "
5575       "vectorize(enable)' when compiling with -Os/-Oz",
5576       "NoTailLoopWithOptForSize", ORE, TheLoop);
5577   return FixedScalableVFPair::getNone();
5578 }
5579 
5580 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5581     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5582     const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
5583   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5584   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5585       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5586                            : TargetTransformInfo::RGK_FixedWidthVector);
5587 
5588   // Convenience function to return the minimum of two ElementCounts.
5589   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5590     assert((LHS.isScalable() == RHS.isScalable()) &&
5591            "Scalable flags must match");
5592     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5593   };
5594 
5595   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5596   // Note that both WidestRegister and WidestType may not be a powers of 2.
5597   auto MaxVectorElementCount = ElementCount::get(
5598       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5599       ComputeScalableMaxVF);
5600   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5601   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5602                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5603 
5604   if (!MaxVectorElementCount) {
5605     LLVM_DEBUG(dbgs() << "LV: The target has no "
5606                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5607                       << " vector registers.\n");
5608     return ElementCount::getFixed(1);
5609   }
5610 
5611   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5612   if (ConstTripCount &&
5613       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5614       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5615     // If loop trip count (TC) is known at compile time there is no point in
5616     // choosing VF greater than TC (as done in the loop below). Select maximum
5617     // power of two which doesn't exceed TC.
5618     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5619     // when the TC is less than or equal to the known number of lanes.
5620     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5621     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5622                          "exceeding the constant trip count: "
5623                       << ClampedConstTripCount << "\n");
5624     return ElementCount::getFixed(ClampedConstTripCount);
5625   }
5626 
5627   ElementCount MaxVF = MaxVectorElementCount;
5628   if (TTI.shouldMaximizeVectorBandwidth() ||
5629       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5630     auto MaxVectorElementCountMaxBW = ElementCount::get(
5631         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5632         ComputeScalableMaxVF);
5633     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5634 
5635     // Collect all viable vectorization factors larger than the default MaxVF
5636     // (i.e. MaxVectorElementCount).
5637     SmallVector<ElementCount, 8> VFs;
5638     for (ElementCount VS = MaxVectorElementCount * 2;
5639          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5640       VFs.push_back(VS);
5641 
5642     // For each VF calculate its register usage.
5643     auto RUs = calculateRegisterUsage(VFs);
5644 
5645     // Select the largest VF which doesn't require more registers than existing
5646     // ones.
5647     for (int i = RUs.size() - 1; i >= 0; --i) {
5648       bool Selected = true;
5649       for (auto &pair : RUs[i].MaxLocalUsers) {
5650         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5651         if (pair.second > TargetNumRegisters)
5652           Selected = false;
5653       }
5654       if (Selected) {
5655         MaxVF = VFs[i];
5656         break;
5657       }
5658     }
5659     if (ElementCount MinVF =
5660             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5661       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5662         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5663                           << ") with target's minimum: " << MinVF << '\n');
5664         MaxVF = MinVF;
5665       }
5666     }
5667   }
5668   return MaxVF;
5669 }
5670 
5671 bool LoopVectorizationCostModel::isMoreProfitable(
5672     const VectorizationFactor &A, const VectorizationFactor &B) const {
5673   InstructionCost CostA = A.Cost;
5674   InstructionCost CostB = B.Cost;
5675 
5676   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5677 
5678   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5679       MaxTripCount) {
5680     // If we are folding the tail and the trip count is a known (possibly small)
5681     // constant, the trip count will be rounded up to an integer number of
5682     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5683     // which we compare directly. When not folding the tail, the total cost will
5684     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5685     // approximated with the per-lane cost below instead of using the tripcount
5686     // as here.
5687     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5688     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5689     return RTCostA < RTCostB;
5690   }
5691 
5692   // Improve estimate for the vector width if it is scalable.
5693   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5694   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5695   if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
5696     if (A.Width.isScalable())
5697       EstimatedWidthA *= VScale.getValue();
5698     if (B.Width.isScalable())
5699       EstimatedWidthB *= VScale.getValue();
5700   }
5701 
5702   // Assume vscale may be larger than 1 (or the value being tuned for),
5703   // so that scalable vectorization is slightly favorable over fixed-width
5704   // vectorization.
5705   if (A.Width.isScalable() && !B.Width.isScalable())
5706     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5707 
5708   // To avoid the need for FP division:
5709   //      (CostA / A.Width) < (CostB / B.Width)
5710   // <=>  (CostA * B.Width) < (CostB * A.Width)
5711   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5712 }
5713 
5714 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5715     const ElementCountSet &VFCandidates) {
5716   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5717   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5718   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5719   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5720          "Expected Scalar VF to be a candidate");
5721 
5722   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5723   VectorizationFactor ChosenFactor = ScalarCost;
5724 
5725   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5726   if (ForceVectorization && VFCandidates.size() > 1) {
5727     // Ignore scalar width, because the user explicitly wants vectorization.
5728     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5729     // evaluation.
5730     ChosenFactor.Cost = InstructionCost::getMax();
5731   }
5732 
5733   SmallVector<InstructionVFPair> InvalidCosts;
5734   for (const auto &i : VFCandidates) {
5735     // The cost for scalar VF=1 is already calculated, so ignore it.
5736     if (i.isScalar())
5737       continue;
5738 
5739     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5740     VectorizationFactor Candidate(i, C.first);
5741 
5742 #ifndef NDEBUG
5743     unsigned AssumedMinimumVscale = 1;
5744     if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
5745       AssumedMinimumVscale = VScale.getValue();
5746     unsigned Width =
5747         Candidate.Width.isScalable()
5748             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5749             : Candidate.Width.getFixedValue();
5750     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5751                       << " costs: " << (Candidate.Cost / Width));
5752     if (i.isScalable())
5753       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5754                         << AssumedMinimumVscale << ")");
5755     LLVM_DEBUG(dbgs() << ".\n");
5756 #endif
5757 
5758     if (!C.second && !ForceVectorization) {
5759       LLVM_DEBUG(
5760           dbgs() << "LV: Not considering vector loop of width " << i
5761                  << " because it will not generate any vector instructions.\n");
5762       continue;
5763     }
5764 
5765     // If profitable add it to ProfitableVF list.
5766     if (isMoreProfitable(Candidate, ScalarCost))
5767       ProfitableVFs.push_back(Candidate);
5768 
5769     if (isMoreProfitable(Candidate, ChosenFactor))
5770       ChosenFactor = Candidate;
5771   }
5772 
5773   // Emit a report of VFs with invalid costs in the loop.
5774   if (!InvalidCosts.empty()) {
5775     // Group the remarks per instruction, keeping the instruction order from
5776     // InvalidCosts.
5777     std::map<Instruction *, unsigned> Numbering;
5778     unsigned I = 0;
5779     for (auto &Pair : InvalidCosts)
5780       if (!Numbering.count(Pair.first))
5781         Numbering[Pair.first] = I++;
5782 
5783     // Sort the list, first on instruction(number) then on VF.
5784     llvm::sort(InvalidCosts,
5785                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5786                  if (Numbering[A.first] != Numbering[B.first])
5787                    return Numbering[A.first] < Numbering[B.first];
5788                  ElementCountComparator ECC;
5789                  return ECC(A.second, B.second);
5790                });
5791 
5792     // For a list of ordered instruction-vf pairs:
5793     //   [(load, vf1), (load, vf2), (store, vf1)]
5794     // Group the instructions together to emit separate remarks for:
5795     //   load  (vf1, vf2)
5796     //   store (vf1)
5797     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5798     auto Subset = ArrayRef<InstructionVFPair>();
5799     do {
5800       if (Subset.empty())
5801         Subset = Tail.take_front(1);
5802 
5803       Instruction *I = Subset.front().first;
5804 
5805       // If the next instruction is different, or if there are no other pairs,
5806       // emit a remark for the collated subset. e.g.
5807       //   [(load, vf1), (load, vf2))]
5808       // to emit:
5809       //  remark: invalid costs for 'load' at VF=(vf, vf2)
5810       if (Subset == Tail || Tail[Subset.size()].first != I) {
5811         std::string OutString;
5812         raw_string_ostream OS(OutString);
5813         assert(!Subset.empty() && "Unexpected empty range");
5814         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5815         for (auto &Pair : Subset)
5816           OS << (Pair.second == Subset.front().second ? "" : ", ")
5817              << Pair.second;
5818         OS << "):";
5819         if (auto *CI = dyn_cast<CallInst>(I))
5820           OS << " call to " << CI->getCalledFunction()->getName();
5821         else
5822           OS << " " << I->getOpcodeName();
5823         OS.flush();
5824         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5825         Tail = Tail.drop_front(Subset.size());
5826         Subset = {};
5827       } else
5828         // Grow the subset by one element
5829         Subset = Tail.take_front(Subset.size() + 1);
5830     } while (!Tail.empty());
5831   }
5832 
5833   if (!EnableCondStoresVectorization && NumPredStores) {
5834     reportVectorizationFailure("There are conditional stores.",
5835         "store that is conditionally executed prevents vectorization",
5836         "ConditionalStore", ORE, TheLoop);
5837     ChosenFactor = ScalarCost;
5838   }
5839 
5840   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5841                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5842              << "LV: Vectorization seems to be not beneficial, "
5843              << "but was forced by a user.\n");
5844   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5845   return ChosenFactor;
5846 }
5847 
5848 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5849     const Loop &L, ElementCount VF) const {
5850   // Cross iteration phis such as reductions need special handling and are
5851   // currently unsupported.
5852   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5853         return Legal->isFirstOrderRecurrence(&Phi) ||
5854                Legal->isReductionVariable(&Phi);
5855       }))
5856     return false;
5857 
5858   // Phis with uses outside of the loop require special handling and are
5859   // currently unsupported.
5860   for (auto &Entry : Legal->getInductionVars()) {
5861     // Look for uses of the value of the induction at the last iteration.
5862     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5863     for (User *U : PostInc->users())
5864       if (!L.contains(cast<Instruction>(U)))
5865         return false;
5866     // Look for uses of penultimate value of the induction.
5867     for (User *U : Entry.first->users())
5868       if (!L.contains(cast<Instruction>(U)))
5869         return false;
5870   }
5871 
5872   // Induction variables that are widened require special handling that is
5873   // currently not supported.
5874   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5875         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5876                  this->isProfitableToScalarize(Entry.first, VF));
5877       }))
5878     return false;
5879 
5880   // Epilogue vectorization code has not been auditted to ensure it handles
5881   // non-latch exits properly.  It may be fine, but it needs auditted and
5882   // tested.
5883   if (L.getExitingBlock() != L.getLoopLatch())
5884     return false;
5885 
5886   return true;
5887 }
5888 
5889 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5890     const ElementCount VF) const {
5891   // FIXME: We need a much better cost-model to take different parameters such
5892   // as register pressure, code size increase and cost of extra branches into
5893   // account. For now we apply a very crude heuristic and only consider loops
5894   // with vectorization factors larger than a certain value.
5895   // We also consider epilogue vectorization unprofitable for targets that don't
5896   // consider interleaving beneficial (eg. MVE).
5897   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5898     return false;
5899   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5900     return true;
5901   return false;
5902 }
5903 
5904 VectorizationFactor
5905 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5906     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5907   VectorizationFactor Result = VectorizationFactor::Disabled();
5908   if (!EnableEpilogueVectorization) {
5909     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5910     return Result;
5911   }
5912 
5913   if (!isScalarEpilogueAllowed()) {
5914     LLVM_DEBUG(
5915         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5916                   "allowed.\n";);
5917     return Result;
5918   }
5919 
5920   // Not really a cost consideration, but check for unsupported cases here to
5921   // simplify the logic.
5922   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5923     LLVM_DEBUG(
5924         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5925                   "not a supported candidate.\n";);
5926     return Result;
5927   }
5928 
5929   if (EpilogueVectorizationForceVF > 1) {
5930     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5931     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5932     if (LVP.hasPlanWithVF(ForcedEC))
5933       return {ForcedEC, 0};
5934     else {
5935       LLVM_DEBUG(
5936           dbgs()
5937               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5938       return Result;
5939     }
5940   }
5941 
5942   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5943       TheLoop->getHeader()->getParent()->hasMinSize()) {
5944     LLVM_DEBUG(
5945         dbgs()
5946             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5947     return Result;
5948   }
5949 
5950   auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5951   if (MainLoopVF.isScalable())
5952     LLVM_DEBUG(
5953         dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
5954                   "yet supported. Converting to fixed-width (VF="
5955                << FixedMainLoopVF << ") instead\n");
5956 
5957   if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
5958     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5959                          "this loop\n");
5960     return Result;
5961   }
5962 
5963   for (auto &NextVF : ProfitableVFs)
5964     if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
5965         (Result.Width.getFixedValue() == 1 ||
5966          isMoreProfitable(NextVF, Result)) &&
5967         LVP.hasPlanWithVF(NextVF.Width))
5968       Result = NextVF;
5969 
5970   if (Result != VectorizationFactor::Disabled())
5971     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5972                       << Result.Width.getFixedValue() << "\n";);
5973   return Result;
5974 }
5975 
5976 std::pair<unsigned, unsigned>
5977 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5978   unsigned MinWidth = -1U;
5979   unsigned MaxWidth = 8;
5980   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5981   for (Type *T : ElementTypesInLoop) {
5982     MinWidth = std::min<unsigned>(
5983         MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5984     MaxWidth = std::max<unsigned>(
5985         MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5986   }
5987   return {MinWidth, MaxWidth};
5988 }
5989 
5990 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5991   ElementTypesInLoop.clear();
5992   // For each block.
5993   for (BasicBlock *BB : TheLoop->blocks()) {
5994     // For each instruction in the loop.
5995     for (Instruction &I : BB->instructionsWithoutDebug()) {
5996       Type *T = I.getType();
5997 
5998       // Skip ignored values.
5999       if (ValuesToIgnore.count(&I))
6000         continue;
6001 
6002       // Only examine Loads, Stores and PHINodes.
6003       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6004         continue;
6005 
6006       // Examine PHI nodes that are reduction variables. Update the type to
6007       // account for the recurrence type.
6008       if (auto *PN = dyn_cast<PHINode>(&I)) {
6009         if (!Legal->isReductionVariable(PN))
6010           continue;
6011         const RecurrenceDescriptor &RdxDesc =
6012             Legal->getReductionVars().find(PN)->second;
6013         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6014             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6015                                       RdxDesc.getRecurrenceType(),
6016                                       TargetTransformInfo::ReductionFlags()))
6017           continue;
6018         T = RdxDesc.getRecurrenceType();
6019       }
6020 
6021       // Examine the stored values.
6022       if (auto *ST = dyn_cast<StoreInst>(&I))
6023         T = ST->getValueOperand()->getType();
6024 
6025       // Ignore loaded pointer types and stored pointer types that are not
6026       // vectorizable.
6027       //
6028       // FIXME: The check here attempts to predict whether a load or store will
6029       //        be vectorized. We only know this for certain after a VF has
6030       //        been selected. Here, we assume that if an access can be
6031       //        vectorized, it will be. We should also look at extending this
6032       //        optimization to non-pointer types.
6033       //
6034       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6035           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6036         continue;
6037 
6038       ElementTypesInLoop.insert(T);
6039     }
6040   }
6041 }
6042 
6043 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6044                                                            unsigned LoopCost) {
6045   // -- The interleave heuristics --
6046   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6047   // There are many micro-architectural considerations that we can't predict
6048   // at this level. For example, frontend pressure (on decode or fetch) due to
6049   // code size, or the number and capabilities of the execution ports.
6050   //
6051   // We use the following heuristics to select the interleave count:
6052   // 1. If the code has reductions, then we interleave to break the cross
6053   // iteration dependency.
6054   // 2. If the loop is really small, then we interleave to reduce the loop
6055   // overhead.
6056   // 3. We don't interleave if we think that we will spill registers to memory
6057   // due to the increased register pressure.
6058 
6059   if (!isScalarEpilogueAllowed())
6060     return 1;
6061 
6062   // We used the distance for the interleave count.
6063   if (Legal->getMaxSafeDepDistBytes() != -1U)
6064     return 1;
6065 
6066   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6067   const bool HasReductions = !Legal->getReductionVars().empty();
6068   // Do not interleave loops with a relatively small known or estimated trip
6069   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
6070   // enabled, and the code has scalar reductions(HasReductions && VF = 1),
6071   // because with the above conditions interleaving can expose ILP and break
6072   // cross iteration dependences for reductions.
6073   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6074       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6075     return 1;
6076 
6077   RegisterUsage R = calculateRegisterUsage({VF})[0];
6078   // We divide by these constants so assume that we have at least one
6079   // instruction that uses at least one register.
6080   for (auto& pair : R.MaxLocalUsers) {
6081     pair.second = std::max(pair.second, 1U);
6082   }
6083 
6084   // We calculate the interleave count using the following formula.
6085   // Subtract the number of loop invariants from the number of available
6086   // registers. These registers are used by all of the interleaved instances.
6087   // Next, divide the remaining registers by the number of registers that is
6088   // required by the loop, in order to estimate how many parallel instances
6089   // fit without causing spills. All of this is rounded down if necessary to be
6090   // a power of two. We want power of two interleave count to simplify any
6091   // addressing operations or alignment considerations.
6092   // We also want power of two interleave counts to ensure that the induction
6093   // variable of the vector loop wraps to zero, when tail is folded by masking;
6094   // this currently happens when OptForSize, in which case IC is set to 1 above.
6095   unsigned IC = UINT_MAX;
6096 
6097   for (auto& pair : R.MaxLocalUsers) {
6098     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6099     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6100                       << " registers of "
6101                       << TTI.getRegisterClassName(pair.first) << " register class\n");
6102     if (VF.isScalar()) {
6103       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6104         TargetNumRegisters = ForceTargetNumScalarRegs;
6105     } else {
6106       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6107         TargetNumRegisters = ForceTargetNumVectorRegs;
6108     }
6109     unsigned MaxLocalUsers = pair.second;
6110     unsigned LoopInvariantRegs = 0;
6111     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6112       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6113 
6114     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6115     // Don't count the induction variable as interleaved.
6116     if (EnableIndVarRegisterHeur) {
6117       TmpIC =
6118           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6119                         std::max(1U, (MaxLocalUsers - 1)));
6120     }
6121 
6122     IC = std::min(IC, TmpIC);
6123   }
6124 
6125   // Clamp the interleave ranges to reasonable counts.
6126   unsigned MaxInterleaveCount =
6127       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6128 
6129   // Check if the user has overridden the max.
6130   if (VF.isScalar()) {
6131     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6132       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6133   } else {
6134     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6135       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6136   }
6137 
6138   // If trip count is known or estimated compile time constant, limit the
6139   // interleave count to be less than the trip count divided by VF, provided it
6140   // is at least 1.
6141   //
6142   // For scalable vectors we can't know if interleaving is beneficial. It may
6143   // not be beneficial for small loops if none of the lanes in the second vector
6144   // iterations is enabled. However, for larger loops, there is likely to be a
6145   // similar benefit as for fixed-width vectors. For now, we choose to leave
6146   // the InterleaveCount as if vscale is '1', although if some information about
6147   // the vector is known (e.g. min vector size), we can make a better decision.
6148   if (BestKnownTC) {
6149     MaxInterleaveCount =
6150         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6151     // Make sure MaxInterleaveCount is greater than 0.
6152     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6153   }
6154 
6155   assert(MaxInterleaveCount > 0 &&
6156          "Maximum interleave count must be greater than 0");
6157 
6158   // Clamp the calculated IC to be between the 1 and the max interleave count
6159   // that the target and trip count allows.
6160   if (IC > MaxInterleaveCount)
6161     IC = MaxInterleaveCount;
6162   else
6163     // Make sure IC is greater than 0.
6164     IC = std::max(1u, IC);
6165 
6166   assert(IC > 0 && "Interleave count must be greater than 0.");
6167 
6168   // If we did not calculate the cost for VF (because the user selected the VF)
6169   // then we calculate the cost of VF here.
6170   if (LoopCost == 0) {
6171     InstructionCost C = expectedCost(VF).first;
6172     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6173     LoopCost = *C.getValue();
6174   }
6175 
6176   assert(LoopCost && "Non-zero loop cost expected");
6177 
6178   // Interleave if we vectorized this loop and there is a reduction that could
6179   // benefit from interleaving.
6180   if (VF.isVector() && HasReductions) {
6181     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6182     return IC;
6183   }
6184 
6185   // Note that if we've already vectorized the loop we will have done the
6186   // runtime check and so interleaving won't require further checks.
6187   bool InterleavingRequiresRuntimePointerCheck =
6188       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6189 
6190   // We want to interleave small loops in order to reduce the loop overhead and
6191   // potentially expose ILP opportunities.
6192   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6193                     << "LV: IC is " << IC << '\n'
6194                     << "LV: VF is " << VF << '\n');
6195   const bool AggressivelyInterleaveReductions =
6196       TTI.enableAggressiveInterleaving(HasReductions);
6197   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6198     // We assume that the cost overhead is 1 and we use the cost model
6199     // to estimate the cost of the loop and interleave until the cost of the
6200     // loop overhead is about 5% of the cost of the loop.
6201     unsigned SmallIC =
6202         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6203 
6204     // Interleave until store/load ports (estimated by max interleave count) are
6205     // saturated.
6206     unsigned NumStores = Legal->getNumStores();
6207     unsigned NumLoads = Legal->getNumLoads();
6208     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6209     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6210 
6211     // There is little point in interleaving for reductions containing selects
6212     // and compares when VF=1 since it may just create more overhead than it's
6213     // worth for loops with small trip counts. This is because we still have to
6214     // do the final reduction after the loop.
6215     bool HasSelectCmpReductions =
6216         HasReductions &&
6217         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6218           const RecurrenceDescriptor &RdxDesc = Reduction.second;
6219           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
6220               RdxDesc.getRecurrenceKind());
6221         });
6222     if (HasSelectCmpReductions) {
6223       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
6224       return 1;
6225     }
6226 
6227     // If we have a scalar reduction (vector reductions are already dealt with
6228     // by this point), we can increase the critical path length if the loop
6229     // we're interleaving is inside another loop. For tree-wise reductions
6230     // set the limit to 2, and for ordered reductions it's best to disable
6231     // interleaving entirely.
6232     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6233       bool HasOrderedReductions =
6234           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6235             const RecurrenceDescriptor &RdxDesc = Reduction.second;
6236             return RdxDesc.isOrdered();
6237           });
6238       if (HasOrderedReductions) {
6239         LLVM_DEBUG(
6240             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6241         return 1;
6242       }
6243 
6244       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6245       SmallIC = std::min(SmallIC, F);
6246       StoresIC = std::min(StoresIC, F);
6247       LoadsIC = std::min(LoadsIC, F);
6248     }
6249 
6250     if (EnableLoadStoreRuntimeInterleave &&
6251         std::max(StoresIC, LoadsIC) > SmallIC) {
6252       LLVM_DEBUG(
6253           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6254       return std::max(StoresIC, LoadsIC);
6255     }
6256 
6257     // If there are scalar reductions and TTI has enabled aggressive
6258     // interleaving for reductions, we will interleave to expose ILP.
6259     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6260         AggressivelyInterleaveReductions) {
6261       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6262       // Interleave no less than SmallIC but not as aggressive as the normal IC
6263       // to satisfy the rare situation when resources are too limited.
6264       return std::max(IC / 2, SmallIC);
6265     } else {
6266       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6267       return SmallIC;
6268     }
6269   }
6270 
6271   // Interleave if this is a large loop (small loops are already dealt with by
6272   // this point) that could benefit from interleaving.
6273   if (AggressivelyInterleaveReductions) {
6274     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6275     return IC;
6276   }
6277 
6278   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6279   return 1;
6280 }
6281 
6282 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6283 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6284   // This function calculates the register usage by measuring the highest number
6285   // of values that are alive at a single location. Obviously, this is a very
6286   // rough estimation. We scan the loop in a topological order in order and
6287   // assign a number to each instruction. We use RPO to ensure that defs are
6288   // met before their users. We assume that each instruction that has in-loop
6289   // users starts an interval. We record every time that an in-loop value is
6290   // used, so we have a list of the first and last occurrences of each
6291   // instruction. Next, we transpose this data structure into a multi map that
6292   // holds the list of intervals that *end* at a specific location. This multi
6293   // map allows us to perform a linear search. We scan the instructions linearly
6294   // and record each time that a new interval starts, by placing it in a set.
6295   // If we find this value in the multi-map then we remove it from the set.
6296   // The max register usage is the maximum size of the set.
6297   // We also search for instructions that are defined outside the loop, but are
6298   // used inside the loop. We need this number separately from the max-interval
6299   // usage number because when we unroll, loop-invariant values do not take
6300   // more register.
6301   LoopBlocksDFS DFS(TheLoop);
6302   DFS.perform(LI);
6303 
6304   RegisterUsage RU;
6305 
6306   // Each 'key' in the map opens a new interval. The values
6307   // of the map are the index of the 'last seen' usage of the
6308   // instruction that is the key.
6309   using IntervalMap = DenseMap<Instruction *, unsigned>;
6310 
6311   // Maps instruction to its index.
6312   SmallVector<Instruction *, 64> IdxToInstr;
6313   // Marks the end of each interval.
6314   IntervalMap EndPoint;
6315   // Saves the list of instruction indices that are used in the loop.
6316   SmallPtrSet<Instruction *, 8> Ends;
6317   // Saves the list of values that are used in the loop but are
6318   // defined outside the loop, such as arguments and constants.
6319   SmallPtrSet<Value *, 8> LoopInvariants;
6320 
6321   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6322     for (Instruction &I : BB->instructionsWithoutDebug()) {
6323       IdxToInstr.push_back(&I);
6324 
6325       // Save the end location of each USE.
6326       for (Value *U : I.operands()) {
6327         auto *Instr = dyn_cast<Instruction>(U);
6328 
6329         // Ignore non-instruction values such as arguments, constants, etc.
6330         if (!Instr)
6331           continue;
6332 
6333         // If this instruction is outside the loop then record it and continue.
6334         if (!TheLoop->contains(Instr)) {
6335           LoopInvariants.insert(Instr);
6336           continue;
6337         }
6338 
6339         // Overwrite previous end points.
6340         EndPoint[Instr] = IdxToInstr.size();
6341         Ends.insert(Instr);
6342       }
6343     }
6344   }
6345 
6346   // Saves the list of intervals that end with the index in 'key'.
6347   using InstrList = SmallVector<Instruction *, 2>;
6348   DenseMap<unsigned, InstrList> TransposeEnds;
6349 
6350   // Transpose the EndPoints to a list of values that end at each index.
6351   for (auto &Interval : EndPoint)
6352     TransposeEnds[Interval.second].push_back(Interval.first);
6353 
6354   SmallPtrSet<Instruction *, 8> OpenIntervals;
6355   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6356   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6357 
6358   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6359 
6360   // A lambda that gets the register usage for the given type and VF.
6361   const auto &TTICapture = TTI;
6362   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6363     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6364       return 0;
6365     InstructionCost::CostType RegUsage =
6366         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6367     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6368            "Nonsensical values for register usage.");
6369     return RegUsage;
6370   };
6371 
6372   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6373     Instruction *I = IdxToInstr[i];
6374 
6375     // Remove all of the instructions that end at this location.
6376     InstrList &List = TransposeEnds[i];
6377     for (Instruction *ToRemove : List)
6378       OpenIntervals.erase(ToRemove);
6379 
6380     // Ignore instructions that are never used within the loop.
6381     if (!Ends.count(I))
6382       continue;
6383 
6384     // Skip ignored values.
6385     if (ValuesToIgnore.count(I))
6386       continue;
6387 
6388     // For each VF find the maximum usage of registers.
6389     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6390       // Count the number of live intervals.
6391       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6392 
6393       if (VFs[j].isScalar()) {
6394         for (auto Inst : OpenIntervals) {
6395           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6396           if (RegUsage.find(ClassID) == RegUsage.end())
6397             RegUsage[ClassID] = 1;
6398           else
6399             RegUsage[ClassID] += 1;
6400         }
6401       } else {
6402         collectUniformsAndScalars(VFs[j]);
6403         for (auto Inst : OpenIntervals) {
6404           // Skip ignored values for VF > 1.
6405           if (VecValuesToIgnore.count(Inst))
6406             continue;
6407           if (isScalarAfterVectorization(Inst, VFs[j])) {
6408             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6409             if (RegUsage.find(ClassID) == RegUsage.end())
6410               RegUsage[ClassID] = 1;
6411             else
6412               RegUsage[ClassID] += 1;
6413           } else {
6414             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6415             if (RegUsage.find(ClassID) == RegUsage.end())
6416               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6417             else
6418               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6419           }
6420         }
6421       }
6422 
6423       for (auto& pair : RegUsage) {
6424         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6425           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6426         else
6427           MaxUsages[j][pair.first] = pair.second;
6428       }
6429     }
6430 
6431     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6432                       << OpenIntervals.size() << '\n');
6433 
6434     // Add the current instruction to the list of open intervals.
6435     OpenIntervals.insert(I);
6436   }
6437 
6438   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6439     SmallMapVector<unsigned, unsigned, 4> Invariant;
6440 
6441     for (auto Inst : LoopInvariants) {
6442       unsigned Usage =
6443           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6444       unsigned ClassID =
6445           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6446       if (Invariant.find(ClassID) == Invariant.end())
6447         Invariant[ClassID] = Usage;
6448       else
6449         Invariant[ClassID] += Usage;
6450     }
6451 
6452     LLVM_DEBUG({
6453       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6454       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6455              << " item\n";
6456       for (const auto &pair : MaxUsages[i]) {
6457         dbgs() << "LV(REG): RegisterClass: "
6458                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6459                << " registers\n";
6460       }
6461       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6462              << " item\n";
6463       for (const auto &pair : Invariant) {
6464         dbgs() << "LV(REG): RegisterClass: "
6465                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6466                << " registers\n";
6467       }
6468     });
6469 
6470     RU.LoopInvariantRegs = Invariant;
6471     RU.MaxLocalUsers = MaxUsages[i];
6472     RUs[i] = RU;
6473   }
6474 
6475   return RUs;
6476 }
6477 
6478 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
6479   // TODO: Cost model for emulated masked load/store is completely
6480   // broken. This hack guides the cost model to use an artificially
6481   // high enough value to practically disable vectorization with such
6482   // operations, except where previously deployed legality hack allowed
6483   // using very low cost values. This is to avoid regressions coming simply
6484   // from moving "masked load/store" check from legality to cost model.
6485   // Masked Load/Gather emulation was previously never allowed.
6486   // Limited number of Masked Store/Scatter emulation was allowed.
6487   assert(isPredicatedInst(I) &&
6488          "Expecting a scalar emulated instruction");
6489   return isa<LoadInst>(I) ||
6490          (isa<StoreInst>(I) &&
6491           NumPredStores > NumberOfStoresToPredicate);
6492 }
6493 
6494 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6495   // If we aren't vectorizing the loop, or if we've already collected the
6496   // instructions to scalarize, there's nothing to do. Collection may already
6497   // have occurred if we have a user-selected VF and are now computing the
6498   // expected cost for interleaving.
6499   if (VF.isScalar() || VF.isZero() ||
6500       InstsToScalarize.find(VF) != InstsToScalarize.end())
6501     return;
6502 
6503   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
6504   // not profitable to scalarize any instructions, the presence of VF in the
6505   // map will indicate that we've analyzed it already.
6506   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6507 
6508   // Find all the instructions that are scalar with predication in the loop and
6509   // determine if it would be better to not if-convert the blocks they are in.
6510   // If so, we also record the instructions to scalarize.
6511   for (BasicBlock *BB : TheLoop->blocks()) {
6512     if (!blockNeedsPredicationForAnyReason(BB))
6513       continue;
6514     for (Instruction &I : *BB)
6515       if (isScalarWithPredication(&I)) {
6516         ScalarCostsTy ScalarCosts;
6517         // Do not apply discount if scalable, because that would lead to
6518         // invalid scalarization costs.
6519         // Do not apply discount logic if hacked cost is needed
6520         // for emulated masked memrefs.
6521         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6522             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6523           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6524         // Remember that BB will remain after vectorization.
6525         PredicatedBBsAfterVectorization.insert(BB);
6526       }
6527   }
6528 }
6529 
6530 int LoopVectorizationCostModel::computePredInstDiscount(
6531     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6532   assert(!isUniformAfterVectorization(PredInst, VF) &&
6533          "Instruction marked uniform-after-vectorization will be predicated");
6534 
6535   // Initialize the discount to zero, meaning that the scalar version and the
6536   // vector version cost the same.
6537   InstructionCost Discount = 0;
6538 
6539   // Holds instructions to analyze. The instructions we visit are mapped in
6540   // ScalarCosts. Those instructions are the ones that would be scalarized if
6541   // we find that the scalar version costs less.
6542   SmallVector<Instruction *, 8> Worklist;
6543 
6544   // Returns true if the given instruction can be scalarized.
6545   auto canBeScalarized = [&](Instruction *I) -> bool {
6546     // We only attempt to scalarize instructions forming a single-use chain
6547     // from the original predicated block that would otherwise be vectorized.
6548     // Although not strictly necessary, we give up on instructions we know will
6549     // already be scalar to avoid traversing chains that are unlikely to be
6550     // beneficial.
6551     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6552         isScalarAfterVectorization(I, VF))
6553       return false;
6554 
6555     // If the instruction is scalar with predication, it will be analyzed
6556     // separately. We ignore it within the context of PredInst.
6557     if (isScalarWithPredication(I))
6558       return false;
6559 
6560     // If any of the instruction's operands are uniform after vectorization,
6561     // the instruction cannot be scalarized. This prevents, for example, a
6562     // masked load from being scalarized.
6563     //
6564     // We assume we will only emit a value for lane zero of an instruction
6565     // marked uniform after vectorization, rather than VF identical values.
6566     // Thus, if we scalarize an instruction that uses a uniform, we would
6567     // create uses of values corresponding to the lanes we aren't emitting code
6568     // for. This behavior can be changed by allowing getScalarValue to clone
6569     // the lane zero values for uniforms rather than asserting.
6570     for (Use &U : I->operands())
6571       if (auto *J = dyn_cast<Instruction>(U.get()))
6572         if (isUniformAfterVectorization(J, VF))
6573           return false;
6574 
6575     // Otherwise, we can scalarize the instruction.
6576     return true;
6577   };
6578 
6579   // Compute the expected cost discount from scalarizing the entire expression
6580   // feeding the predicated instruction. We currently only consider expressions
6581   // that are single-use instruction chains.
6582   Worklist.push_back(PredInst);
6583   while (!Worklist.empty()) {
6584     Instruction *I = Worklist.pop_back_val();
6585 
6586     // If we've already analyzed the instruction, there's nothing to do.
6587     if (ScalarCosts.find(I) != ScalarCosts.end())
6588       continue;
6589 
6590     // Compute the cost of the vector instruction. Note that this cost already
6591     // includes the scalarization overhead of the predicated instruction.
6592     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6593 
6594     // Compute the cost of the scalarized instruction. This cost is the cost of
6595     // the instruction as if it wasn't if-converted and instead remained in the
6596     // predicated block. We will scale this cost by block probability after
6597     // computing the scalarization overhead.
6598     InstructionCost ScalarCost =
6599         VF.getFixedValue() *
6600         getInstructionCost(I, ElementCount::getFixed(1)).first;
6601 
6602     // Compute the scalarization overhead of needed insertelement instructions
6603     // and phi nodes.
6604     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6605       ScalarCost += TTI.getScalarizationOverhead(
6606           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6607           APInt::getAllOnes(VF.getFixedValue()), true, false);
6608       ScalarCost +=
6609           VF.getFixedValue() *
6610           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6611     }
6612 
6613     // Compute the scalarization overhead of needed extractelement
6614     // instructions. For each of the instruction's operands, if the operand can
6615     // be scalarized, add it to the worklist; otherwise, account for the
6616     // overhead.
6617     for (Use &U : I->operands())
6618       if (auto *J = dyn_cast<Instruction>(U.get())) {
6619         assert(VectorType::isValidElementType(J->getType()) &&
6620                "Instruction has non-scalar type");
6621         if (canBeScalarized(J))
6622           Worklist.push_back(J);
6623         else if (needsExtract(J, VF)) {
6624           ScalarCost += TTI.getScalarizationOverhead(
6625               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6626               APInt::getAllOnes(VF.getFixedValue()), false, true);
6627         }
6628       }
6629 
6630     // Scale the total scalar cost by block probability.
6631     ScalarCost /= getReciprocalPredBlockProb();
6632 
6633     // Compute the discount. A non-negative discount means the vector version
6634     // of the instruction costs more, and scalarizing would be beneficial.
6635     Discount += VectorCost - ScalarCost;
6636     ScalarCosts[I] = ScalarCost;
6637   }
6638 
6639   return *Discount.getValue();
6640 }
6641 
6642 LoopVectorizationCostModel::VectorizationCostTy
6643 LoopVectorizationCostModel::expectedCost(
6644     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6645   VectorizationCostTy Cost;
6646 
6647   // For each block.
6648   for (BasicBlock *BB : TheLoop->blocks()) {
6649     VectorizationCostTy BlockCost;
6650 
6651     // For each instruction in the old loop.
6652     for (Instruction &I : BB->instructionsWithoutDebug()) {
6653       // Skip ignored values.
6654       if (ValuesToIgnore.count(&I) ||
6655           (VF.isVector() && VecValuesToIgnore.count(&I)))
6656         continue;
6657 
6658       VectorizationCostTy C = getInstructionCost(&I, VF);
6659 
6660       // Check if we should override the cost.
6661       if (C.first.isValid() &&
6662           ForceTargetInstructionCost.getNumOccurrences() > 0)
6663         C.first = InstructionCost(ForceTargetInstructionCost);
6664 
6665       // Keep a list of instructions with invalid costs.
6666       if (Invalid && !C.first.isValid())
6667         Invalid->emplace_back(&I, VF);
6668 
6669       BlockCost.first += C.first;
6670       BlockCost.second |= C.second;
6671       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6672                         << " for VF " << VF << " For instruction: " << I
6673                         << '\n');
6674     }
6675 
6676     // If we are vectorizing a predicated block, it will have been
6677     // if-converted. This means that the block's instructions (aside from
6678     // stores and instructions that may divide by zero) will now be
6679     // unconditionally executed. For the scalar case, we may not always execute
6680     // the predicated block, if it is an if-else block. Thus, scale the block's
6681     // cost by the probability of executing it. blockNeedsPredication from
6682     // Legal is used so as to not include all blocks in tail folded loops.
6683     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6684       BlockCost.first /= getReciprocalPredBlockProb();
6685 
6686     Cost.first += BlockCost.first;
6687     Cost.second |= BlockCost.second;
6688   }
6689 
6690   return Cost;
6691 }
6692 
6693 /// Gets Address Access SCEV after verifying that the access pattern
6694 /// is loop invariant except the induction variable dependence.
6695 ///
6696 /// This SCEV can be sent to the Target in order to estimate the address
6697 /// calculation cost.
6698 static const SCEV *getAddressAccessSCEV(
6699               Value *Ptr,
6700               LoopVectorizationLegality *Legal,
6701               PredicatedScalarEvolution &PSE,
6702               const Loop *TheLoop) {
6703 
6704   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6705   if (!Gep)
6706     return nullptr;
6707 
6708   // We are looking for a gep with all loop invariant indices except for one
6709   // which should be an induction variable.
6710   auto SE = PSE.getSE();
6711   unsigned NumOperands = Gep->getNumOperands();
6712   for (unsigned i = 1; i < NumOperands; ++i) {
6713     Value *Opd = Gep->getOperand(i);
6714     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6715         !Legal->isInductionVariable(Opd))
6716       return nullptr;
6717   }
6718 
6719   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6720   return PSE.getSCEV(Ptr);
6721 }
6722 
6723 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6724   return Legal->hasStride(I->getOperand(0)) ||
6725          Legal->hasStride(I->getOperand(1));
6726 }
6727 
6728 InstructionCost
6729 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6730                                                         ElementCount VF) {
6731   assert(VF.isVector() &&
6732          "Scalarization cost of instruction implies vectorization.");
6733   if (VF.isScalable())
6734     return InstructionCost::getInvalid();
6735 
6736   Type *ValTy = getLoadStoreType(I);
6737   auto SE = PSE.getSE();
6738 
6739   unsigned AS = getLoadStoreAddressSpace(I);
6740   Value *Ptr = getLoadStorePointerOperand(I);
6741   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6742   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6743   //       that it is being called from this specific place.
6744 
6745   // Figure out whether the access is strided and get the stride value
6746   // if it's known in compile time
6747   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6748 
6749   // Get the cost of the scalar memory instruction and address computation.
6750   InstructionCost Cost =
6751       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6752 
6753   // Don't pass *I here, since it is scalar but will actually be part of a
6754   // vectorized loop where the user of it is a vectorized instruction.
6755   const Align Alignment = getLoadStoreAlignment(I);
6756   Cost += VF.getKnownMinValue() *
6757           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6758                               AS, TTI::TCK_RecipThroughput);
6759 
6760   // Get the overhead of the extractelement and insertelement instructions
6761   // we might create due to scalarization.
6762   Cost += getScalarizationOverhead(I, VF);
6763 
6764   // If we have a predicated load/store, it will need extra i1 extracts and
6765   // conditional branches, but may not be executed for each vector lane. Scale
6766   // the cost by the probability of executing the predicated block.
6767   if (isPredicatedInst(I)) {
6768     Cost /= getReciprocalPredBlockProb();
6769 
6770     // Add the cost of an i1 extract and a branch
6771     auto *Vec_i1Ty =
6772         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6773     Cost += TTI.getScalarizationOverhead(
6774         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6775         /*Insert=*/false, /*Extract=*/true);
6776     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6777 
6778     if (useEmulatedMaskMemRefHack(I))
6779       // Artificially setting to a high enough value to practically disable
6780       // vectorization with such operations.
6781       Cost = 3000000;
6782   }
6783 
6784   return Cost;
6785 }
6786 
6787 InstructionCost
6788 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6789                                                     ElementCount VF) {
6790   Type *ValTy = getLoadStoreType(I);
6791   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6792   Value *Ptr = getLoadStorePointerOperand(I);
6793   unsigned AS = getLoadStoreAddressSpace(I);
6794   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6795   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6796 
6797   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6798          "Stride should be 1 or -1 for consecutive memory access");
6799   const Align Alignment = getLoadStoreAlignment(I);
6800   InstructionCost Cost = 0;
6801   if (Legal->isMaskRequired(I))
6802     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6803                                       CostKind);
6804   else
6805     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6806                                 CostKind, I);
6807 
6808   bool Reverse = ConsecutiveStride < 0;
6809   if (Reverse)
6810     Cost +=
6811         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6812   return Cost;
6813 }
6814 
6815 InstructionCost
6816 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6817                                                 ElementCount VF) {
6818   assert(Legal->isUniformMemOp(*I));
6819 
6820   Type *ValTy = getLoadStoreType(I);
6821   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6822   const Align Alignment = getLoadStoreAlignment(I);
6823   unsigned AS = getLoadStoreAddressSpace(I);
6824   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6825   if (isa<LoadInst>(I)) {
6826     return TTI.getAddressComputationCost(ValTy) +
6827            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6828                                CostKind) +
6829            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6830   }
6831   StoreInst *SI = cast<StoreInst>(I);
6832 
6833   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6834   return TTI.getAddressComputationCost(ValTy) +
6835          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6836                              CostKind) +
6837          (isLoopInvariantStoreValue
6838               ? 0
6839               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6840                                        VF.getKnownMinValue() - 1));
6841 }
6842 
6843 InstructionCost
6844 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6845                                                  ElementCount VF) {
6846   Type *ValTy = getLoadStoreType(I);
6847   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6848   const Align Alignment = getLoadStoreAlignment(I);
6849   const Value *Ptr = getLoadStorePointerOperand(I);
6850 
6851   return TTI.getAddressComputationCost(VectorTy) +
6852          TTI.getGatherScatterOpCost(
6853              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6854              TargetTransformInfo::TCK_RecipThroughput, I);
6855 }
6856 
6857 InstructionCost
6858 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6859                                                    ElementCount VF) {
6860   // TODO: Once we have support for interleaving with scalable vectors
6861   // we can calculate the cost properly here.
6862   if (VF.isScalable())
6863     return InstructionCost::getInvalid();
6864 
6865   Type *ValTy = getLoadStoreType(I);
6866   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6867   unsigned AS = getLoadStoreAddressSpace(I);
6868 
6869   auto Group = getInterleavedAccessGroup(I);
6870   assert(Group && "Fail to get an interleaved access group.");
6871 
6872   unsigned InterleaveFactor = Group->getFactor();
6873   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6874 
6875   // Holds the indices of existing members in the interleaved group.
6876   SmallVector<unsigned, 4> Indices;
6877   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6878     if (Group->getMember(IF))
6879       Indices.push_back(IF);
6880 
6881   // Calculate the cost of the whole interleaved group.
6882   bool UseMaskForGaps =
6883       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6884       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6885   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6886       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6887       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6888 
6889   if (Group->isReverse()) {
6890     // TODO: Add support for reversed masked interleaved access.
6891     assert(!Legal->isMaskRequired(I) &&
6892            "Reverse masked interleaved access not supported.");
6893     Cost +=
6894         Group->getNumMembers() *
6895         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6896   }
6897   return Cost;
6898 }
6899 
6900 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6901     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6902   using namespace llvm::PatternMatch;
6903   // Early exit for no inloop reductions
6904   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6905     return None;
6906   auto *VectorTy = cast<VectorType>(Ty);
6907 
6908   // We are looking for a pattern of, and finding the minimal acceptable cost:
6909   //  reduce(mul(ext(A), ext(B))) or
6910   //  reduce(mul(A, B)) or
6911   //  reduce(ext(A)) or
6912   //  reduce(A).
6913   // The basic idea is that we walk down the tree to do that, finding the root
6914   // reduction instruction in InLoopReductionImmediateChains. From there we find
6915   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6916   // of the components. If the reduction cost is lower then we return it for the
6917   // reduction instruction and 0 for the other instructions in the pattern. If
6918   // it is not we return an invalid cost specifying the orignal cost method
6919   // should be used.
6920   Instruction *RetI = I;
6921   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6922     if (!RetI->hasOneUser())
6923       return None;
6924     RetI = RetI->user_back();
6925   }
6926   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6927       RetI->user_back()->getOpcode() == Instruction::Add) {
6928     if (!RetI->hasOneUser())
6929       return None;
6930     RetI = RetI->user_back();
6931   }
6932 
6933   // Test if the found instruction is a reduction, and if not return an invalid
6934   // cost specifying the parent to use the original cost modelling.
6935   if (!InLoopReductionImmediateChains.count(RetI))
6936     return None;
6937 
6938   // Find the reduction this chain is a part of and calculate the basic cost of
6939   // the reduction on its own.
6940   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6941   Instruction *ReductionPhi = LastChain;
6942   while (!isa<PHINode>(ReductionPhi))
6943     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6944 
6945   const RecurrenceDescriptor &RdxDesc =
6946       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6947 
6948   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6949       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6950 
6951   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6952   // normal fmul instruction to the cost of the fadd reduction.
6953   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6954     BaseCost +=
6955         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6956 
6957   // If we're using ordered reductions then we can just return the base cost
6958   // here, since getArithmeticReductionCost calculates the full ordered
6959   // reduction cost when FP reassociation is not allowed.
6960   if (useOrderedReductions(RdxDesc))
6961     return BaseCost;
6962 
6963   // Get the operand that was not the reduction chain and match it to one of the
6964   // patterns, returning the better cost if it is found.
6965   Instruction *RedOp = RetI->getOperand(1) == LastChain
6966                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6967                            : dyn_cast<Instruction>(RetI->getOperand(1));
6968 
6969   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6970 
6971   Instruction *Op0, *Op1;
6972   if (RedOp &&
6973       match(RedOp,
6974             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6975       match(Op0, m_ZExtOrSExt(m_Value())) &&
6976       Op0->getOpcode() == Op1->getOpcode() &&
6977       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6978       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6979       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6980 
6981     // Matched reduce(ext(mul(ext(A), ext(B)))
6982     // Note that the extend opcodes need to all match, or if A==B they will have
6983     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6984     // which is equally fine.
6985     bool IsUnsigned = isa<ZExtInst>(Op0);
6986     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6987     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6988 
6989     InstructionCost ExtCost =
6990         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6991                              TTI::CastContextHint::None, CostKind, Op0);
6992     InstructionCost MulCost =
6993         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6994     InstructionCost Ext2Cost =
6995         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6996                              TTI::CastContextHint::None, CostKind, RedOp);
6997 
6998     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6999         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7000         CostKind);
7001 
7002     if (RedCost.isValid() &&
7003         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
7004       return I == RetI ? RedCost : 0;
7005   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
7006              !TheLoop->isLoopInvariant(RedOp)) {
7007     // Matched reduce(ext(A))
7008     bool IsUnsigned = isa<ZExtInst>(RedOp);
7009     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
7010     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7011         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7012         CostKind);
7013 
7014     InstructionCost ExtCost =
7015         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
7016                              TTI::CastContextHint::None, CostKind, RedOp);
7017     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
7018       return I == RetI ? RedCost : 0;
7019   } else if (RedOp &&
7020              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
7021     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
7022         Op0->getOpcode() == Op1->getOpcode() &&
7023         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
7024       bool IsUnsigned = isa<ZExtInst>(Op0);
7025       Type *Op0Ty = Op0->getOperand(0)->getType();
7026       Type *Op1Ty = Op1->getOperand(0)->getType();
7027       Type *LargestOpTy =
7028           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
7029                                                                     : Op0Ty;
7030       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
7031 
7032       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
7033       // different sizes. We take the largest type as the ext to reduce, and add
7034       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
7035       InstructionCost ExtCost0 = TTI.getCastInstrCost(
7036           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
7037           TTI::CastContextHint::None, CostKind, Op0);
7038       InstructionCost ExtCost1 = TTI.getCastInstrCost(
7039           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
7040           TTI::CastContextHint::None, CostKind, Op1);
7041       InstructionCost MulCost =
7042           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7043 
7044       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7045           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7046           CostKind);
7047       InstructionCost ExtraExtCost = 0;
7048       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
7049         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
7050         ExtraExtCost = TTI.getCastInstrCost(
7051             ExtraExtOp->getOpcode(), ExtType,
7052             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
7053             TTI::CastContextHint::None, CostKind, ExtraExtOp);
7054       }
7055 
7056       if (RedCost.isValid() &&
7057           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
7058         return I == RetI ? RedCost : 0;
7059     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
7060       // Matched reduce(mul())
7061       InstructionCost MulCost =
7062           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7063 
7064       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7065           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7066           CostKind);
7067 
7068       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7069         return I == RetI ? RedCost : 0;
7070     }
7071   }
7072 
7073   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
7074 }
7075 
7076 InstructionCost
7077 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7078                                                      ElementCount VF) {
7079   // Calculate scalar cost only. Vectorization cost should be ready at this
7080   // moment.
7081   if (VF.isScalar()) {
7082     Type *ValTy = getLoadStoreType(I);
7083     const Align Alignment = getLoadStoreAlignment(I);
7084     unsigned AS = getLoadStoreAddressSpace(I);
7085 
7086     return TTI.getAddressComputationCost(ValTy) +
7087            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7088                                TTI::TCK_RecipThroughput, I);
7089   }
7090   return getWideningCost(I, VF);
7091 }
7092 
7093 LoopVectorizationCostModel::VectorizationCostTy
7094 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7095                                                ElementCount VF) {
7096   // If we know that this instruction will remain uniform, check the cost of
7097   // the scalar version.
7098   if (isUniformAfterVectorization(I, VF))
7099     VF = ElementCount::getFixed(1);
7100 
7101   if (VF.isVector() && isProfitableToScalarize(I, VF))
7102     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7103 
7104   // Forced scalars do not have any scalarization overhead.
7105   auto ForcedScalar = ForcedScalars.find(VF);
7106   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7107     auto InstSet = ForcedScalar->second;
7108     if (InstSet.count(I))
7109       return VectorizationCostTy(
7110           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7111            VF.getKnownMinValue()),
7112           false);
7113   }
7114 
7115   Type *VectorTy;
7116   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7117 
7118   bool TypeNotScalarized = false;
7119   if (VF.isVector() && VectorTy->isVectorTy()) {
7120     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
7121     if (NumParts)
7122       TypeNotScalarized = NumParts < VF.getKnownMinValue();
7123     else
7124       C = InstructionCost::getInvalid();
7125   }
7126   return VectorizationCostTy(C, TypeNotScalarized);
7127 }
7128 
7129 InstructionCost
7130 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7131                                                      ElementCount VF) const {
7132 
7133   // There is no mechanism yet to create a scalable scalarization loop,
7134   // so this is currently Invalid.
7135   if (VF.isScalable())
7136     return InstructionCost::getInvalid();
7137 
7138   if (VF.isScalar())
7139     return 0;
7140 
7141   InstructionCost Cost = 0;
7142   Type *RetTy = ToVectorTy(I->getType(), VF);
7143   if (!RetTy->isVoidTy() &&
7144       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7145     Cost += TTI.getScalarizationOverhead(
7146         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
7147         false);
7148 
7149   // Some targets keep addresses scalar.
7150   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7151     return Cost;
7152 
7153   // Some targets support efficient element stores.
7154   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7155     return Cost;
7156 
7157   // Collect operands to consider.
7158   CallInst *CI = dyn_cast<CallInst>(I);
7159   Instruction::op_range Ops = CI ? CI->args() : I->operands();
7160 
7161   // Skip operands that do not require extraction/scalarization and do not incur
7162   // any overhead.
7163   SmallVector<Type *> Tys;
7164   for (auto *V : filterExtractingOperands(Ops, VF))
7165     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7166   return Cost + TTI.getOperandsScalarizationOverhead(
7167                     filterExtractingOperands(Ops, VF), Tys);
7168 }
7169 
7170 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7171   if (VF.isScalar())
7172     return;
7173   NumPredStores = 0;
7174   for (BasicBlock *BB : TheLoop->blocks()) {
7175     // For each instruction in the old loop.
7176     for (Instruction &I : *BB) {
7177       Value *Ptr =  getLoadStorePointerOperand(&I);
7178       if (!Ptr)
7179         continue;
7180 
7181       // TODO: We should generate better code and update the cost model for
7182       // predicated uniform stores. Today they are treated as any other
7183       // predicated store (see added test cases in
7184       // invariant-store-vectorization.ll).
7185       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7186         NumPredStores++;
7187 
7188       if (Legal->isUniformMemOp(I)) {
7189         // TODO: Avoid replicating loads and stores instead of
7190         // relying on instcombine to remove them.
7191         // Load: Scalar load + broadcast
7192         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7193         InstructionCost Cost;
7194         if (isa<StoreInst>(&I) && VF.isScalable() &&
7195             isLegalGatherOrScatter(&I)) {
7196           Cost = getGatherScatterCost(&I, VF);
7197           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7198         } else {
7199           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7200                  "Cannot yet scalarize uniform stores");
7201           Cost = getUniformMemOpCost(&I, VF);
7202           setWideningDecision(&I, VF, CM_Scalarize, Cost);
7203         }
7204         continue;
7205       }
7206 
7207       // We assume that widening is the best solution when possible.
7208       if (memoryInstructionCanBeWidened(&I, VF)) {
7209         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7210         int ConsecutiveStride = Legal->isConsecutivePtr(
7211             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
7212         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7213                "Expected consecutive stride.");
7214         InstWidening Decision =
7215             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7216         setWideningDecision(&I, VF, Decision, Cost);
7217         continue;
7218       }
7219 
7220       // Choose between Interleaving, Gather/Scatter or Scalarization.
7221       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7222       unsigned NumAccesses = 1;
7223       if (isAccessInterleaved(&I)) {
7224         auto Group = getInterleavedAccessGroup(&I);
7225         assert(Group && "Fail to get an interleaved access group.");
7226 
7227         // Make one decision for the whole group.
7228         if (getWideningDecision(&I, VF) != CM_Unknown)
7229           continue;
7230 
7231         NumAccesses = Group->getNumMembers();
7232         if (interleavedAccessCanBeWidened(&I, VF))
7233           InterleaveCost = getInterleaveGroupCost(&I, VF);
7234       }
7235 
7236       InstructionCost GatherScatterCost =
7237           isLegalGatherOrScatter(&I)
7238               ? getGatherScatterCost(&I, VF) * NumAccesses
7239               : InstructionCost::getInvalid();
7240 
7241       InstructionCost ScalarizationCost =
7242           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7243 
7244       // Choose better solution for the current VF,
7245       // write down this decision and use it during vectorization.
7246       InstructionCost Cost;
7247       InstWidening Decision;
7248       if (InterleaveCost <= GatherScatterCost &&
7249           InterleaveCost < ScalarizationCost) {
7250         Decision = CM_Interleave;
7251         Cost = InterleaveCost;
7252       } else if (GatherScatterCost < ScalarizationCost) {
7253         Decision = CM_GatherScatter;
7254         Cost = GatherScatterCost;
7255       } else {
7256         Decision = CM_Scalarize;
7257         Cost = ScalarizationCost;
7258       }
7259       // If the instructions belongs to an interleave group, the whole group
7260       // receives the same decision. The whole group receives the cost, but
7261       // the cost will actually be assigned to one instruction.
7262       if (auto Group = getInterleavedAccessGroup(&I))
7263         setWideningDecision(Group, VF, Decision, Cost);
7264       else
7265         setWideningDecision(&I, VF, Decision, Cost);
7266     }
7267   }
7268 
7269   // Make sure that any load of address and any other address computation
7270   // remains scalar unless there is gather/scatter support. This avoids
7271   // inevitable extracts into address registers, and also has the benefit of
7272   // activating LSR more, since that pass can't optimize vectorized
7273   // addresses.
7274   if (TTI.prefersVectorizedAddressing())
7275     return;
7276 
7277   // Start with all scalar pointer uses.
7278   SmallPtrSet<Instruction *, 8> AddrDefs;
7279   for (BasicBlock *BB : TheLoop->blocks())
7280     for (Instruction &I : *BB) {
7281       Instruction *PtrDef =
7282         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7283       if (PtrDef && TheLoop->contains(PtrDef) &&
7284           getWideningDecision(&I, VF) != CM_GatherScatter)
7285         AddrDefs.insert(PtrDef);
7286     }
7287 
7288   // Add all instructions used to generate the addresses.
7289   SmallVector<Instruction *, 4> Worklist;
7290   append_range(Worklist, AddrDefs);
7291   while (!Worklist.empty()) {
7292     Instruction *I = Worklist.pop_back_val();
7293     for (auto &Op : I->operands())
7294       if (auto *InstOp = dyn_cast<Instruction>(Op))
7295         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7296             AddrDefs.insert(InstOp).second)
7297           Worklist.push_back(InstOp);
7298   }
7299 
7300   for (auto *I : AddrDefs) {
7301     if (isa<LoadInst>(I)) {
7302       // Setting the desired widening decision should ideally be handled in
7303       // by cost functions, but since this involves the task of finding out
7304       // if the loaded register is involved in an address computation, it is
7305       // instead changed here when we know this is the case.
7306       InstWidening Decision = getWideningDecision(I, VF);
7307       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7308         // Scalarize a widened load of address.
7309         setWideningDecision(
7310             I, VF, CM_Scalarize,
7311             (VF.getKnownMinValue() *
7312              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7313       else if (auto Group = getInterleavedAccessGroup(I)) {
7314         // Scalarize an interleave group of address loads.
7315         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7316           if (Instruction *Member = Group->getMember(I))
7317             setWideningDecision(
7318                 Member, VF, CM_Scalarize,
7319                 (VF.getKnownMinValue() *
7320                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7321         }
7322       }
7323     } else
7324       // Make sure I gets scalarized and a cost estimate without
7325       // scalarization overhead.
7326       ForcedScalars[VF].insert(I);
7327   }
7328 }
7329 
7330 InstructionCost
7331 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7332                                                Type *&VectorTy) {
7333   Type *RetTy = I->getType();
7334   if (canTruncateToMinimalBitwidth(I, VF))
7335     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7336   auto SE = PSE.getSE();
7337   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7338 
7339   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7340                                                 ElementCount VF) -> bool {
7341     if (VF.isScalar())
7342       return true;
7343 
7344     auto Scalarized = InstsToScalarize.find(VF);
7345     assert(Scalarized != InstsToScalarize.end() &&
7346            "VF not yet analyzed for scalarization profitability");
7347     return !Scalarized->second.count(I) &&
7348            llvm::all_of(I->users(), [&](User *U) {
7349              auto *UI = cast<Instruction>(U);
7350              return !Scalarized->second.count(UI);
7351            });
7352   };
7353   (void) hasSingleCopyAfterVectorization;
7354 
7355   if (isScalarAfterVectorization(I, VF)) {
7356     // With the exception of GEPs and PHIs, after scalarization there should
7357     // only be one copy of the instruction generated in the loop. This is
7358     // because the VF is either 1, or any instructions that need scalarizing
7359     // have already been dealt with by the the time we get here. As a result,
7360     // it means we don't have to multiply the instruction cost by VF.
7361     assert(I->getOpcode() == Instruction::GetElementPtr ||
7362            I->getOpcode() == Instruction::PHI ||
7363            (I->getOpcode() == Instruction::BitCast &&
7364             I->getType()->isPointerTy()) ||
7365            hasSingleCopyAfterVectorization(I, VF));
7366     VectorTy = RetTy;
7367   } else
7368     VectorTy = ToVectorTy(RetTy, VF);
7369 
7370   // TODO: We need to estimate the cost of intrinsic calls.
7371   switch (I->getOpcode()) {
7372   case Instruction::GetElementPtr:
7373     // We mark this instruction as zero-cost because the cost of GEPs in
7374     // vectorized code depends on whether the corresponding memory instruction
7375     // is scalarized or not. Therefore, we handle GEPs with the memory
7376     // instruction cost.
7377     return 0;
7378   case Instruction::Br: {
7379     // In cases of scalarized and predicated instructions, there will be VF
7380     // predicated blocks in the vectorized loop. Each branch around these
7381     // blocks requires also an extract of its vector compare i1 element.
7382     bool ScalarPredicatedBB = false;
7383     BranchInst *BI = cast<BranchInst>(I);
7384     if (VF.isVector() && BI->isConditional() &&
7385         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7386          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7387       ScalarPredicatedBB = true;
7388 
7389     if (ScalarPredicatedBB) {
7390       // Not possible to scalarize scalable vector with predicated instructions.
7391       if (VF.isScalable())
7392         return InstructionCost::getInvalid();
7393       // Return cost for branches around scalarized and predicated blocks.
7394       auto *Vec_i1Ty =
7395           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7396       return (
7397           TTI.getScalarizationOverhead(
7398               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7399           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7400     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7401       // The back-edge branch will remain, as will all scalar branches.
7402       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7403     else
7404       // This branch will be eliminated by if-conversion.
7405       return 0;
7406     // Note: We currently assume zero cost for an unconditional branch inside
7407     // a predicated block since it will become a fall-through, although we
7408     // may decide in the future to call TTI for all branches.
7409   }
7410   case Instruction::PHI: {
7411     auto *Phi = cast<PHINode>(I);
7412 
7413     // First-order recurrences are replaced by vector shuffles inside the loop.
7414     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7415     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7416       return TTI.getShuffleCost(
7417           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7418           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7419 
7420     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7421     // converted into select instructions. We require N - 1 selects per phi
7422     // node, where N is the number of incoming values.
7423     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7424       return (Phi->getNumIncomingValues() - 1) *
7425              TTI.getCmpSelInstrCost(
7426                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7427                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7428                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7429 
7430     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7431   }
7432   case Instruction::UDiv:
7433   case Instruction::SDiv:
7434   case Instruction::URem:
7435   case Instruction::SRem:
7436     // If we have a predicated instruction, it may not be executed for each
7437     // vector lane. Get the scalarization cost and scale this amount by the
7438     // probability of executing the predicated block. If the instruction is not
7439     // predicated, we fall through to the next case.
7440     if (VF.isVector() && isScalarWithPredication(I)) {
7441       InstructionCost Cost = 0;
7442 
7443       // These instructions have a non-void type, so account for the phi nodes
7444       // that we will create. This cost is likely to be zero. The phi node
7445       // cost, if any, should be scaled by the block probability because it
7446       // models a copy at the end of each predicated block.
7447       Cost += VF.getKnownMinValue() *
7448               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7449 
7450       // The cost of the non-predicated instruction.
7451       Cost += VF.getKnownMinValue() *
7452               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7453 
7454       // The cost of insertelement and extractelement instructions needed for
7455       // scalarization.
7456       Cost += getScalarizationOverhead(I, VF);
7457 
7458       // Scale the cost by the probability of executing the predicated blocks.
7459       // This assumes the predicated block for each vector lane is equally
7460       // likely.
7461       return Cost / getReciprocalPredBlockProb();
7462     }
7463     LLVM_FALLTHROUGH;
7464   case Instruction::Add:
7465   case Instruction::FAdd:
7466   case Instruction::Sub:
7467   case Instruction::FSub:
7468   case Instruction::Mul:
7469   case Instruction::FMul:
7470   case Instruction::FDiv:
7471   case Instruction::FRem:
7472   case Instruction::Shl:
7473   case Instruction::LShr:
7474   case Instruction::AShr:
7475   case Instruction::And:
7476   case Instruction::Or:
7477   case Instruction::Xor: {
7478     // Since we will replace the stride by 1 the multiplication should go away.
7479     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7480       return 0;
7481 
7482     // Detect reduction patterns
7483     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7484       return *RedCost;
7485 
7486     // Certain instructions can be cheaper to vectorize if they have a constant
7487     // second vector operand. One example of this are shifts on x86.
7488     Value *Op2 = I->getOperand(1);
7489     TargetTransformInfo::OperandValueProperties Op2VP;
7490     TargetTransformInfo::OperandValueKind Op2VK =
7491         TTI.getOperandInfo(Op2, Op2VP);
7492     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7493       Op2VK = TargetTransformInfo::OK_UniformValue;
7494 
7495     SmallVector<const Value *, 4> Operands(I->operand_values());
7496     return TTI.getArithmeticInstrCost(
7497         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7498         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7499   }
7500   case Instruction::FNeg: {
7501     return TTI.getArithmeticInstrCost(
7502         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7503         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7504         TargetTransformInfo::OP_None, I->getOperand(0), I);
7505   }
7506   case Instruction::Select: {
7507     SelectInst *SI = cast<SelectInst>(I);
7508     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7509     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7510 
7511     const Value *Op0, *Op1;
7512     using namespace llvm::PatternMatch;
7513     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7514                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7515       // select x, y, false --> x & y
7516       // select x, true, y --> x | y
7517       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7518       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7519       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7520       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7521       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7522               Op1->getType()->getScalarSizeInBits() == 1);
7523 
7524       SmallVector<const Value *, 2> Operands{Op0, Op1};
7525       return TTI.getArithmeticInstrCost(
7526           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7527           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7528     }
7529 
7530     Type *CondTy = SI->getCondition()->getType();
7531     if (!ScalarCond)
7532       CondTy = VectorType::get(CondTy, VF);
7533 
7534     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7535     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7536       Pred = Cmp->getPredicate();
7537     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7538                                   CostKind, I);
7539   }
7540   case Instruction::ICmp:
7541   case Instruction::FCmp: {
7542     Type *ValTy = I->getOperand(0)->getType();
7543     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7544     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7545       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7546     VectorTy = ToVectorTy(ValTy, VF);
7547     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7548                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7549                                   I);
7550   }
7551   case Instruction::Store:
7552   case Instruction::Load: {
7553     ElementCount Width = VF;
7554     if (Width.isVector()) {
7555       InstWidening Decision = getWideningDecision(I, Width);
7556       assert(Decision != CM_Unknown &&
7557              "CM decision should be taken at this point");
7558       if (Decision == CM_Scalarize)
7559         Width = ElementCount::getFixed(1);
7560     }
7561     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7562     return getMemoryInstructionCost(I, VF);
7563   }
7564   case Instruction::BitCast:
7565     if (I->getType()->isPointerTy())
7566       return 0;
7567     LLVM_FALLTHROUGH;
7568   case Instruction::ZExt:
7569   case Instruction::SExt:
7570   case Instruction::FPToUI:
7571   case Instruction::FPToSI:
7572   case Instruction::FPExt:
7573   case Instruction::PtrToInt:
7574   case Instruction::IntToPtr:
7575   case Instruction::SIToFP:
7576   case Instruction::UIToFP:
7577   case Instruction::Trunc:
7578   case Instruction::FPTrunc: {
7579     // Computes the CastContextHint from a Load/Store instruction.
7580     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7581       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7582              "Expected a load or a store!");
7583 
7584       if (VF.isScalar() || !TheLoop->contains(I))
7585         return TTI::CastContextHint::Normal;
7586 
7587       switch (getWideningDecision(I, VF)) {
7588       case LoopVectorizationCostModel::CM_GatherScatter:
7589         return TTI::CastContextHint::GatherScatter;
7590       case LoopVectorizationCostModel::CM_Interleave:
7591         return TTI::CastContextHint::Interleave;
7592       case LoopVectorizationCostModel::CM_Scalarize:
7593       case LoopVectorizationCostModel::CM_Widen:
7594         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7595                                         : TTI::CastContextHint::Normal;
7596       case LoopVectorizationCostModel::CM_Widen_Reverse:
7597         return TTI::CastContextHint::Reversed;
7598       case LoopVectorizationCostModel::CM_Unknown:
7599         llvm_unreachable("Instr did not go through cost modelling?");
7600       }
7601 
7602       llvm_unreachable("Unhandled case!");
7603     };
7604 
7605     unsigned Opcode = I->getOpcode();
7606     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7607     // For Trunc, the context is the only user, which must be a StoreInst.
7608     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7609       if (I->hasOneUse())
7610         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7611           CCH = ComputeCCH(Store);
7612     }
7613     // For Z/Sext, the context is the operand, which must be a LoadInst.
7614     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7615              Opcode == Instruction::FPExt) {
7616       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7617         CCH = ComputeCCH(Load);
7618     }
7619 
7620     // We optimize the truncation of induction variables having constant
7621     // integer steps. The cost of these truncations is the same as the scalar
7622     // operation.
7623     if (isOptimizableIVTruncate(I, VF)) {
7624       auto *Trunc = cast<TruncInst>(I);
7625       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7626                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7627     }
7628 
7629     // Detect reduction patterns
7630     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7631       return *RedCost;
7632 
7633     Type *SrcScalarTy = I->getOperand(0)->getType();
7634     Type *SrcVecTy =
7635         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7636     if (canTruncateToMinimalBitwidth(I, VF)) {
7637       // This cast is going to be shrunk. This may remove the cast or it might
7638       // turn it into slightly different cast. For example, if MinBW == 16,
7639       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7640       //
7641       // Calculate the modified src and dest types.
7642       Type *MinVecTy = VectorTy;
7643       if (Opcode == Instruction::Trunc) {
7644         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7645         VectorTy =
7646             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7647       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7648         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7649         VectorTy =
7650             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7651       }
7652     }
7653 
7654     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7655   }
7656   case Instruction::Call: {
7657     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7658       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7659         return *RedCost;
7660     bool NeedToScalarize;
7661     CallInst *CI = cast<CallInst>(I);
7662     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7663     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7664       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7665       return std::min(CallCost, IntrinsicCost);
7666     }
7667     return CallCost;
7668   }
7669   case Instruction::ExtractValue:
7670     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7671   case Instruction::Alloca:
7672     // We cannot easily widen alloca to a scalable alloca, as
7673     // the result would need to be a vector of pointers.
7674     if (VF.isScalable())
7675       return InstructionCost::getInvalid();
7676     LLVM_FALLTHROUGH;
7677   default:
7678     // This opcode is unknown. Assume that it is the same as 'mul'.
7679     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7680   } // end of switch.
7681 }
7682 
7683 char LoopVectorize::ID = 0;
7684 
7685 static const char lv_name[] = "Loop Vectorization";
7686 
7687 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7688 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7689 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7690 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7691 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7692 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7693 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7694 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7695 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7696 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7697 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7698 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7699 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7700 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7701 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7702 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7703 
7704 namespace llvm {
7705 
7706 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7707 
7708 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7709                               bool VectorizeOnlyWhenForced) {
7710   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7711 }
7712 
7713 } // end namespace llvm
7714 
7715 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7716   // Check if the pointer operand of a load or store instruction is
7717   // consecutive.
7718   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7719     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7720   return false;
7721 }
7722 
7723 void LoopVectorizationCostModel::collectValuesToIgnore() {
7724   // Ignore ephemeral values.
7725   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7726 
7727   // Ignore type-promoting instructions we identified during reduction
7728   // detection.
7729   for (auto &Reduction : Legal->getReductionVars()) {
7730     const RecurrenceDescriptor &RedDes = Reduction.second;
7731     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7732     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7733   }
7734   // Ignore type-casting instructions we identified during induction
7735   // detection.
7736   for (auto &Induction : Legal->getInductionVars()) {
7737     const InductionDescriptor &IndDes = Induction.second;
7738     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7739     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7740   }
7741 }
7742 
7743 void LoopVectorizationCostModel::collectInLoopReductions() {
7744   for (auto &Reduction : Legal->getReductionVars()) {
7745     PHINode *Phi = Reduction.first;
7746     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7747 
7748     // We don't collect reductions that are type promoted (yet).
7749     if (RdxDesc.getRecurrenceType() != Phi->getType())
7750       continue;
7751 
7752     // If the target would prefer this reduction to happen "in-loop", then we
7753     // want to record it as such.
7754     unsigned Opcode = RdxDesc.getOpcode();
7755     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7756         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7757                                    TargetTransformInfo::ReductionFlags()))
7758       continue;
7759 
7760     // Check that we can correctly put the reductions into the loop, by
7761     // finding the chain of operations that leads from the phi to the loop
7762     // exit value.
7763     SmallVector<Instruction *, 4> ReductionOperations =
7764         RdxDesc.getReductionOpChain(Phi, TheLoop);
7765     bool InLoop = !ReductionOperations.empty();
7766     if (InLoop) {
7767       InLoopReductionChains[Phi] = ReductionOperations;
7768       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7769       Instruction *LastChain = Phi;
7770       for (auto *I : ReductionOperations) {
7771         InLoopReductionImmediateChains[I] = LastChain;
7772         LastChain = I;
7773       }
7774     }
7775     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7776                       << " reduction for phi: " << *Phi << "\n");
7777   }
7778 }
7779 
7780 // TODO: we could return a pair of values that specify the max VF and
7781 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7782 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7783 // doesn't have a cost model that can choose which plan to execute if
7784 // more than one is generated.
7785 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7786                                  LoopVectorizationCostModel &CM) {
7787   unsigned WidestType;
7788   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7789   return WidestVectorRegBits / WidestType;
7790 }
7791 
7792 VectorizationFactor
7793 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7794   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7795   ElementCount VF = UserVF;
7796   // Outer loop handling: They may require CFG and instruction level
7797   // transformations before even evaluating whether vectorization is profitable.
7798   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7799   // the vectorization pipeline.
7800   if (!OrigLoop->isInnermost()) {
7801     // If the user doesn't provide a vectorization factor, determine a
7802     // reasonable one.
7803     if (UserVF.isZero()) {
7804       VF = ElementCount::getFixed(determineVPlanVF(
7805           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7806               .getFixedSize(),
7807           CM));
7808       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7809 
7810       // Make sure we have a VF > 1 for stress testing.
7811       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7812         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7813                           << "overriding computed VF.\n");
7814         VF = ElementCount::getFixed(4);
7815       }
7816     }
7817     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7818     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7819            "VF needs to be a power of two");
7820     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7821                       << "VF " << VF << " to build VPlans.\n");
7822     buildVPlans(VF, VF);
7823 
7824     // For VPlan build stress testing, we bail out after VPlan construction.
7825     if (VPlanBuildStressTest)
7826       return VectorizationFactor::Disabled();
7827 
7828     return {VF, 0 /*Cost*/};
7829   }
7830 
7831   LLVM_DEBUG(
7832       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7833                 "VPlan-native path.\n");
7834   return VectorizationFactor::Disabled();
7835 }
7836 
7837 Optional<VectorizationFactor>
7838 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7839   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7840   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7841   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
7842     return None;
7843 
7844   // Invalidate interleave groups if all blocks of loop will be predicated.
7845   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7846       !useMaskedInterleavedAccesses(*TTI)) {
7847     LLVM_DEBUG(
7848         dbgs()
7849         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7850            "which requires masked-interleaved support.\n");
7851     if (CM.InterleaveInfo.invalidateGroups())
7852       // Invalidating interleave groups also requires invalidating all decisions
7853       // based on them, which includes widening decisions and uniform and scalar
7854       // values.
7855       CM.invalidateCostModelingDecisions();
7856   }
7857 
7858   ElementCount MaxUserVF =
7859       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7860   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7861   if (!UserVF.isZero() && UserVFIsLegal) {
7862     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7863            "VF needs to be a power of two");
7864     // Collect the instructions (and their associated costs) that will be more
7865     // profitable to scalarize.
7866     if (CM.selectUserVectorizationFactor(UserVF)) {
7867       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7868       CM.collectInLoopReductions();
7869       buildVPlansWithVPRecipes(UserVF, UserVF);
7870       LLVM_DEBUG(printPlans(dbgs()));
7871       return {{UserVF, 0}};
7872     } else
7873       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7874                               "InvalidCost", ORE, OrigLoop);
7875   }
7876 
7877   // Populate the set of Vectorization Factor Candidates.
7878   ElementCountSet VFCandidates;
7879   for (auto VF = ElementCount::getFixed(1);
7880        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7881     VFCandidates.insert(VF);
7882   for (auto VF = ElementCount::getScalable(1);
7883        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7884     VFCandidates.insert(VF);
7885 
7886   for (const auto &VF : VFCandidates) {
7887     // Collect Uniform and Scalar instructions after vectorization with VF.
7888     CM.collectUniformsAndScalars(VF);
7889 
7890     // Collect the instructions (and their associated costs) that will be more
7891     // profitable to scalarize.
7892     if (VF.isVector())
7893       CM.collectInstsToScalarize(VF);
7894   }
7895 
7896   CM.collectInLoopReductions();
7897   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7898   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7899 
7900   LLVM_DEBUG(printPlans(dbgs()));
7901   if (!MaxFactors.hasVector())
7902     return VectorizationFactor::Disabled();
7903 
7904   // Select the optimal vectorization factor.
7905   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7906 
7907   // Check if it is profitable to vectorize with runtime checks.
7908   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7909   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7910     bool PragmaThresholdReached =
7911         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7912     bool ThresholdReached =
7913         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7914     if ((ThresholdReached && !Hints.allowReordering()) ||
7915         PragmaThresholdReached) {
7916       ORE->emit([&]() {
7917         return OptimizationRemarkAnalysisAliasing(
7918                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7919                    OrigLoop->getHeader())
7920                << "loop not vectorized: cannot prove it is safe to reorder "
7921                   "memory operations";
7922       });
7923       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7924       Hints.emitRemarkWithHints();
7925       return VectorizationFactor::Disabled();
7926     }
7927   }
7928   return SelectedVF;
7929 }
7930 
7931 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7932   assert(count_if(VPlans,
7933                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7934              1 &&
7935          "Best VF has not a single VPlan.");
7936 
7937   for (const VPlanPtr &Plan : VPlans) {
7938     if (Plan->hasVF(VF))
7939       return *Plan.get();
7940   }
7941   llvm_unreachable("No plan found!");
7942 }
7943 
7944 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7945                                            VPlan &BestVPlan,
7946                                            InnerLoopVectorizer &ILV,
7947                                            DominatorTree *DT) {
7948   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7949                     << '\n');
7950 
7951   // Perform the actual loop transformation.
7952 
7953   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7954   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7955   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7956   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7957   State.CanonicalIV = ILV.Induction;
7958   ILV.collectPoisonGeneratingRecipes(State);
7959 
7960   ILV.printDebugTracesAtStart();
7961 
7962   //===------------------------------------------------===//
7963   //
7964   // Notice: any optimization or new instruction that go
7965   // into the code below should also be implemented in
7966   // the cost-model.
7967   //
7968   //===------------------------------------------------===//
7969 
7970   // 2. Copy and widen instructions from the old loop into the new loop.
7971   BestVPlan.execute(&State);
7972 
7973   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7974   //    predication, updating analyses.
7975   ILV.fixVectorizedLoop(State);
7976 
7977   ILV.printDebugTracesAtEnd();
7978 }
7979 
7980 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7981 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7982   for (const auto &Plan : VPlans)
7983     if (PrintVPlansInDotFormat)
7984       Plan->printDOT(O);
7985     else
7986       Plan->print(O);
7987 }
7988 #endif
7989 
7990 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7991     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7992 
7993   // We create new control-flow for the vectorized loop, so the original exit
7994   // conditions will be dead after vectorization if it's only used by the
7995   // terminator
7996   SmallVector<BasicBlock*> ExitingBlocks;
7997   OrigLoop->getExitingBlocks(ExitingBlocks);
7998   for (auto *BB : ExitingBlocks) {
7999     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8000     if (!Cmp || !Cmp->hasOneUse())
8001       continue;
8002 
8003     // TODO: we should introduce a getUniqueExitingBlocks on Loop
8004     if (!DeadInstructions.insert(Cmp).second)
8005       continue;
8006 
8007     // The operands of the icmp is often a dead trunc, used by IndUpdate.
8008     // TODO: can recurse through operands in general
8009     for (Value *Op : Cmp->operands()) {
8010       if (isa<TruncInst>(Op) && Op->hasOneUse())
8011           DeadInstructions.insert(cast<Instruction>(Op));
8012     }
8013   }
8014 
8015   // We create new "steps" for induction variable updates to which the original
8016   // induction variables map. An original update instruction will be dead if
8017   // all its users except the induction variable are dead.
8018   auto *Latch = OrigLoop->getLoopLatch();
8019   for (auto &Induction : Legal->getInductionVars()) {
8020     PHINode *Ind = Induction.first;
8021     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8022 
8023     // If the tail is to be folded by masking, the primary induction variable,
8024     // if exists, isn't dead: it will be used for masking. Don't kill it.
8025     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8026       continue;
8027 
8028     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8029           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8030         }))
8031       DeadInstructions.insert(IndUpdate);
8032   }
8033 }
8034 
8035 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8036 
8037 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8038 
8039 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
8040                                         Value *Step,
8041                                         Instruction::BinaryOps BinOp) {
8042   // When unrolling and the VF is 1, we only need to add a simple scalar.
8043   Type *Ty = Val->getType();
8044   assert(!Ty->isVectorTy() && "Val must be a scalar");
8045 
8046   if (Ty->isFloatingPointTy()) {
8047     // Floating-point operations inherit FMF via the builder's flags.
8048     Value *MulOp = Builder.CreateFMul(StartIdx, Step);
8049     return Builder.CreateBinOp(BinOp, Val, MulOp);
8050   }
8051   return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
8052 }
8053 
8054 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8055   SmallVector<Metadata *, 4> MDs;
8056   // Reserve first location for self reference to the LoopID metadata node.
8057   MDs.push_back(nullptr);
8058   bool IsUnrollMetadata = false;
8059   MDNode *LoopID = L->getLoopID();
8060   if (LoopID) {
8061     // First find existing loop unrolling disable metadata.
8062     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8063       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8064       if (MD) {
8065         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8066         IsUnrollMetadata =
8067             S && S->getString().startswith("llvm.loop.unroll.disable");
8068       }
8069       MDs.push_back(LoopID->getOperand(i));
8070     }
8071   }
8072 
8073   if (!IsUnrollMetadata) {
8074     // Add runtime unroll disable metadata.
8075     LLVMContext &Context = L->getHeader()->getContext();
8076     SmallVector<Metadata *, 1> DisableOperands;
8077     DisableOperands.push_back(
8078         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8079     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8080     MDs.push_back(DisableNode);
8081     MDNode *NewLoopID = MDNode::get(Context, MDs);
8082     // Set operand 0 to refer to the loop id itself.
8083     NewLoopID->replaceOperandWith(0, NewLoopID);
8084     L->setLoopID(NewLoopID);
8085   }
8086 }
8087 
8088 //===--------------------------------------------------------------------===//
8089 // EpilogueVectorizerMainLoop
8090 //===--------------------------------------------------------------------===//
8091 
8092 /// This function is partially responsible for generating the control flow
8093 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8094 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8095   MDNode *OrigLoopID = OrigLoop->getLoopID();
8096   Loop *Lp = createVectorLoopSkeleton("");
8097 
8098   // Generate the code to check the minimum iteration count of the vector
8099   // epilogue (see below).
8100   EPI.EpilogueIterationCountCheck =
8101       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8102   EPI.EpilogueIterationCountCheck->setName("iter.check");
8103 
8104   // Generate the code to check any assumptions that we've made for SCEV
8105   // expressions.
8106   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8107 
8108   // Generate the code that checks at runtime if arrays overlap. We put the
8109   // checks into a separate block to make the more common case of few elements
8110   // faster.
8111   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8112 
8113   // Generate the iteration count check for the main loop, *after* the check
8114   // for the epilogue loop, so that the path-length is shorter for the case
8115   // that goes directly through the vector epilogue. The longer-path length for
8116   // the main loop is compensated for, by the gain from vectorizing the larger
8117   // trip count. Note: the branch will get updated later on when we vectorize
8118   // the epilogue.
8119   EPI.MainLoopIterationCountCheck =
8120       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8121 
8122   // Generate the induction variable.
8123   OldInduction = Legal->getPrimaryInduction();
8124   Type *IdxTy = Legal->getWidestInductionType();
8125   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8126 
8127   IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
8128   Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
8129   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8130   EPI.VectorTripCount = CountRoundDown;
8131   Induction =
8132       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8133                               getDebugLocFromInstOrOperands(OldInduction));
8134 
8135   // Skip induction resume value creation here because they will be created in
8136   // the second pass. If we created them here, they wouldn't be used anyway,
8137   // because the vplan in the second pass still contains the inductions from the
8138   // original loop.
8139 
8140   return completeLoopSkeleton(Lp, OrigLoopID);
8141 }
8142 
8143 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8144   LLVM_DEBUG({
8145     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8146            << "Main Loop VF:" << EPI.MainLoopVF
8147            << ", Main Loop UF:" << EPI.MainLoopUF
8148            << ", Epilogue Loop VF:" << EPI.EpilogueVF
8149            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8150   });
8151 }
8152 
8153 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8154   DEBUG_WITH_TYPE(VerboseDebug, {
8155     dbgs() << "intermediate fn:\n"
8156            << *OrigLoop->getHeader()->getParent() << "\n";
8157   });
8158 }
8159 
8160 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8161     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8162   assert(L && "Expected valid Loop.");
8163   assert(Bypass && "Expected valid bypass basic block.");
8164   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8165   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8166   Value *Count = getOrCreateTripCount(L);
8167   // Reuse existing vector loop preheader for TC checks.
8168   // Note that new preheader block is generated for vector loop.
8169   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8170   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8171 
8172   // Generate code to check if the loop's trip count is less than VF * UF of the
8173   // main vector loop.
8174   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8175       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8176 
8177   Value *CheckMinIters = Builder.CreateICmp(
8178       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
8179       "min.iters.check");
8180 
8181   if (!ForEpilogue)
8182     TCCheckBlock->setName("vector.main.loop.iter.check");
8183 
8184   // Create new preheader for vector loop.
8185   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8186                                    DT, LI, nullptr, "vector.ph");
8187 
8188   if (ForEpilogue) {
8189     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8190                                  DT->getNode(Bypass)->getIDom()) &&
8191            "TC check is expected to dominate Bypass");
8192 
8193     // Update dominator for Bypass & LoopExit.
8194     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8195     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8196       // For loops with multiple exits, there's no edge from the middle block
8197       // to exit blocks (as the epilogue must run) and thus no need to update
8198       // the immediate dominator of the exit blocks.
8199       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8200 
8201     LoopBypassBlocks.push_back(TCCheckBlock);
8202 
8203     // Save the trip count so we don't have to regenerate it in the
8204     // vec.epilog.iter.check. This is safe to do because the trip count
8205     // generated here dominates the vector epilog iter check.
8206     EPI.TripCount = Count;
8207   }
8208 
8209   ReplaceInstWithInst(
8210       TCCheckBlock->getTerminator(),
8211       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8212 
8213   return TCCheckBlock;
8214 }
8215 
8216 //===--------------------------------------------------------------------===//
8217 // EpilogueVectorizerEpilogueLoop
8218 //===--------------------------------------------------------------------===//
8219 
8220 /// This function is partially responsible for generating the control flow
8221 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8222 BasicBlock *
8223 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8224   MDNode *OrigLoopID = OrigLoop->getLoopID();
8225   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8226 
8227   // Now, compare the remaining count and if there aren't enough iterations to
8228   // execute the vectorized epilogue skip to the scalar part.
8229   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8230   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8231   LoopVectorPreHeader =
8232       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8233                  LI, nullptr, "vec.epilog.ph");
8234   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8235                                           VecEpilogueIterationCountCheck);
8236 
8237   // Adjust the control flow taking the state info from the main loop
8238   // vectorization into account.
8239   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8240          "expected this to be saved from the previous pass.");
8241   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8242       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8243 
8244   DT->changeImmediateDominator(LoopVectorPreHeader,
8245                                EPI.MainLoopIterationCountCheck);
8246 
8247   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8248       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8249 
8250   if (EPI.SCEVSafetyCheck)
8251     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8252         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8253   if (EPI.MemSafetyCheck)
8254     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8255         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8256 
8257   DT->changeImmediateDominator(
8258       VecEpilogueIterationCountCheck,
8259       VecEpilogueIterationCountCheck->getSinglePredecessor());
8260 
8261   DT->changeImmediateDominator(LoopScalarPreHeader,
8262                                EPI.EpilogueIterationCountCheck);
8263   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8264     // If there is an epilogue which must run, there's no edge from the
8265     // middle block to exit blocks  and thus no need to update the immediate
8266     // dominator of the exit blocks.
8267     DT->changeImmediateDominator(LoopExitBlock,
8268                                  EPI.EpilogueIterationCountCheck);
8269 
8270   // Keep track of bypass blocks, as they feed start values to the induction
8271   // phis in the scalar loop preheader.
8272   if (EPI.SCEVSafetyCheck)
8273     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8274   if (EPI.MemSafetyCheck)
8275     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8276   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8277 
8278   // Generate a resume induction for the vector epilogue and put it in the
8279   // vector epilogue preheader
8280   Type *IdxTy = Legal->getWidestInductionType();
8281   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8282                                          LoopVectorPreHeader->getFirstNonPHI());
8283   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8284   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8285                            EPI.MainLoopIterationCountCheck);
8286 
8287   // Generate the induction variable.
8288   OldInduction = Legal->getPrimaryInduction();
8289   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8290   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8291   Value *StartIdx = EPResumeVal;
8292   Induction =
8293       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8294                               getDebugLocFromInstOrOperands(OldInduction));
8295 
8296   // Generate induction resume values. These variables save the new starting
8297   // indexes for the scalar loop. They are used to test if there are any tail
8298   // iterations left once the vector loop has completed.
8299   // Note that when the vectorized epilogue is skipped due to iteration count
8300   // check, then the resume value for the induction variable comes from
8301   // the trip count of the main vector loop, hence passing the AdditionalBypass
8302   // argument.
8303   createInductionResumeValues(Lp, CountRoundDown,
8304                               {VecEpilogueIterationCountCheck,
8305                                EPI.VectorTripCount} /* AdditionalBypass */);
8306 
8307   AddRuntimeUnrollDisableMetaData(Lp);
8308   return completeLoopSkeleton(Lp, OrigLoopID);
8309 }
8310 
8311 BasicBlock *
8312 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8313     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8314 
8315   assert(EPI.TripCount &&
8316          "Expected trip count to have been safed in the first pass.");
8317   assert(
8318       (!isa<Instruction>(EPI.TripCount) ||
8319        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8320       "saved trip count does not dominate insertion point.");
8321   Value *TC = EPI.TripCount;
8322   IRBuilder<> Builder(Insert->getTerminator());
8323   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8324 
8325   // Generate code to check if the loop's trip count is less than VF * UF of the
8326   // vector epilogue loop.
8327   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8328       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8329 
8330   Value *CheckMinIters =
8331       Builder.CreateICmp(P, Count,
8332                          createStepForVF(Builder, Count->getType(),
8333                                          EPI.EpilogueVF, EPI.EpilogueUF),
8334                          "min.epilog.iters.check");
8335 
8336   ReplaceInstWithInst(
8337       Insert->getTerminator(),
8338       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8339 
8340   LoopBypassBlocks.push_back(Insert);
8341   return Insert;
8342 }
8343 
8344 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8345   LLVM_DEBUG({
8346     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8347            << "Epilogue Loop VF:" << EPI.EpilogueVF
8348            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8349   });
8350 }
8351 
8352 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8353   DEBUG_WITH_TYPE(VerboseDebug, {
8354     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8355   });
8356 }
8357 
8358 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8359     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8360   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8361   bool PredicateAtRangeStart = Predicate(Range.Start);
8362 
8363   for (ElementCount TmpVF = Range.Start * 2;
8364        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8365     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8366       Range.End = TmpVF;
8367       break;
8368     }
8369 
8370   return PredicateAtRangeStart;
8371 }
8372 
8373 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8374 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8375 /// of VF's starting at a given VF and extending it as much as possible. Each
8376 /// vectorization decision can potentially shorten this sub-range during
8377 /// buildVPlan().
8378 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8379                                            ElementCount MaxVF) {
8380   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8381   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8382     VFRange SubRange = {VF, MaxVFPlusOne};
8383     VPlans.push_back(buildVPlan(SubRange));
8384     VF = SubRange.End;
8385   }
8386 }
8387 
8388 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8389                                          VPlanPtr &Plan) {
8390   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8391 
8392   // Look for cached value.
8393   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8394   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8395   if (ECEntryIt != EdgeMaskCache.end())
8396     return ECEntryIt->second;
8397 
8398   VPValue *SrcMask = createBlockInMask(Src, Plan);
8399 
8400   // The terminator has to be a branch inst!
8401   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8402   assert(BI && "Unexpected terminator found");
8403 
8404   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8405     return EdgeMaskCache[Edge] = SrcMask;
8406 
8407   // If source is an exiting block, we know the exit edge is dynamically dead
8408   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8409   // adding uses of an otherwise potentially dead instruction.
8410   if (OrigLoop->isLoopExiting(Src))
8411     return EdgeMaskCache[Edge] = SrcMask;
8412 
8413   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8414   assert(EdgeMask && "No Edge Mask found for condition");
8415 
8416   if (BI->getSuccessor(0) != Dst)
8417     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8418 
8419   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8420     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8421     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8422     // The select version does not introduce new UB if SrcMask is false and
8423     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8424     VPValue *False = Plan->getOrAddVPValue(
8425         ConstantInt::getFalse(BI->getCondition()->getType()));
8426     EdgeMask =
8427         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8428   }
8429 
8430   return EdgeMaskCache[Edge] = EdgeMask;
8431 }
8432 
8433 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8434   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8435 
8436   // Look for cached value.
8437   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8438   if (BCEntryIt != BlockMaskCache.end())
8439     return BCEntryIt->second;
8440 
8441   // All-one mask is modelled as no-mask following the convention for masked
8442   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8443   VPValue *BlockMask = nullptr;
8444 
8445   if (OrigLoop->getHeader() == BB) {
8446     if (!CM.blockNeedsPredicationForAnyReason(BB))
8447       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8448 
8449     // Introduce the early-exit compare IV <= BTC to form header block mask.
8450     // This is used instead of IV < TC because TC may wrap, unlike BTC.
8451     // Start by constructing the desired canonical IV in the header block.
8452     VPValue *IV = nullptr;
8453     if (Legal->getPrimaryInduction())
8454       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8455     else {
8456       VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
8457       auto *IVRecipe = new VPWidenCanonicalIVRecipe();
8458       HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi());
8459       IV = IVRecipe;
8460     }
8461 
8462     // Create the block in mask as the first non-phi instruction in the block.
8463     VPBuilder::InsertPointGuard Guard(Builder);
8464     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8465     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8466 
8467     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8468     bool TailFolded = !CM.isScalarEpilogueAllowed();
8469 
8470     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
8471       // While ActiveLaneMask is a binary op that consumes the loop tripcount
8472       // as a second argument, we only pass the IV here and extract the
8473       // tripcount from the transform state where codegen of the VP instructions
8474       // happen.
8475       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8476     } else {
8477       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8478     }
8479     return BlockMaskCache[BB] = BlockMask;
8480   }
8481 
8482   // This is the block mask. We OR all incoming edges.
8483   for (auto *Predecessor : predecessors(BB)) {
8484     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8485     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8486       return BlockMaskCache[BB] = EdgeMask;
8487 
8488     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8489       BlockMask = EdgeMask;
8490       continue;
8491     }
8492 
8493     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8494   }
8495 
8496   return BlockMaskCache[BB] = BlockMask;
8497 }
8498 
8499 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8500                                                 ArrayRef<VPValue *> Operands,
8501                                                 VFRange &Range,
8502                                                 VPlanPtr &Plan) {
8503   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8504          "Must be called with either a load or store");
8505 
8506   auto willWiden = [&](ElementCount VF) -> bool {
8507     if (VF.isScalar())
8508       return false;
8509     LoopVectorizationCostModel::InstWidening Decision =
8510         CM.getWideningDecision(I, VF);
8511     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8512            "CM decision should be taken at this point.");
8513     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8514       return true;
8515     if (CM.isScalarAfterVectorization(I, VF) ||
8516         CM.isProfitableToScalarize(I, VF))
8517       return false;
8518     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8519   };
8520 
8521   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8522     return nullptr;
8523 
8524   VPValue *Mask = nullptr;
8525   if (Legal->isMaskRequired(I))
8526     Mask = createBlockInMask(I->getParent(), Plan);
8527 
8528   // Determine if the pointer operand of the access is either consecutive or
8529   // reverse consecutive.
8530   LoopVectorizationCostModel::InstWidening Decision =
8531       CM.getWideningDecision(I, Range.Start);
8532   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8533   bool Consecutive =
8534       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8535 
8536   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8537     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8538                                               Consecutive, Reverse);
8539 
8540   StoreInst *Store = cast<StoreInst>(I);
8541   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8542                                             Mask, Consecutive, Reverse);
8543 }
8544 
8545 VPWidenIntOrFpInductionRecipe *
8546 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8547                                            ArrayRef<VPValue *> Operands) const {
8548   // Check if this is an integer or fp induction. If so, build the recipe that
8549   // produces its scalar and vector values.
8550   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
8551     assert(II->getStartValue() ==
8552            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8553     return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
8554   }
8555 
8556   return nullptr;
8557 }
8558 
8559 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8560     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8561     VPlan &Plan) const {
8562   // Optimize the special case where the source is a constant integer
8563   // induction variable. Notice that we can only optimize the 'trunc' case
8564   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8565   // (c) other casts depend on pointer size.
8566 
8567   // Determine whether \p K is a truncation based on an induction variable that
8568   // can be optimized.
8569   auto isOptimizableIVTruncate =
8570       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8571     return [=](ElementCount VF) -> bool {
8572       return CM.isOptimizableIVTruncate(K, VF);
8573     };
8574   };
8575 
8576   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8577           isOptimizableIVTruncate(I), Range)) {
8578 
8579     auto *Phi = cast<PHINode>(I->getOperand(0));
8580     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8581     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8582     return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
8583   }
8584   return nullptr;
8585 }
8586 
8587 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8588                                                 ArrayRef<VPValue *> Operands,
8589                                                 VPlanPtr &Plan) {
8590   // If all incoming values are equal, the incoming VPValue can be used directly
8591   // instead of creating a new VPBlendRecipe.
8592   VPValue *FirstIncoming = Operands[0];
8593   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8594         return FirstIncoming == Inc;
8595       })) {
8596     return Operands[0];
8597   }
8598 
8599   // We know that all PHIs in non-header blocks are converted into selects, so
8600   // we don't have to worry about the insertion order and we can just use the
8601   // builder. At this point we generate the predication tree. There may be
8602   // duplications since this is a simple recursive scan, but future
8603   // optimizations will clean it up.
8604   SmallVector<VPValue *, 2> OperandsWithMask;
8605   unsigned NumIncoming = Phi->getNumIncomingValues();
8606 
8607   for (unsigned In = 0; In < NumIncoming; In++) {
8608     VPValue *EdgeMask =
8609       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8610     assert((EdgeMask || NumIncoming == 1) &&
8611            "Multiple predecessors with one having a full mask");
8612     OperandsWithMask.push_back(Operands[In]);
8613     if (EdgeMask)
8614       OperandsWithMask.push_back(EdgeMask);
8615   }
8616   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8617 }
8618 
8619 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8620                                                    ArrayRef<VPValue *> Operands,
8621                                                    VFRange &Range) const {
8622 
8623   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8624       [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
8625       Range);
8626 
8627   if (IsPredicated)
8628     return nullptr;
8629 
8630   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8631   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8632              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8633              ID == Intrinsic::pseudoprobe ||
8634              ID == Intrinsic::experimental_noalias_scope_decl))
8635     return nullptr;
8636 
8637   auto willWiden = [&](ElementCount VF) -> bool {
8638     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8639     // The following case may be scalarized depending on the VF.
8640     // The flag shows whether we use Intrinsic or a usual Call for vectorized
8641     // version of the instruction.
8642     // Is it beneficial to perform intrinsic call compared to lib call?
8643     bool NeedToScalarize = false;
8644     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8645     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8646     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8647     return UseVectorIntrinsic || !NeedToScalarize;
8648   };
8649 
8650   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8651     return nullptr;
8652 
8653   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8654   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8655 }
8656 
8657 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8658   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8659          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8660   // Instruction should be widened, unless it is scalar after vectorization,
8661   // scalarization is profitable or it is predicated.
8662   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8663     return CM.isScalarAfterVectorization(I, VF) ||
8664            CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
8665   };
8666   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8667                                                              Range);
8668 }
8669 
8670 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8671                                            ArrayRef<VPValue *> Operands) const {
8672   auto IsVectorizableOpcode = [](unsigned Opcode) {
8673     switch (Opcode) {
8674     case Instruction::Add:
8675     case Instruction::And:
8676     case Instruction::AShr:
8677     case Instruction::BitCast:
8678     case Instruction::FAdd:
8679     case Instruction::FCmp:
8680     case Instruction::FDiv:
8681     case Instruction::FMul:
8682     case Instruction::FNeg:
8683     case Instruction::FPExt:
8684     case Instruction::FPToSI:
8685     case Instruction::FPToUI:
8686     case Instruction::FPTrunc:
8687     case Instruction::FRem:
8688     case Instruction::FSub:
8689     case Instruction::ICmp:
8690     case Instruction::IntToPtr:
8691     case Instruction::LShr:
8692     case Instruction::Mul:
8693     case Instruction::Or:
8694     case Instruction::PtrToInt:
8695     case Instruction::SDiv:
8696     case Instruction::Select:
8697     case Instruction::SExt:
8698     case Instruction::Shl:
8699     case Instruction::SIToFP:
8700     case Instruction::SRem:
8701     case Instruction::Sub:
8702     case Instruction::Trunc:
8703     case Instruction::UDiv:
8704     case Instruction::UIToFP:
8705     case Instruction::URem:
8706     case Instruction::Xor:
8707     case Instruction::ZExt:
8708       return true;
8709     }
8710     return false;
8711   };
8712 
8713   if (!IsVectorizableOpcode(I->getOpcode()))
8714     return nullptr;
8715 
8716   // Success: widen this instruction.
8717   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8718 }
8719 
8720 void VPRecipeBuilder::fixHeaderPhis() {
8721   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8722   for (VPWidenPHIRecipe *R : PhisToFix) {
8723     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8724     VPRecipeBase *IncR =
8725         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8726     R->addOperand(IncR->getVPSingleValue());
8727   }
8728 }
8729 
8730 VPBasicBlock *VPRecipeBuilder::handleReplication(
8731     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8732     VPlanPtr &Plan) {
8733   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8734       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8735       Range);
8736 
8737   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8738       [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
8739       Range);
8740 
8741   // Even if the instruction is not marked as uniform, there are certain
8742   // intrinsic calls that can be effectively treated as such, so we check for
8743   // them here. Conservatively, we only do this for scalable vectors, since
8744   // for fixed-width VFs we can always fall back on full scalarization.
8745   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8746     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8747     case Intrinsic::assume:
8748     case Intrinsic::lifetime_start:
8749     case Intrinsic::lifetime_end:
8750       // For scalable vectors if one of the operands is variant then we still
8751       // want to mark as uniform, which will generate one instruction for just
8752       // the first lane of the vector. We can't scalarize the call in the same
8753       // way as for fixed-width vectors because we don't know how many lanes
8754       // there are.
8755       //
8756       // The reasons for doing it this way for scalable vectors are:
8757       //   1. For the assume intrinsic generating the instruction for the first
8758       //      lane is still be better than not generating any at all. For
8759       //      example, the input may be a splat across all lanes.
8760       //   2. For the lifetime start/end intrinsics the pointer operand only
8761       //      does anything useful when the input comes from a stack object,
8762       //      which suggests it should always be uniform. For non-stack objects
8763       //      the effect is to poison the object, which still allows us to
8764       //      remove the call.
8765       IsUniform = true;
8766       break;
8767     default:
8768       break;
8769     }
8770   }
8771 
8772   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8773                                        IsUniform, IsPredicated);
8774   setRecipe(I, Recipe);
8775   Plan->addVPValue(I, Recipe);
8776 
8777   // Find if I uses a predicated instruction. If so, it will use its scalar
8778   // value. Avoid hoisting the insert-element which packs the scalar value into
8779   // a vector value, as that happens iff all users use the vector value.
8780   for (VPValue *Op : Recipe->operands()) {
8781     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8782     if (!PredR)
8783       continue;
8784     auto *RepR =
8785         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8786     assert(RepR->isPredicated() &&
8787            "expected Replicate recipe to be predicated");
8788     RepR->setAlsoPack(false);
8789   }
8790 
8791   // Finalize the recipe for Instr, first if it is not predicated.
8792   if (!IsPredicated) {
8793     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8794     VPBB->appendRecipe(Recipe);
8795     return VPBB;
8796   }
8797   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8798 
8799   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8800   assert(SingleSucc && "VPBB must have a single successor when handling "
8801                        "predicated replication.");
8802   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8803   // Record predicated instructions for above packing optimizations.
8804   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8805   VPBlockUtils::insertBlockAfter(Region, VPBB);
8806   auto *RegSucc = new VPBasicBlock();
8807   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8808   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8809   return RegSucc;
8810 }
8811 
8812 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8813                                                       VPRecipeBase *PredRecipe,
8814                                                       VPlanPtr &Plan) {
8815   // Instructions marked for predication are replicated and placed under an
8816   // if-then construct to prevent side-effects.
8817 
8818   // Generate recipes to compute the block mask for this region.
8819   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8820 
8821   // Build the triangular if-then region.
8822   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8823   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8824   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8825   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8826   auto *PHIRecipe = Instr->getType()->isVoidTy()
8827                         ? nullptr
8828                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8829   if (PHIRecipe) {
8830     Plan->removeVPValueFor(Instr);
8831     Plan->addVPValue(Instr, PHIRecipe);
8832   }
8833   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8834   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8835   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8836 
8837   // Note: first set Entry as region entry and then connect successors starting
8838   // from it in order, to propagate the "parent" of each VPBasicBlock.
8839   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8840   VPBlockUtils::connectBlocks(Pred, Exit);
8841 
8842   return Region;
8843 }
8844 
8845 VPRecipeOrVPValueTy
8846 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8847                                         ArrayRef<VPValue *> Operands,
8848                                         VFRange &Range, VPlanPtr &Plan) {
8849   // First, check for specific widening recipes that deal with calls, memory
8850   // operations, inductions and Phi nodes.
8851   if (auto *CI = dyn_cast<CallInst>(Instr))
8852     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8853 
8854   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8855     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8856 
8857   VPRecipeBase *Recipe;
8858   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8859     if (Phi->getParent() != OrigLoop->getHeader())
8860       return tryToBlend(Phi, Operands, Plan);
8861     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
8862       return toVPRecipeResult(Recipe);
8863 
8864     VPWidenPHIRecipe *PhiRecipe = nullptr;
8865     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8866       VPValue *StartV = Operands[0];
8867       if (Legal->isReductionVariable(Phi)) {
8868         const RecurrenceDescriptor &RdxDesc =
8869             Legal->getReductionVars().find(Phi)->second;
8870         assert(RdxDesc.getRecurrenceStartValue() ==
8871                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8872         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8873                                              CM.isInLoopReduction(Phi),
8874                                              CM.useOrderedReductions(RdxDesc));
8875       } else {
8876         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8877       }
8878 
8879       // Record the incoming value from the backedge, so we can add the incoming
8880       // value from the backedge after all recipes have been created.
8881       recordRecipeOf(cast<Instruction>(
8882           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8883       PhisToFix.push_back(PhiRecipe);
8884     } else {
8885       // TODO: record start and backedge value for remaining pointer induction
8886       // phis.
8887       assert(Phi->getType()->isPointerTy() &&
8888              "only pointer phis should be handled here");
8889       PhiRecipe = new VPWidenPHIRecipe(Phi);
8890     }
8891 
8892     return toVPRecipeResult(PhiRecipe);
8893   }
8894 
8895   if (isa<TruncInst>(Instr) &&
8896       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8897                                                Range, *Plan)))
8898     return toVPRecipeResult(Recipe);
8899 
8900   if (!shouldWiden(Instr, Range))
8901     return nullptr;
8902 
8903   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8904     return toVPRecipeResult(new VPWidenGEPRecipe(
8905         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8906 
8907   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8908     bool InvariantCond =
8909         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8910     return toVPRecipeResult(new VPWidenSelectRecipe(
8911         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8912   }
8913 
8914   return toVPRecipeResult(tryToWiden(Instr, Operands));
8915 }
8916 
8917 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8918                                                         ElementCount MaxVF) {
8919   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8920 
8921   // Collect instructions from the original loop that will become trivially dead
8922   // in the vectorized loop. We don't need to vectorize these instructions. For
8923   // example, original induction update instructions can become dead because we
8924   // separately emit induction "steps" when generating code for the new loop.
8925   // Similarly, we create a new latch condition when setting up the structure
8926   // of the new loop, so the old one can become dead.
8927   SmallPtrSet<Instruction *, 4> DeadInstructions;
8928   collectTriviallyDeadInstructions(DeadInstructions);
8929 
8930   // Add assume instructions we need to drop to DeadInstructions, to prevent
8931   // them from being added to the VPlan.
8932   // TODO: We only need to drop assumes in blocks that get flattend. If the
8933   // control flow is preserved, we should keep them.
8934   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8935   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8936 
8937   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8938   // Dead instructions do not need sinking. Remove them from SinkAfter.
8939   for (Instruction *I : DeadInstructions)
8940     SinkAfter.erase(I);
8941 
8942   // Cannot sink instructions after dead instructions (there won't be any
8943   // recipes for them). Instead, find the first non-dead previous instruction.
8944   for (auto &P : Legal->getSinkAfter()) {
8945     Instruction *SinkTarget = P.second;
8946     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8947     (void)FirstInst;
8948     while (DeadInstructions.contains(SinkTarget)) {
8949       assert(
8950           SinkTarget != FirstInst &&
8951           "Must find a live instruction (at least the one feeding the "
8952           "first-order recurrence PHI) before reaching beginning of the block");
8953       SinkTarget = SinkTarget->getPrevNode();
8954       assert(SinkTarget != P.first &&
8955              "sink source equals target, no sinking required");
8956     }
8957     P.second = SinkTarget;
8958   }
8959 
8960   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8961   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8962     VFRange SubRange = {VF, MaxVFPlusOne};
8963     VPlans.push_back(
8964         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8965     VF = SubRange.End;
8966   }
8967 }
8968 
8969 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8970     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8971     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8972 
8973   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8974 
8975   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8976 
8977   // ---------------------------------------------------------------------------
8978   // Pre-construction: record ingredients whose recipes we'll need to further
8979   // process after constructing the initial VPlan.
8980   // ---------------------------------------------------------------------------
8981 
8982   // Mark instructions we'll need to sink later and their targets as
8983   // ingredients whose recipe we'll need to record.
8984   for (auto &Entry : SinkAfter) {
8985     RecipeBuilder.recordRecipeOf(Entry.first);
8986     RecipeBuilder.recordRecipeOf(Entry.second);
8987   }
8988   for (auto &Reduction : CM.getInLoopReductionChains()) {
8989     PHINode *Phi = Reduction.first;
8990     RecurKind Kind =
8991         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8992     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8993 
8994     RecipeBuilder.recordRecipeOf(Phi);
8995     for (auto &R : ReductionOperations) {
8996       RecipeBuilder.recordRecipeOf(R);
8997       // For min/max reducitons, where we have a pair of icmp/select, we also
8998       // need to record the ICmp recipe, so it can be removed later.
8999       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9000              "Only min/max recurrences allowed for inloop reductions");
9001       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9002         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9003     }
9004   }
9005 
9006   // For each interleave group which is relevant for this (possibly trimmed)
9007   // Range, add it to the set of groups to be later applied to the VPlan and add
9008   // placeholders for its members' Recipes which we'll be replacing with a
9009   // single VPInterleaveRecipe.
9010   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9011     auto applyIG = [IG, this](ElementCount VF) -> bool {
9012       return (VF.isVector() && // Query is illegal for VF == 1
9013               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9014                   LoopVectorizationCostModel::CM_Interleave);
9015     };
9016     if (!getDecisionAndClampRange(applyIG, Range))
9017       continue;
9018     InterleaveGroups.insert(IG);
9019     for (unsigned i = 0; i < IG->getFactor(); i++)
9020       if (Instruction *Member = IG->getMember(i))
9021         RecipeBuilder.recordRecipeOf(Member);
9022   };
9023 
9024   // ---------------------------------------------------------------------------
9025   // Build initial VPlan: Scan the body of the loop in a topological order to
9026   // visit each basic block after having visited its predecessor basic blocks.
9027   // ---------------------------------------------------------------------------
9028 
9029   // Create initial VPlan skeleton, with separate header and latch blocks.
9030   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
9031   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
9032   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
9033   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
9034   auto Plan = std::make_unique<VPlan>(TopRegion);
9035 
9036   // Scan the body of the loop in a topological order to visit each basic block
9037   // after having visited its predecessor basic blocks.
9038   LoopBlocksDFS DFS(OrigLoop);
9039   DFS.perform(LI);
9040 
9041   VPBasicBlock *VPBB = HeaderVPBB;
9042   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9043   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9044     // Relevant instructions from basic block BB will be grouped into VPRecipe
9045     // ingredients and fill a new VPBasicBlock.
9046     unsigned VPBBsForBB = 0;
9047     VPBB->setName(BB->getName());
9048     Builder.setInsertPoint(VPBB);
9049 
9050     // Introduce each ingredient into VPlan.
9051     // TODO: Model and preserve debug instrinsics in VPlan.
9052     for (Instruction &I : BB->instructionsWithoutDebug()) {
9053       Instruction *Instr = &I;
9054 
9055       // First filter out irrelevant instructions, to ensure no recipes are
9056       // built for them.
9057       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9058         continue;
9059 
9060       SmallVector<VPValue *, 4> Operands;
9061       auto *Phi = dyn_cast<PHINode>(Instr);
9062       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9063         Operands.push_back(Plan->getOrAddVPValue(
9064             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9065       } else {
9066         auto OpRange = Plan->mapToVPValues(Instr->operands());
9067         Operands = {OpRange.begin(), OpRange.end()};
9068       }
9069       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9070               Instr, Operands, Range, Plan)) {
9071         // If Instr can be simplified to an existing VPValue, use it.
9072         if (RecipeOrValue.is<VPValue *>()) {
9073           auto *VPV = RecipeOrValue.get<VPValue *>();
9074           Plan->addVPValue(Instr, VPV);
9075           // If the re-used value is a recipe, register the recipe for the
9076           // instruction, in case the recipe for Instr needs to be recorded.
9077           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9078             RecipeBuilder.setRecipe(Instr, R);
9079           continue;
9080         }
9081         // Otherwise, add the new recipe.
9082         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9083         for (auto *Def : Recipe->definedValues()) {
9084           auto *UV = Def->getUnderlyingValue();
9085           Plan->addVPValue(UV, Def);
9086         }
9087 
9088         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
9089             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
9090           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
9091           // of the header block. That can happen for truncates of induction
9092           // variables. Those recipes are moved to the phi section of the header
9093           // block after applying SinkAfter, which relies on the original
9094           // position of the trunc.
9095           assert(isa<TruncInst>(Instr));
9096           InductionsToMove.push_back(
9097               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
9098         }
9099         RecipeBuilder.setRecipe(Instr, Recipe);
9100         VPBB->appendRecipe(Recipe);
9101         continue;
9102       }
9103 
9104       // Otherwise, if all widening options failed, Instruction is to be
9105       // replicated. This may create a successor for VPBB.
9106       VPBasicBlock *NextVPBB =
9107           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9108       if (NextVPBB != VPBB) {
9109         VPBB = NextVPBB;
9110         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9111                                     : "");
9112       }
9113     }
9114 
9115     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
9116     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9117   }
9118 
9119   // Fold the last, empty block into its predecessor.
9120   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
9121   assert(VPBB && "expected to fold last (empty) block");
9122   // After here, VPBB should not be used.
9123   VPBB = nullptr;
9124 
9125   assert(isa<VPRegionBlock>(Plan->getEntry()) &&
9126          !Plan->getEntry()->getEntryBasicBlock()->empty() &&
9127          "entry block must be set to a VPRegionBlock having a non-empty entry "
9128          "VPBasicBlock");
9129   RecipeBuilder.fixHeaderPhis();
9130 
9131   // ---------------------------------------------------------------------------
9132   // Transform initial VPlan: Apply previously taken decisions, in order, to
9133   // bring the VPlan to its final state.
9134   // ---------------------------------------------------------------------------
9135 
9136   // Apply Sink-After legal constraints.
9137   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9138     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9139     if (Region && Region->isReplicator()) {
9140       assert(Region->getNumSuccessors() == 1 &&
9141              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9142       assert(R->getParent()->size() == 1 &&
9143              "A recipe in an original replicator region must be the only "
9144              "recipe in its block");
9145       return Region;
9146     }
9147     return nullptr;
9148   };
9149   for (auto &Entry : SinkAfter) {
9150     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9151     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9152 
9153     auto *TargetRegion = GetReplicateRegion(Target);
9154     auto *SinkRegion = GetReplicateRegion(Sink);
9155     if (!SinkRegion) {
9156       // If the sink source is not a replicate region, sink the recipe directly.
9157       if (TargetRegion) {
9158         // The target is in a replication region, make sure to move Sink to
9159         // the block after it, not into the replication region itself.
9160         VPBasicBlock *NextBlock =
9161             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9162         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9163       } else
9164         Sink->moveAfter(Target);
9165       continue;
9166     }
9167 
9168     // The sink source is in a replicate region. Unhook the region from the CFG.
9169     auto *SinkPred = SinkRegion->getSinglePredecessor();
9170     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9171     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9172     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9173     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9174 
9175     if (TargetRegion) {
9176       // The target recipe is also in a replicate region, move the sink region
9177       // after the target region.
9178       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9179       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9180       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9181       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9182     } else {
9183       // The sink source is in a replicate region, we need to move the whole
9184       // replicate region, which should only contain a single recipe in the
9185       // main block.
9186       auto *SplitBlock =
9187           Target->getParent()->splitAt(std::next(Target->getIterator()));
9188 
9189       auto *SplitPred = SplitBlock->getSinglePredecessor();
9190 
9191       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9192       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9193       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9194     }
9195   }
9196 
9197   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9198 
9199   // Now that sink-after is done, move induction recipes for optimized truncates
9200   // to the phi section of the header block.
9201   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9202     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9203 
9204   // Adjust the recipes for any inloop reductions.
9205   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
9206                              RecipeBuilder, Range.Start);
9207 
9208   // Introduce a recipe to combine the incoming and previous values of a
9209   // first-order recurrence.
9210   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9211     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9212     if (!RecurPhi)
9213       continue;
9214 
9215     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9216     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9217     auto *Region = GetReplicateRegion(PrevRecipe);
9218     if (Region)
9219       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9220     if (Region || PrevRecipe->isPhi())
9221       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9222     else
9223       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9224 
9225     auto *RecurSplice = cast<VPInstruction>(
9226         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9227                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9228 
9229     RecurPhi->replaceAllUsesWith(RecurSplice);
9230     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9231     // all users.
9232     RecurSplice->setOperand(0, RecurPhi);
9233   }
9234 
9235   // Interleave memory: for each Interleave Group we marked earlier as relevant
9236   // for this VPlan, replace the Recipes widening its memory instructions with a
9237   // single VPInterleaveRecipe at its insertion point.
9238   for (auto IG : InterleaveGroups) {
9239     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9240         RecipeBuilder.getRecipe(IG->getInsertPos()));
9241     SmallVector<VPValue *, 4> StoredValues;
9242     for (unsigned i = 0; i < IG->getFactor(); ++i)
9243       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9244         auto *StoreR =
9245             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9246         StoredValues.push_back(StoreR->getStoredValue());
9247       }
9248 
9249     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9250                                         Recipe->getMask());
9251     VPIG->insertBefore(Recipe);
9252     unsigned J = 0;
9253     for (unsigned i = 0; i < IG->getFactor(); ++i)
9254       if (Instruction *Member = IG->getMember(i)) {
9255         if (!Member->getType()->isVoidTy()) {
9256           VPValue *OriginalV = Plan->getVPValue(Member);
9257           Plan->removeVPValueFor(Member);
9258           Plan->addVPValue(Member, VPIG->getVPValue(J));
9259           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9260           J++;
9261         }
9262         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9263       }
9264   }
9265 
9266   // From this point onwards, VPlan-to-VPlan transformations may change the plan
9267   // in ways that accessing values using original IR values is incorrect.
9268   Plan->disableValue2VPValue();
9269 
9270   VPlanTransforms::sinkScalarOperands(*Plan);
9271   VPlanTransforms::mergeReplicateRegions(*Plan);
9272 
9273   std::string PlanName;
9274   raw_string_ostream RSO(PlanName);
9275   ElementCount VF = Range.Start;
9276   Plan->addVF(VF);
9277   RSO << "Initial VPlan for VF={" << VF;
9278   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9279     Plan->addVF(VF);
9280     RSO << "," << VF;
9281   }
9282   RSO << "},UF>=1";
9283   RSO.flush();
9284   Plan->setName(PlanName);
9285 
9286   // Fold Exit block into its predecessor if possible.
9287   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9288   // VPBasicBlock as exit.
9289   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9290 
9291   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9292   return Plan;
9293 }
9294 
9295 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9296   // Outer loop handling: They may require CFG and instruction level
9297   // transformations before even evaluating whether vectorization is profitable.
9298   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9299   // the vectorization pipeline.
9300   assert(!OrigLoop->isInnermost());
9301   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9302 
9303   // Create new empty VPlan
9304   auto Plan = std::make_unique<VPlan>();
9305 
9306   // Build hierarchical CFG
9307   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9308   HCFGBuilder.buildHierarchicalCFG();
9309 
9310   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9311        VF *= 2)
9312     Plan->addVF(VF);
9313 
9314   if (EnableVPlanPredication) {
9315     VPlanPredicator VPP(*Plan);
9316     VPP.predicate();
9317 
9318     // Avoid running transformation to recipes until masked code generation in
9319     // VPlan-native path is in place.
9320     return Plan;
9321   }
9322 
9323   SmallPtrSet<Instruction *, 1> DeadInstructions;
9324   VPlanTransforms::VPInstructionsToVPRecipes(
9325       OrigLoop, Plan,
9326       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9327       DeadInstructions, *PSE.getSE());
9328   return Plan;
9329 }
9330 
9331 // Adjust the recipes for reductions. For in-loop reductions the chain of
9332 // instructions leading from the loop exit instr to the phi need to be converted
9333 // to reductions, with one operand being vector and the other being the scalar
9334 // reduction chain. For other reductions, a select is introduced between the phi
9335 // and live-out recipes when folding the tail.
9336 void LoopVectorizationPlanner::adjustRecipesForReductions(
9337     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9338     ElementCount MinVF) {
9339   for (auto &Reduction : CM.getInLoopReductionChains()) {
9340     PHINode *Phi = Reduction.first;
9341     const RecurrenceDescriptor &RdxDesc =
9342         Legal->getReductionVars().find(Phi)->second;
9343     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9344 
9345     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9346       continue;
9347 
9348     // ReductionOperations are orders top-down from the phi's use to the
9349     // LoopExitValue. We keep a track of the previous item (the Chain) to tell
9350     // which of the two operands will remain scalar and which will be reduced.
9351     // For minmax the chain will be the select instructions.
9352     Instruction *Chain = Phi;
9353     for (Instruction *R : ReductionOperations) {
9354       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9355       RecurKind Kind = RdxDesc.getRecurrenceKind();
9356 
9357       VPValue *ChainOp = Plan->getVPValue(Chain);
9358       unsigned FirstOpId;
9359       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9360              "Only min/max recurrences allowed for inloop reductions");
9361       // Recognize a call to the llvm.fmuladd intrinsic.
9362       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9363       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9364              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9365       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9366         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9367                "Expected to replace a VPWidenSelectSC");
9368         FirstOpId = 1;
9369       } else {
9370         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9371                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9372                "Expected to replace a VPWidenSC");
9373         FirstOpId = 0;
9374       }
9375       unsigned VecOpId =
9376           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9377       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9378 
9379       auto *CondOp = CM.foldTailByMasking()
9380                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9381                          : nullptr;
9382 
9383       if (IsFMulAdd) {
9384         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9385         // need to create an fmul recipe to use as the vector operand for the
9386         // fadd reduction.
9387         VPInstruction *FMulRecipe = new VPInstruction(
9388             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9389         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9390         WidenRecipe->getParent()->insert(FMulRecipe,
9391                                          WidenRecipe->getIterator());
9392         VecOp = FMulRecipe;
9393       }
9394       VPReductionRecipe *RedRecipe =
9395           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9396       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9397       Plan->removeVPValueFor(R);
9398       Plan->addVPValue(R, RedRecipe);
9399       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9400       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9401       WidenRecipe->eraseFromParent();
9402 
9403       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9404         VPRecipeBase *CompareRecipe =
9405             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9406         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9407                "Expected to replace a VPWidenSC");
9408         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9409                "Expected no remaining users");
9410         CompareRecipe->eraseFromParent();
9411       }
9412       Chain = R;
9413     }
9414   }
9415 
9416   // If tail is folded by masking, introduce selects between the phi
9417   // and the live-out instruction of each reduction, at the end of the latch.
9418   if (CM.foldTailByMasking()) {
9419     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9420       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9421       if (!PhiR || PhiR->isInLoop())
9422         continue;
9423       Builder.setInsertPoint(LatchVPBB);
9424       VPValue *Cond =
9425           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9426       VPValue *Red = PhiR->getBackedgeValue();
9427       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9428     }
9429   }
9430 }
9431 
9432 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9433 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9434                                VPSlotTracker &SlotTracker) const {
9435   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9436   IG->getInsertPos()->printAsOperand(O, false);
9437   O << ", ";
9438   getAddr()->printAsOperand(O, SlotTracker);
9439   VPValue *Mask = getMask();
9440   if (Mask) {
9441     O << ", ";
9442     Mask->printAsOperand(O, SlotTracker);
9443   }
9444 
9445   unsigned OpIdx = 0;
9446   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9447     if (!IG->getMember(i))
9448       continue;
9449     if (getNumStoreOperands() > 0) {
9450       O << "\n" << Indent << "  store ";
9451       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9452       O << " to index " << i;
9453     } else {
9454       O << "\n" << Indent << "  ";
9455       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9456       O << " = load from index " << i;
9457     }
9458     ++OpIdx;
9459   }
9460 }
9461 #endif
9462 
9463 void VPWidenCallRecipe::execute(VPTransformState &State) {
9464   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9465                                   *this, State);
9466 }
9467 
9468 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9469   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9470   State.ILV->setDebugLocFromInst(&I);
9471 
9472   // The condition can be loop invariant  but still defined inside the
9473   // loop. This means that we can't just use the original 'cond' value.
9474   // We have to take the 'vectorized' value and pick the first lane.
9475   // Instcombine will make this a no-op.
9476   auto *InvarCond =
9477       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9478 
9479   for (unsigned Part = 0; Part < State.UF; ++Part) {
9480     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9481     Value *Op0 = State.get(getOperand(1), Part);
9482     Value *Op1 = State.get(getOperand(2), Part);
9483     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9484     State.set(this, Sel, Part);
9485     State.ILV->addMetadata(Sel, &I);
9486   }
9487 }
9488 
9489 void VPWidenRecipe::execute(VPTransformState &State) {
9490   auto &I = *cast<Instruction>(getUnderlyingValue());
9491   auto &Builder = State.Builder;
9492   switch (I.getOpcode()) {
9493   case Instruction::Call:
9494   case Instruction::Br:
9495   case Instruction::PHI:
9496   case Instruction::GetElementPtr:
9497   case Instruction::Select:
9498     llvm_unreachable("This instruction is handled by a different recipe.");
9499   case Instruction::UDiv:
9500   case Instruction::SDiv:
9501   case Instruction::SRem:
9502   case Instruction::URem:
9503   case Instruction::Add:
9504   case Instruction::FAdd:
9505   case Instruction::Sub:
9506   case Instruction::FSub:
9507   case Instruction::FNeg:
9508   case Instruction::Mul:
9509   case Instruction::FMul:
9510   case Instruction::FDiv:
9511   case Instruction::FRem:
9512   case Instruction::Shl:
9513   case Instruction::LShr:
9514   case Instruction::AShr:
9515   case Instruction::And:
9516   case Instruction::Or:
9517   case Instruction::Xor: {
9518     // Just widen unops and binops.
9519     State.ILV->setDebugLocFromInst(&I);
9520 
9521     for (unsigned Part = 0; Part < State.UF; ++Part) {
9522       SmallVector<Value *, 2> Ops;
9523       for (VPValue *VPOp : operands())
9524         Ops.push_back(State.get(VPOp, Part));
9525 
9526       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9527 
9528       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9529         VecOp->copyIRFlags(&I);
9530 
9531         // If the instruction is vectorized and was in a basic block that needed
9532         // predication, we can't propagate poison-generating flags (nuw/nsw,
9533         // exact, etc.). The control flow has been linearized and the
9534         // instruction is no longer guarded by the predicate, which could make
9535         // the flag properties to no longer hold.
9536         if (State.MayGeneratePoisonRecipes.contains(this))
9537           VecOp->dropPoisonGeneratingFlags();
9538       }
9539 
9540       // Use this vector value for all users of the original instruction.
9541       State.set(this, V, Part);
9542       State.ILV->addMetadata(V, &I);
9543     }
9544 
9545     break;
9546   }
9547   case Instruction::ICmp:
9548   case Instruction::FCmp: {
9549     // Widen compares. Generate vector compares.
9550     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9551     auto *Cmp = cast<CmpInst>(&I);
9552     State.ILV->setDebugLocFromInst(Cmp);
9553     for (unsigned Part = 0; Part < State.UF; ++Part) {
9554       Value *A = State.get(getOperand(0), Part);
9555       Value *B = State.get(getOperand(1), Part);
9556       Value *C = nullptr;
9557       if (FCmp) {
9558         // Propagate fast math flags.
9559         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9560         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9561         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9562       } else {
9563         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9564       }
9565       State.set(this, C, Part);
9566       State.ILV->addMetadata(C, &I);
9567     }
9568 
9569     break;
9570   }
9571 
9572   case Instruction::ZExt:
9573   case Instruction::SExt:
9574   case Instruction::FPToUI:
9575   case Instruction::FPToSI:
9576   case Instruction::FPExt:
9577   case Instruction::PtrToInt:
9578   case Instruction::IntToPtr:
9579   case Instruction::SIToFP:
9580   case Instruction::UIToFP:
9581   case Instruction::Trunc:
9582   case Instruction::FPTrunc:
9583   case Instruction::BitCast: {
9584     auto *CI = cast<CastInst>(&I);
9585     State.ILV->setDebugLocFromInst(CI);
9586 
9587     /// Vectorize casts.
9588     Type *DestTy = (State.VF.isScalar())
9589                        ? CI->getType()
9590                        : VectorType::get(CI->getType(), State.VF);
9591 
9592     for (unsigned Part = 0; Part < State.UF; ++Part) {
9593       Value *A = State.get(getOperand(0), Part);
9594       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9595       State.set(this, Cast, Part);
9596       State.ILV->addMetadata(Cast, &I);
9597     }
9598     break;
9599   }
9600   default:
9601     // This instruction is not vectorized by simple widening.
9602     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9603     llvm_unreachable("Unhandled instruction!");
9604   } // end of switch.
9605 }
9606 
9607 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9608   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9609   // Construct a vector GEP by widening the operands of the scalar GEP as
9610   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9611   // results in a vector of pointers when at least one operand of the GEP
9612   // is vector-typed. Thus, to keep the representation compact, we only use
9613   // vector-typed operands for loop-varying values.
9614 
9615   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9616     // If we are vectorizing, but the GEP has only loop-invariant operands,
9617     // the GEP we build (by only using vector-typed operands for
9618     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9619     // produce a vector of pointers, we need to either arbitrarily pick an
9620     // operand to broadcast, or broadcast a clone of the original GEP.
9621     // Here, we broadcast a clone of the original.
9622     //
9623     // TODO: If at some point we decide to scalarize instructions having
9624     //       loop-invariant operands, this special case will no longer be
9625     //       required. We would add the scalarization decision to
9626     //       collectLoopScalars() and teach getVectorValue() to broadcast
9627     //       the lane-zero scalar value.
9628     auto *Clone = State.Builder.Insert(GEP->clone());
9629     for (unsigned Part = 0; Part < State.UF; ++Part) {
9630       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9631       State.set(this, EntryPart, Part);
9632       State.ILV->addMetadata(EntryPart, GEP);
9633     }
9634   } else {
9635     // If the GEP has at least one loop-varying operand, we are sure to
9636     // produce a vector of pointers. But if we are only unrolling, we want
9637     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9638     // produce with the code below will be scalar (if VF == 1) or vector
9639     // (otherwise). Note that for the unroll-only case, we still maintain
9640     // values in the vector mapping with initVector, as we do for other
9641     // instructions.
9642     for (unsigned Part = 0; Part < State.UF; ++Part) {
9643       // The pointer operand of the new GEP. If it's loop-invariant, we
9644       // won't broadcast it.
9645       auto *Ptr = IsPtrLoopInvariant
9646                       ? State.get(getOperand(0), VPIteration(0, 0))
9647                       : State.get(getOperand(0), Part);
9648 
9649       // Collect all the indices for the new GEP. If any index is
9650       // loop-invariant, we won't broadcast it.
9651       SmallVector<Value *, 4> Indices;
9652       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9653         VPValue *Operand = getOperand(I);
9654         if (IsIndexLoopInvariant[I - 1])
9655           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9656         else
9657           Indices.push_back(State.get(Operand, Part));
9658       }
9659 
9660       // If the GEP instruction is vectorized and was in a basic block that
9661       // needed predication, we can't propagate the poison-generating 'inbounds'
9662       // flag. The control flow has been linearized and the GEP is no longer
9663       // guarded by the predicate, which could make the 'inbounds' properties to
9664       // no longer hold.
9665       bool IsInBounds =
9666           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9667 
9668       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9669       // but it should be a vector, otherwise.
9670       auto *NewGEP = IsInBounds
9671                          ? State.Builder.CreateInBoundsGEP(
9672                                GEP->getSourceElementType(), Ptr, Indices)
9673                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9674                                                    Ptr, Indices);
9675       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9676              "NewGEP is not a pointer vector");
9677       State.set(this, NewGEP, Part);
9678       State.ILV->addMetadata(NewGEP, GEP);
9679     }
9680   }
9681 }
9682 
9683 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9684   assert(!State.Instance && "Int or FP induction being replicated.");
9685   State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
9686                                    getStartValue()->getLiveInIRValue(),
9687                                    getTruncInst(), getVPValue(0), State);
9688 }
9689 
9690 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9691   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9692                                  State);
9693 }
9694 
9695 void VPBlendRecipe::execute(VPTransformState &State) {
9696   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9697   // We know that all PHIs in non-header blocks are converted into
9698   // selects, so we don't have to worry about the insertion order and we
9699   // can just use the builder.
9700   // At this point we generate the predication tree. There may be
9701   // duplications since this is a simple recursive scan, but future
9702   // optimizations will clean it up.
9703 
9704   unsigned NumIncoming = getNumIncomingValues();
9705 
9706   // Generate a sequence of selects of the form:
9707   // SELECT(Mask3, In3,
9708   //        SELECT(Mask2, In2,
9709   //               SELECT(Mask1, In1,
9710   //                      In0)))
9711   // Note that Mask0 is never used: lanes for which no path reaches this phi and
9712   // are essentially undef are taken from In0.
9713   InnerLoopVectorizer::VectorParts Entry(State.UF);
9714   for (unsigned In = 0; In < NumIncoming; ++In) {
9715     for (unsigned Part = 0; Part < State.UF; ++Part) {
9716       // We might have single edge PHIs (blocks) - use an identity
9717       // 'select' for the first PHI operand.
9718       Value *In0 = State.get(getIncomingValue(In), Part);
9719       if (In == 0)
9720         Entry[Part] = In0; // Initialize with the first incoming value.
9721       else {
9722         // Select between the current value and the previous incoming edge
9723         // based on the incoming mask.
9724         Value *Cond = State.get(getMask(In), Part);
9725         Entry[Part] =
9726             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9727       }
9728     }
9729   }
9730   for (unsigned Part = 0; Part < State.UF; ++Part)
9731     State.set(this, Entry[Part], Part);
9732 }
9733 
9734 void VPInterleaveRecipe::execute(VPTransformState &State) {
9735   assert(!State.Instance && "Interleave group being replicated.");
9736   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9737                                       getStoredValues(), getMask());
9738 }
9739 
9740 void VPReductionRecipe::execute(VPTransformState &State) {
9741   assert(!State.Instance && "Reduction being replicated.");
9742   Value *PrevInChain = State.get(getChainOp(), 0);
9743   RecurKind Kind = RdxDesc->getRecurrenceKind();
9744   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9745   // Propagate the fast-math flags carried by the underlying instruction.
9746   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9747   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9748   for (unsigned Part = 0; Part < State.UF; ++Part) {
9749     Value *NewVecOp = State.get(getVecOp(), Part);
9750     if (VPValue *Cond = getCondOp()) {
9751       Value *NewCond = State.get(Cond, Part);
9752       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9753       Value *Iden = RdxDesc->getRecurrenceIdentity(
9754           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9755       Value *IdenVec =
9756           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9757       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9758       NewVecOp = Select;
9759     }
9760     Value *NewRed;
9761     Value *NextInChain;
9762     if (IsOrdered) {
9763       if (State.VF.isVector())
9764         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9765                                         PrevInChain);
9766       else
9767         NewRed = State.Builder.CreateBinOp(
9768             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9769             NewVecOp);
9770       PrevInChain = NewRed;
9771     } else {
9772       PrevInChain = State.get(getChainOp(), Part);
9773       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9774     }
9775     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9776       NextInChain =
9777           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9778                          NewRed, PrevInChain);
9779     } else if (IsOrdered)
9780       NextInChain = NewRed;
9781     else
9782       NextInChain = State.Builder.CreateBinOp(
9783           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9784           PrevInChain);
9785     State.set(this, NextInChain, Part);
9786   }
9787 }
9788 
9789 void VPReplicateRecipe::execute(VPTransformState &State) {
9790   if (State.Instance) { // Generate a single instance.
9791     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9792     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9793                                     IsPredicated, State);
9794     // Insert scalar instance packing it into a vector.
9795     if (AlsoPack && State.VF.isVector()) {
9796       // If we're constructing lane 0, initialize to start from poison.
9797       if (State.Instance->Lane.isFirstLane()) {
9798         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9799         Value *Poison = PoisonValue::get(
9800             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9801         State.set(this, Poison, State.Instance->Part);
9802       }
9803       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9804     }
9805     return;
9806   }
9807 
9808   // Generate scalar instances for all VF lanes of all UF parts, unless the
9809   // instruction is uniform inwhich case generate only the first lane for each
9810   // of the UF parts.
9811   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9812   assert((!State.VF.isScalable() || IsUniform) &&
9813          "Can't scalarize a scalable vector");
9814   for (unsigned Part = 0; Part < State.UF; ++Part)
9815     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9816       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9817                                       VPIteration(Part, Lane), IsPredicated,
9818                                       State);
9819 }
9820 
9821 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9822   assert(State.Instance && "Branch on Mask works only on single instance.");
9823 
9824   unsigned Part = State.Instance->Part;
9825   unsigned Lane = State.Instance->Lane.getKnownLane();
9826 
9827   Value *ConditionBit = nullptr;
9828   VPValue *BlockInMask = getMask();
9829   if (BlockInMask) {
9830     ConditionBit = State.get(BlockInMask, Part);
9831     if (ConditionBit->getType()->isVectorTy())
9832       ConditionBit = State.Builder.CreateExtractElement(
9833           ConditionBit, State.Builder.getInt32(Lane));
9834   } else // Block in mask is all-one.
9835     ConditionBit = State.Builder.getTrue();
9836 
9837   // Replace the temporary unreachable terminator with a new conditional branch,
9838   // whose two destinations will be set later when they are created.
9839   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9840   assert(isa<UnreachableInst>(CurrentTerminator) &&
9841          "Expected to replace unreachable terminator with conditional branch.");
9842   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9843   CondBr->setSuccessor(0, nullptr);
9844   ReplaceInstWithInst(CurrentTerminator, CondBr);
9845 }
9846 
9847 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9848   assert(State.Instance && "Predicated instruction PHI works per instance.");
9849   Instruction *ScalarPredInst =
9850       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9851   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9852   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9853   assert(PredicatingBB && "Predicated block has no single predecessor.");
9854   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9855          "operand must be VPReplicateRecipe");
9856 
9857   // By current pack/unpack logic we need to generate only a single phi node: if
9858   // a vector value for the predicated instruction exists at this point it means
9859   // the instruction has vector users only, and a phi for the vector value is
9860   // needed. In this case the recipe of the predicated instruction is marked to
9861   // also do that packing, thereby "hoisting" the insert-element sequence.
9862   // Otherwise, a phi node for the scalar value is needed.
9863   unsigned Part = State.Instance->Part;
9864   if (State.hasVectorValue(getOperand(0), Part)) {
9865     Value *VectorValue = State.get(getOperand(0), Part);
9866     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9867     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9868     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9869     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9870     if (State.hasVectorValue(this, Part))
9871       State.reset(this, VPhi, Part);
9872     else
9873       State.set(this, VPhi, Part);
9874     // NOTE: Currently we need to update the value of the operand, so the next
9875     // predicated iteration inserts its generated value in the correct vector.
9876     State.reset(getOperand(0), VPhi, Part);
9877   } else {
9878     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9879     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9880     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9881                      PredicatingBB);
9882     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9883     if (State.hasScalarValue(this, *State.Instance))
9884       State.reset(this, Phi, *State.Instance);
9885     else
9886       State.set(this, Phi, *State.Instance);
9887     // NOTE: Currently we need to update the value of the operand, so the next
9888     // predicated iteration inserts its generated value in the correct vector.
9889     State.reset(getOperand(0), Phi, *State.Instance);
9890   }
9891 }
9892 
9893 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9894   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9895 
9896   // Attempt to issue a wide load.
9897   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9898   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9899 
9900   assert((LI || SI) && "Invalid Load/Store instruction");
9901   assert((!SI || StoredValue) && "No stored value provided for widened store");
9902   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9903 
9904   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9905 
9906   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9907   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9908   bool CreateGatherScatter = !Consecutive;
9909 
9910   auto &Builder = State.Builder;
9911   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9912   bool isMaskRequired = getMask();
9913   if (isMaskRequired)
9914     for (unsigned Part = 0; Part < State.UF; ++Part)
9915       BlockInMaskParts[Part] = State.get(getMask(), Part);
9916 
9917   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9918     // Calculate the pointer for the specific unroll-part.
9919     GetElementPtrInst *PartPtr = nullptr;
9920 
9921     bool InBounds = false;
9922     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9923       InBounds = gep->isInBounds();
9924     if (Reverse) {
9925       // If the address is consecutive but reversed, then the
9926       // wide store needs to start at the last vector element.
9927       // RunTimeVF =  VScale * VF.getKnownMinValue()
9928       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
9929       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9930       // NumElt = -Part * RunTimeVF
9931       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9932       // LastLane = 1 - RunTimeVF
9933       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9934       PartPtr =
9935           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9936       PartPtr->setIsInBounds(InBounds);
9937       PartPtr = cast<GetElementPtrInst>(
9938           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9939       PartPtr->setIsInBounds(InBounds);
9940       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9941         BlockInMaskParts[Part] =
9942             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9943     } else {
9944       Value *Increment =
9945           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9946       PartPtr = cast<GetElementPtrInst>(
9947           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9948       PartPtr->setIsInBounds(InBounds);
9949     }
9950 
9951     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9952     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9953   };
9954 
9955   // Handle Stores:
9956   if (SI) {
9957     State.ILV->setDebugLocFromInst(SI);
9958 
9959     for (unsigned Part = 0; Part < State.UF; ++Part) {
9960       Instruction *NewSI = nullptr;
9961       Value *StoredVal = State.get(StoredValue, Part);
9962       if (CreateGatherScatter) {
9963         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9964         Value *VectorGep = State.get(getAddr(), Part);
9965         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9966                                             MaskPart);
9967       } else {
9968         if (Reverse) {
9969           // If we store to reverse consecutive memory locations, then we need
9970           // to reverse the order of elements in the stored value.
9971           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9972           // We don't want to update the value in the map as it might be used in
9973           // another expression. So don't call resetVectorValue(StoredVal).
9974         }
9975         auto *VecPtr =
9976             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9977         if (isMaskRequired)
9978           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9979                                             BlockInMaskParts[Part]);
9980         else
9981           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9982       }
9983       State.ILV->addMetadata(NewSI, SI);
9984     }
9985     return;
9986   }
9987 
9988   // Handle loads.
9989   assert(LI && "Must have a load instruction");
9990   State.ILV->setDebugLocFromInst(LI);
9991   for (unsigned Part = 0; Part < State.UF; ++Part) {
9992     Value *NewLI;
9993     if (CreateGatherScatter) {
9994       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9995       Value *VectorGep = State.get(getAddr(), Part);
9996       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9997                                          nullptr, "wide.masked.gather");
9998       State.ILV->addMetadata(NewLI, LI);
9999     } else {
10000       auto *VecPtr =
10001           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10002       if (isMaskRequired)
10003         NewLI = Builder.CreateMaskedLoad(
10004             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
10005             PoisonValue::get(DataTy), "wide.masked.load");
10006       else
10007         NewLI =
10008             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10009 
10010       // Add metadata to the load, but setVectorValue to the reverse shuffle.
10011       State.ILV->addMetadata(NewLI, LI);
10012       if (Reverse)
10013         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10014     }
10015 
10016     State.set(getVPSingleValue(), NewLI, Part);
10017   }
10018 }
10019 
10020 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10021 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10022 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10023 // for predication.
10024 static ScalarEpilogueLowering getScalarEpilogueLowering(
10025     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10026     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10027     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10028     LoopVectorizationLegality &LVL) {
10029   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10030   // don't look at hints or options, and don't request a scalar epilogue.
10031   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10032   // LoopAccessInfo (due to code dependency and not being able to reliably get
10033   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10034   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10035   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10036   // back to the old way and vectorize with versioning when forced. See D81345.)
10037   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10038                                                       PGSOQueryType::IRPass) &&
10039                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10040     return CM_ScalarEpilogueNotAllowedOptSize;
10041 
10042   // 2) If set, obey the directives
10043   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10044     switch (PreferPredicateOverEpilogue) {
10045     case PreferPredicateTy::ScalarEpilogue:
10046       return CM_ScalarEpilogueAllowed;
10047     case PreferPredicateTy::PredicateElseScalarEpilogue:
10048       return CM_ScalarEpilogueNotNeededUsePredicate;
10049     case PreferPredicateTy::PredicateOrDontVectorize:
10050       return CM_ScalarEpilogueNotAllowedUsePredicate;
10051     };
10052   }
10053 
10054   // 3) If set, obey the hints
10055   switch (Hints.getPredicate()) {
10056   case LoopVectorizeHints::FK_Enabled:
10057     return CM_ScalarEpilogueNotNeededUsePredicate;
10058   case LoopVectorizeHints::FK_Disabled:
10059     return CM_ScalarEpilogueAllowed;
10060   };
10061 
10062   // 4) if the TTI hook indicates this is profitable, request predication.
10063   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10064                                        LVL.getLAI()))
10065     return CM_ScalarEpilogueNotNeededUsePredicate;
10066 
10067   return CM_ScalarEpilogueAllowed;
10068 }
10069 
10070 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10071   // If Values have been set for this Def return the one relevant for \p Part.
10072   if (hasVectorValue(Def, Part))
10073     return Data.PerPartOutput[Def][Part];
10074 
10075   if (!hasScalarValue(Def, {Part, 0})) {
10076     Value *IRV = Def->getLiveInIRValue();
10077     Value *B = ILV->getBroadcastInstrs(IRV);
10078     set(Def, B, Part);
10079     return B;
10080   }
10081 
10082   Value *ScalarValue = get(Def, {Part, 0});
10083   // If we aren't vectorizing, we can just copy the scalar map values over
10084   // to the vector map.
10085   if (VF.isScalar()) {
10086     set(Def, ScalarValue, Part);
10087     return ScalarValue;
10088   }
10089 
10090   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10091   bool IsUniform = RepR && RepR->isUniform();
10092 
10093   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10094   // Check if there is a scalar value for the selected lane.
10095   if (!hasScalarValue(Def, {Part, LastLane})) {
10096     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10097     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
10098            "unexpected recipe found to be invariant");
10099     IsUniform = true;
10100     LastLane = 0;
10101   }
10102 
10103   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10104   // Set the insert point after the last scalarized instruction or after the
10105   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10106   // will directly follow the scalar definitions.
10107   auto OldIP = Builder.saveIP();
10108   auto NewIP =
10109       isa<PHINode>(LastInst)
10110           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10111           : std::next(BasicBlock::iterator(LastInst));
10112   Builder.SetInsertPoint(&*NewIP);
10113 
10114   // However, if we are vectorizing, we need to construct the vector values.
10115   // If the value is known to be uniform after vectorization, we can just
10116   // broadcast the scalar value corresponding to lane zero for each unroll
10117   // iteration. Otherwise, we construct the vector values using
10118   // insertelement instructions. Since the resulting vectors are stored in
10119   // State, we will only generate the insertelements once.
10120   Value *VectorValue = nullptr;
10121   if (IsUniform) {
10122     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10123     set(Def, VectorValue, Part);
10124   } else {
10125     // Initialize packing with insertelements to start from undef.
10126     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10127     Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10128     set(Def, Undef, Part);
10129     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10130       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10131     VectorValue = get(Def, Part);
10132   }
10133   Builder.restoreIP(OldIP);
10134   return VectorValue;
10135 }
10136 
10137 // Process the loop in the VPlan-native vectorization path. This path builds
10138 // VPlan upfront in the vectorization pipeline, which allows to apply
10139 // VPlan-to-VPlan transformations from the very beginning without modifying the
10140 // input LLVM IR.
10141 static bool processLoopInVPlanNativePath(
10142     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10143     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10144     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10145     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10146     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10147     LoopVectorizationRequirements &Requirements) {
10148 
10149   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10150     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10151     return false;
10152   }
10153   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10154   Function *F = L->getHeader()->getParent();
10155   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10156 
10157   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10158       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10159 
10160   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10161                                 &Hints, IAI);
10162   // Use the planner for outer loop vectorization.
10163   // TODO: CM is not used at this point inside the planner. Turn CM into an
10164   // optional argument if we don't need it in the future.
10165   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10166                                Requirements, ORE);
10167 
10168   // Get user vectorization factor.
10169   ElementCount UserVF = Hints.getWidth();
10170 
10171   CM.collectElementTypesForWidening();
10172 
10173   // Plan how to best vectorize, return the best VF and its cost.
10174   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10175 
10176   // If we are stress testing VPlan builds, do not attempt to generate vector
10177   // code. Masked vector code generation support will follow soon.
10178   // Also, do not attempt to vectorize if no vector code will be produced.
10179   if (VPlanBuildStressTest || EnableVPlanPredication ||
10180       VectorizationFactor::Disabled() == VF)
10181     return false;
10182 
10183   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10184 
10185   {
10186     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10187                              F->getParent()->getDataLayout());
10188     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10189                            &CM, BFI, PSI, Checks);
10190     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10191                       << L->getHeader()->getParent()->getName() << "\"\n");
10192     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10193   }
10194 
10195   // Mark the loop as already vectorized to avoid vectorizing again.
10196   Hints.setAlreadyVectorized();
10197   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10198   return true;
10199 }
10200 
10201 // Emit a remark if there are stores to floats that required a floating point
10202 // extension. If the vectorized loop was generated with floating point there
10203 // will be a performance penalty from the conversion overhead and the change in
10204 // the vector width.
10205 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10206   SmallVector<Instruction *, 4> Worklist;
10207   for (BasicBlock *BB : L->getBlocks()) {
10208     for (Instruction &Inst : *BB) {
10209       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10210         if (S->getValueOperand()->getType()->isFloatTy())
10211           Worklist.push_back(S);
10212       }
10213     }
10214   }
10215 
10216   // Traverse the floating point stores upwards searching, for floating point
10217   // conversions.
10218   SmallPtrSet<const Instruction *, 4> Visited;
10219   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10220   while (!Worklist.empty()) {
10221     auto *I = Worklist.pop_back_val();
10222     if (!L->contains(I))
10223       continue;
10224     if (!Visited.insert(I).second)
10225       continue;
10226 
10227     // Emit a remark if the floating point store required a floating
10228     // point conversion.
10229     // TODO: More work could be done to identify the root cause such as a
10230     // constant or a function return type and point the user to it.
10231     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10232       ORE->emit([&]() {
10233         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10234                                           I->getDebugLoc(), L->getHeader())
10235                << "floating point conversion changes vector width. "
10236                << "Mixed floating point precision requires an up/down "
10237                << "cast that will negatively impact performance.";
10238       });
10239 
10240     for (Use &Op : I->operands())
10241       if (auto *OpI = dyn_cast<Instruction>(Op))
10242         Worklist.push_back(OpI);
10243   }
10244 }
10245 
10246 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10247     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10248                                !EnableLoopInterleaving),
10249       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10250                               !EnableLoopVectorization) {}
10251 
10252 bool LoopVectorizePass::processLoop(Loop *L) {
10253   assert((EnableVPlanNativePath || L->isInnermost()) &&
10254          "VPlan-native path is not enabled. Only process inner loops.");
10255 
10256 #ifndef NDEBUG
10257   const std::string DebugLocStr = getDebugLocString(L);
10258 #endif /* NDEBUG */
10259 
10260   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10261                     << L->getHeader()->getParent()->getName() << "\" from "
10262                     << DebugLocStr << "\n");
10263 
10264   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10265 
10266   LLVM_DEBUG(
10267       dbgs() << "LV: Loop hints:"
10268              << " force="
10269              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10270                      ? "disabled"
10271                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10272                             ? "enabled"
10273                             : "?"))
10274              << " width=" << Hints.getWidth()
10275              << " interleave=" << Hints.getInterleave() << "\n");
10276 
10277   // Function containing loop
10278   Function *F = L->getHeader()->getParent();
10279 
10280   // Looking at the diagnostic output is the only way to determine if a loop
10281   // was vectorized (other than looking at the IR or machine code), so it
10282   // is important to generate an optimization remark for each loop. Most of
10283   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10284   // generated as OptimizationRemark and OptimizationRemarkMissed are
10285   // less verbose reporting vectorized loops and unvectorized loops that may
10286   // benefit from vectorization, respectively.
10287 
10288   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10289     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10290     return false;
10291   }
10292 
10293   PredicatedScalarEvolution PSE(*SE, *L);
10294 
10295   // Check if it is legal to vectorize the loop.
10296   LoopVectorizationRequirements Requirements;
10297   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10298                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10299   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10300     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10301     Hints.emitRemarkWithHints();
10302     return false;
10303   }
10304 
10305   // Check the function attributes and profiles to find out if this function
10306   // should be optimized for size.
10307   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10308       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10309 
10310   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10311   // here. They may require CFG and instruction level transformations before
10312   // even evaluating whether vectorization is profitable. Since we cannot modify
10313   // the incoming IR, we need to build VPlan upfront in the vectorization
10314   // pipeline.
10315   if (!L->isInnermost())
10316     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10317                                         ORE, BFI, PSI, Hints, Requirements);
10318 
10319   assert(L->isInnermost() && "Inner loop expected.");
10320 
10321   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10322   // count by optimizing for size, to minimize overheads.
10323   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10324   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10325     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10326                       << "This loop is worth vectorizing only if no scalar "
10327                       << "iteration overheads are incurred.");
10328     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10329       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10330     else {
10331       LLVM_DEBUG(dbgs() << "\n");
10332       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10333     }
10334   }
10335 
10336   // Check the function attributes to see if implicit floats are allowed.
10337   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10338   // an integer loop and the vector instructions selected are purely integer
10339   // vector instructions?
10340   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10341     reportVectorizationFailure(
10342         "Can't vectorize when the NoImplicitFloat attribute is used",
10343         "loop not vectorized due to NoImplicitFloat attribute",
10344         "NoImplicitFloat", ORE, L);
10345     Hints.emitRemarkWithHints();
10346     return false;
10347   }
10348 
10349   // Check if the target supports potentially unsafe FP vectorization.
10350   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10351   // for the target we're vectorizing for, to make sure none of the
10352   // additional fp-math flags can help.
10353   if (Hints.isPotentiallyUnsafe() &&
10354       TTI->isFPVectorizationPotentiallyUnsafe()) {
10355     reportVectorizationFailure(
10356         "Potentially unsafe FP op prevents vectorization",
10357         "loop not vectorized due to unsafe FP support.",
10358         "UnsafeFP", ORE, L);
10359     Hints.emitRemarkWithHints();
10360     return false;
10361   }
10362 
10363   bool AllowOrderedReductions;
10364   // If the flag is set, use that instead and override the TTI behaviour.
10365   if (ForceOrderedReductions.getNumOccurrences() > 0)
10366     AllowOrderedReductions = ForceOrderedReductions;
10367   else
10368     AllowOrderedReductions = TTI->enableOrderedReductions();
10369   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10370     ORE->emit([&]() {
10371       auto *ExactFPMathInst = Requirements.getExactFPInst();
10372       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10373                                                  ExactFPMathInst->getDebugLoc(),
10374                                                  ExactFPMathInst->getParent())
10375              << "loop not vectorized: cannot prove it is safe to reorder "
10376                 "floating-point operations";
10377     });
10378     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10379                          "reorder floating-point operations\n");
10380     Hints.emitRemarkWithHints();
10381     return false;
10382   }
10383 
10384   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10385   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10386 
10387   // If an override option has been passed in for interleaved accesses, use it.
10388   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10389     UseInterleaved = EnableInterleavedMemAccesses;
10390 
10391   // Analyze interleaved memory accesses.
10392   if (UseInterleaved) {
10393     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10394   }
10395 
10396   // Use the cost model.
10397   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10398                                 F, &Hints, IAI);
10399   CM.collectValuesToIgnore();
10400   CM.collectElementTypesForWidening();
10401 
10402   // Use the planner for vectorization.
10403   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10404                                Requirements, ORE);
10405 
10406   // Get user vectorization factor and interleave count.
10407   ElementCount UserVF = Hints.getWidth();
10408   unsigned UserIC = Hints.getInterleave();
10409 
10410   // Plan how to best vectorize, return the best VF and its cost.
10411   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10412 
10413   VectorizationFactor VF = VectorizationFactor::Disabled();
10414   unsigned IC = 1;
10415 
10416   if (MaybeVF) {
10417     VF = *MaybeVF;
10418     // Select the interleave count.
10419     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10420   }
10421 
10422   // Identify the diagnostic messages that should be produced.
10423   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10424   bool VectorizeLoop = true, InterleaveLoop = true;
10425   if (VF.Width.isScalar()) {
10426     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10427     VecDiagMsg = std::make_pair(
10428         "VectorizationNotBeneficial",
10429         "the cost-model indicates that vectorization is not beneficial");
10430     VectorizeLoop = false;
10431   }
10432 
10433   if (!MaybeVF && UserIC > 1) {
10434     // Tell the user interleaving was avoided up-front, despite being explicitly
10435     // requested.
10436     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10437                          "interleaving should be avoided up front\n");
10438     IntDiagMsg = std::make_pair(
10439         "InterleavingAvoided",
10440         "Ignoring UserIC, because interleaving was avoided up front");
10441     InterleaveLoop = false;
10442   } else if (IC == 1 && UserIC <= 1) {
10443     // Tell the user interleaving is not beneficial.
10444     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10445     IntDiagMsg = std::make_pair(
10446         "InterleavingNotBeneficial",
10447         "the cost-model indicates that interleaving is not beneficial");
10448     InterleaveLoop = false;
10449     if (UserIC == 1) {
10450       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10451       IntDiagMsg.second +=
10452           " and is explicitly disabled or interleave count is set to 1";
10453     }
10454   } else if (IC > 1 && UserIC == 1) {
10455     // Tell the user interleaving is beneficial, but it explicitly disabled.
10456     LLVM_DEBUG(
10457         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10458     IntDiagMsg = std::make_pair(
10459         "InterleavingBeneficialButDisabled",
10460         "the cost-model indicates that interleaving is beneficial "
10461         "but is explicitly disabled or interleave count is set to 1");
10462     InterleaveLoop = false;
10463   }
10464 
10465   // Override IC if user provided an interleave count.
10466   IC = UserIC > 0 ? UserIC : IC;
10467 
10468   // Emit diagnostic messages, if any.
10469   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10470   if (!VectorizeLoop && !InterleaveLoop) {
10471     // Do not vectorize or interleaving the loop.
10472     ORE->emit([&]() {
10473       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10474                                       L->getStartLoc(), L->getHeader())
10475              << VecDiagMsg.second;
10476     });
10477     ORE->emit([&]() {
10478       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10479                                       L->getStartLoc(), L->getHeader())
10480              << IntDiagMsg.second;
10481     });
10482     return false;
10483   } else if (!VectorizeLoop && InterleaveLoop) {
10484     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10485     ORE->emit([&]() {
10486       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10487                                         L->getStartLoc(), L->getHeader())
10488              << VecDiagMsg.second;
10489     });
10490   } else if (VectorizeLoop && !InterleaveLoop) {
10491     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10492                       << ") in " << DebugLocStr << '\n');
10493     ORE->emit([&]() {
10494       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10495                                         L->getStartLoc(), L->getHeader())
10496              << IntDiagMsg.second;
10497     });
10498   } else if (VectorizeLoop && InterleaveLoop) {
10499     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10500                       << ") in " << DebugLocStr << '\n');
10501     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10502   }
10503 
10504   bool DisableRuntimeUnroll = false;
10505   MDNode *OrigLoopID = L->getLoopID();
10506   {
10507     // Optimistically generate runtime checks. Drop them if they turn out to not
10508     // be profitable. Limit the scope of Checks, so the cleanup happens
10509     // immediately after vector codegeneration is done.
10510     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10511                              F->getParent()->getDataLayout());
10512     if (!VF.Width.isScalar() || IC > 1)
10513       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10514 
10515     using namespace ore;
10516     if (!VectorizeLoop) {
10517       assert(IC > 1 && "interleave count should not be 1 or 0");
10518       // If we decided that it is not legal to vectorize the loop, then
10519       // interleave it.
10520       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10521                                  &CM, BFI, PSI, Checks);
10522 
10523       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10524       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10525 
10526       ORE->emit([&]() {
10527         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10528                                   L->getHeader())
10529                << "interleaved loop (interleaved count: "
10530                << NV("InterleaveCount", IC) << ")";
10531       });
10532     } else {
10533       // If we decided that it is *legal* to vectorize the loop, then do it.
10534 
10535       // Consider vectorizing the epilogue too if it's profitable.
10536       VectorizationFactor EpilogueVF =
10537           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10538       if (EpilogueVF.Width.isVector()) {
10539 
10540         // The first pass vectorizes the main loop and creates a scalar epilogue
10541         // to be vectorized by executing the plan (potentially with a different
10542         // factor) again shortly afterwards.
10543         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10544         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10545                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10546 
10547         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10548         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10549                         DT);
10550         ++LoopsVectorized;
10551 
10552         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10553         formLCSSARecursively(*L, *DT, LI, SE);
10554 
10555         // Second pass vectorizes the epilogue and adjusts the control flow
10556         // edges from the first pass.
10557         EPI.MainLoopVF = EPI.EpilogueVF;
10558         EPI.MainLoopUF = EPI.EpilogueUF;
10559         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10560                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10561                                                  Checks);
10562 
10563         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10564         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10565                         DT);
10566         ++LoopsEpilogueVectorized;
10567 
10568         if (!MainILV.areSafetyChecksAdded())
10569           DisableRuntimeUnroll = true;
10570       } else {
10571         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10572                                &LVL, &CM, BFI, PSI, Checks);
10573 
10574         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10575         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10576         ++LoopsVectorized;
10577 
10578         // Add metadata to disable runtime unrolling a scalar loop when there
10579         // are no runtime checks about strides and memory. A scalar loop that is
10580         // rarely used is not worth unrolling.
10581         if (!LB.areSafetyChecksAdded())
10582           DisableRuntimeUnroll = true;
10583       }
10584       // Report the vectorization decision.
10585       ORE->emit([&]() {
10586         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10587                                   L->getHeader())
10588                << "vectorized loop (vectorization width: "
10589                << NV("VectorizationFactor", VF.Width)
10590                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10591       });
10592     }
10593 
10594     if (ORE->allowExtraAnalysis(LV_NAME))
10595       checkMixedPrecision(L, ORE);
10596   }
10597 
10598   Optional<MDNode *> RemainderLoopID =
10599       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10600                                       LLVMLoopVectorizeFollowupEpilogue});
10601   if (RemainderLoopID.hasValue()) {
10602     L->setLoopID(RemainderLoopID.getValue());
10603   } else {
10604     if (DisableRuntimeUnroll)
10605       AddRuntimeUnrollDisableMetaData(L);
10606 
10607     // Mark the loop as already vectorized to avoid vectorizing again.
10608     Hints.setAlreadyVectorized();
10609   }
10610 
10611   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10612   return true;
10613 }
10614 
10615 LoopVectorizeResult LoopVectorizePass::runImpl(
10616     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10617     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10618     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10619     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10620     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10621   SE = &SE_;
10622   LI = &LI_;
10623   TTI = &TTI_;
10624   DT = &DT_;
10625   BFI = &BFI_;
10626   TLI = TLI_;
10627   AA = &AA_;
10628   AC = &AC_;
10629   GetLAA = &GetLAA_;
10630   DB = &DB_;
10631   ORE = &ORE_;
10632   PSI = PSI_;
10633 
10634   // Don't attempt if
10635   // 1. the target claims to have no vector registers, and
10636   // 2. interleaving won't help ILP.
10637   //
10638   // The second condition is necessary because, even if the target has no
10639   // vector registers, loop vectorization may still enable scalar
10640   // interleaving.
10641   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10642       TTI->getMaxInterleaveFactor(1) < 2)
10643     return LoopVectorizeResult(false, false);
10644 
10645   bool Changed = false, CFGChanged = false;
10646 
10647   // The vectorizer requires loops to be in simplified form.
10648   // Since simplification may add new inner loops, it has to run before the
10649   // legality and profitability checks. This means running the loop vectorizer
10650   // will simplify all loops, regardless of whether anything end up being
10651   // vectorized.
10652   for (auto &L : *LI)
10653     Changed |= CFGChanged |=
10654         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10655 
10656   // Build up a worklist of inner-loops to vectorize. This is necessary as
10657   // the act of vectorizing or partially unrolling a loop creates new loops
10658   // and can invalidate iterators across the loops.
10659   SmallVector<Loop *, 8> Worklist;
10660 
10661   for (Loop *L : *LI)
10662     collectSupportedLoops(*L, LI, ORE, Worklist);
10663 
10664   LoopsAnalyzed += Worklist.size();
10665 
10666   // Now walk the identified inner loops.
10667   while (!Worklist.empty()) {
10668     Loop *L = Worklist.pop_back_val();
10669 
10670     // For the inner loops we actually process, form LCSSA to simplify the
10671     // transform.
10672     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10673 
10674     Changed |= CFGChanged |= processLoop(L);
10675   }
10676 
10677   // Process each loop nest in the function.
10678   return LoopVectorizeResult(Changed, CFGChanged);
10679 }
10680 
10681 PreservedAnalyses LoopVectorizePass::run(Function &F,
10682                                          FunctionAnalysisManager &AM) {
10683     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10684     auto &LI = AM.getResult<LoopAnalysis>(F);
10685     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10686     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10687     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10688     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10689     auto &AA = AM.getResult<AAManager>(F);
10690     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10691     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10692     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10693 
10694     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10695     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10696         [&](Loop &L) -> const LoopAccessInfo & {
10697       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10698                                         TLI, TTI, nullptr, nullptr, nullptr};
10699       return LAM.getResult<LoopAccessAnalysis>(L, AR);
10700     };
10701     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10702     ProfileSummaryInfo *PSI =
10703         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10704     LoopVectorizeResult Result =
10705         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10706     if (!Result.MadeAnyChange)
10707       return PreservedAnalyses::all();
10708     PreservedAnalyses PA;
10709 
10710     // We currently do not preserve loopinfo/dominator analyses with outer loop
10711     // vectorization. Until this is addressed, mark these analyses as preserved
10712     // only for non-VPlan-native path.
10713     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10714     if (!EnableVPlanNativePath) {
10715       PA.preserve<LoopAnalysis>();
10716       PA.preserve<DominatorTreeAnalysis>();
10717     }
10718 
10719     if (Result.MadeCFGChange) {
10720       // Making CFG changes likely means a loop got vectorized. Indicate that
10721       // extra simplification passes should be run.
10722       // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10723       // be run if runtime checks have been added.
10724       AM.getResult<ShouldRunExtraVectorPasses>(F);
10725       PA.preserve<ShouldRunExtraVectorPasses>();
10726     } else {
10727       PA.preserveSet<CFGAnalyses>();
10728     }
10729     return PA;
10730 }
10731 
10732 void LoopVectorizePass::printPipeline(
10733     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10734   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10735       OS, MapClassName2PassName);
10736 
10737   OS << "<";
10738   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10739   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10740   OS << ">";
10741 }
10742