//===- LoopVectorizationLegality.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides loop vectorization legality analysis. Original code
// resided in LoopVectorize.cpp for a long time.
//
// At this point, it is implemented as a utility class, not as an analysis
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;
using namespace PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

static cl::opt<bool>
AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
                       cl::desc("Enable recognition of non-constant strided "
                                "pointer induction variables."));

namespace llvm {
cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));
} // namespace llvm

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

static cl::opt<LoopVectorizeHints::ScalableForceKind>
    ForceScalableVectorization(
        "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
        cl::Hidden,
        cl::desc("Control whether the compiler can use scalable vectors to "
                 "vectorize a loop"),
        cl::values(
            clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
                       "Scalable vectorization is disabled."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "preferred",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "on",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive.")));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

namespace llvm {

bool LoopVectorizeHints::Hint::validate(unsigned Val) {
  switch (Kind) {
  case HK_WIDTH:
    return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
  case HK_INTERLEAVE:
    return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
  case HK_FORCE:
    return (Val <= 1);
  case HK_ISVECTORIZED:
  case HK_PREDICATE:
  case HK_SCALABLE:
    return (Val == 0 || Val == 1);
  }
  return false;
}

LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  //  - Target default
  //  - Metadata width
  //  - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}

void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

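  // Mark the loop with "llvm.loop.isvectorized" and drop the now-consumed
  // vectorize./interleave. hints; the resulting loop ID looks roughly like
  // this illustrative sketch:
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.isvectorized", i32 1}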
  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      {MDString::get(Context, "llvm.loop.isvectorized"),
       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, LoopID,
                                     {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);

  // Update internal cache.
  IsVectorized.Value = 1;
}

bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass and errors
    // to differentiate between disabled vectorization and a width of 1.
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                        "AllDisabled", L->getStartLoc(),
                                        L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}

void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit([&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";

    OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
                               TheLoop->getHeader());
    R << "loop not vectorized";
    if (Force.Value == LoopVectorizeHints::FK_Enabled) {
      R << " (Force=" << NV("Force", true);
      if (Width.Value != 0)
        R << ", Vector Width=" << NV("VectorWidth", getWidth());
      if (getInterleave() != 0)
        R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
      R << ")";
    }
    return R;
  });
}

const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
  if (getWidth() == ElementCount::getFixed(1))
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Disabled)
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
    return LV_NAME;
  return OptimizationRemarkAnalysis::AlwaysPrint;
}

bool LoopVectorizeHints::allowReordering() const {
  // Allow the vectorizer to change the order of operations if loop hints
  // that enable vectorization are provided.
  ElementCount EC = getWidth();
  return HintsAllowReordering &&
         (getForce() == LoopVectorizeHints::FK_Enabled ||
          EC.getKnownMinValue() > 1);
}

void LoopVectorizeHints::getHintsFromMetadata() {
  MDNode *LoopID = TheLoop->getLoopID();
  if (!LoopID)
    return;

  // First operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

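  // A typical hint operand has the form (illustrative):
  //   !{!"llvm.loop.vectorize.width", i32 4}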
  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    const MDString *S = nullptr;
    SmallVector<Metadata *, 4> Args;

    // The expected hint is either an MDString or an MDNode whose first
    // operand is an MDString.
    if (const MDNode *MD = dyn_cast<MDNode>(MDO)) {
      if (!MD || MD->getNumOperands() == 0)
        continue;
      S = dyn_cast<MDString>(MD->getOperand(0));
      for (unsigned Idx = 1; Idx < MD->getNumOperands(); ++Idx)
        Args.push_back(MD->getOperand(Idx));
    } else {
      S = dyn_cast<MDString>(MDO);
      assert(Args.size() == 0 && "too many arguments for MDString");
    }

    if (!S)
      continue;

    // Check if the hint starts with the loop metadata prefix.
    StringRef Name = S->getString();
    if (Args.size() == 1)
      setHint(Name, Args[0]);
  }
}

void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  if (!Name.starts_with(Prefix()))
    return;
  Name = Name.substr(Prefix().size(), StringRef::npos);

  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
  if (!C)
    return;
  unsigned Val = C->getZExtValue();

  Hint *Hints[] = {&Width,        &Interleave, &Force,
                   &IsVectorized, &Predicate,  &Scalable};
  for (auto *H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
        H->Value = Val;
      else
        LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
      break;
    }
  }
}

// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
// very constrained for now but it will be relaxed in the future. \p Lp is
// considered uniform if it meets all the following conditions:
//   1) it has a canonical IV (starting from 0 and with stride 1),
//   2) its latch terminator is a conditional branch and,
//   3) its latch condition is a compare instruction whose operands are the
//      canonical IV and an OuterLp invariant.
// This check doesn't take into account the uniformity of other conditions not
// related to the loop latch because they don't affect the loop uniformity.
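//
// For example, in the following sketch the inner loop Lp is uniform w.r.t.
// OuterLp, assuming N is OuterLp-invariant (illustrative pseudo-code, not
// taken from a test):
//
//   for (i = 0; i < M; i++)       // OuterLp
//     for (j = 0; j < N; j++)     // Lp: canonical IV j; the latch compares
//       a[i][j] += b[j];          //     the IV update against the invariant N.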
//
// NOTE: We decided to keep all these checks and their associated documentation
// together so that we can easily have a picture of the current supported loop
// nests. However, some of the current checks don't depend on \p OuterLp and
// would be redundantly executed for each \p Lp if we invoked this function for
// different candidate outer loops. This is not the case for now because we
// don't currently have the infrastructure to evaluate multiple candidate outer
// loops and \p OuterLp will be a fixed parameter while we only support explicit
// outer loop vectorization. It's also very likely that these checks go away
// before introducing the aforementioned infrastructure. However, if this is not
// the case, we should move the \p OuterLp independent checks to a separate
// function that is only executed once for each \p Lp.
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1.
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBr || LatchBr->isUnconditional()) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3.
  auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  Value *CondOp0 = LatchCmp->getOperand(0);
  Value *CondOp1 = LatchCmp->getOperand(1);
  Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}

// Return true if \p Lp and all its nested loops are uniform with regard to \p
// OuterLp.
static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
  if (!isUniformLoop(Lp, OuterLp))
    return false;

  // Check if nested loops are uniform.
  for (Loop *SubLp : *Lp)
    if (!isUniformLoopNest(SubLp, OuterLp))
      return false;

  return true;
}

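/// Map \p Ty to an integer type usable for trip-count arithmetic: pointers
/// become the target's intptr type, and integer types narrower than 32 bits
/// (e.g. i8 or i16) are widened to i32.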
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that chars or shorts overflow when we ask for the loop's
  // trip count; work around this by changing the type size.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}

static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}

/// Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &AllowedExit) {
  // Reductions, inductions and non-header phis are allowed to have exit
  // users. All other instructions must not have external users.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the loop are inside the BB.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}

/// Returns true if A and B have the same pointer operand or the same address
/// SCEV.
static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
                               StoreInst *B) {
  // Compare store
  if (A == B)
    return true;

  // Otherwise compare pointers
  Value *APtr = A->getPointerOperand();
  Value *BPtr = B->getPointerOperand();
  if (APtr == BPtr)
    return true;

  // Otherwise compare address SCEVs
  return SE->getSCEV(APtr) == SE->getSCEV(BPtr);
}

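/// Returns 1 if \p Ptr advances by exactly one element per iteration (e.g.
/// a[i]), -1 if it moves backwards by one element per iteration (e.g.
/// a[n - i]), and 0 for any other (or unknown) stride.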
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                Value *Ptr) const {
  // FIXME: Currently, the set of symbolic strides is sometimes queried before
  // it's collected.  This happens from canVectorizeWithIfConvert, when the
  // pointer is checked to reference consecutive elements suitable for a
  // masked access.
  const auto &Strides =
    LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();

  Function *F = TheLoop->getHeader()->getParent();
  bool OptForSize = F->hasOptSize() ||
                    llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
                                                PGSOQueryType::IRPass);
  bool CanAddPredicate = !OptForSize;
  int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
                            CanAddPredicate, false).value_or(0);
  if (Stride == 1 || Stride == -1)
    return Stride;
  return 0;
}

bool LoopVectorizationLegality::isInvariant(Value *V) const {
  return LAI->isInvariant(V);
}

namespace {
/// A rewriter to build the SCEVs for each of the VF lanes in the expected
/// vectorized loop, which can then be compared to detect their uniformity. This
/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
/// Step is added. Also checks if all sub-expressions are analyzable w.r.t.
/// uniformity.
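///
/// For example, rewriting the scalar AddRec {0,+,1} with StepMultiplier = 4
/// (the VF) and Offset = 2 yields {2,+,4}, i.e. the per-vector-iteration SCEV
/// of lane 2.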
class SCEVAddRecForUniformityRewriter
    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
  /// Multiplier to be applied to the step of AddRecs in TheLoop.
  unsigned StepMultiplier;

  /// Offset to be added to the AddRecs in TheLoop.
  unsigned Offset;

  /// Loop for which to rewrite AddRecs.
  Loop *TheLoop;

  /// Set to true if any sub-expression is not analyzable w.r.t. uniformity.
  bool CannotAnalyze = false;

  bool canAnalyze() const { return !CannotAnalyze; }

public:
  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
                                  unsigned Offset, Loop *TheLoop)
      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
        TheLoop(TheLoop) {}

  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
    assert(Expr->getLoop() == TheLoop &&
           "addrec outside of TheLoop must be invariant and should have been "
           "handled earlier");
    // Build a new AddRec by multiplying the step by StepMultiplier and
    // incrementing the start by Offset * step.
    Type *Ty = Expr->getType();
    auto *Step = Expr->getStepRecurrence(SE);
    if (!SE.isLoopInvariant(Step, TheLoop)) {
      CannotAnalyze = true;
      return Expr;
    }
    auto *NewStep = SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
    auto *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
    auto *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
  }

  const SCEV *visit(const SCEV *S) {
    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
      return S;
    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
  }

  const SCEV *visitUnknown(const SCEVUnknown *S) {
    if (SE.isLoopInvariant(S, TheLoop))
      return S;
    // The value could vary across iterations.
    CannotAnalyze = true;
    return S;
  }

  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
    // Could not analyze the expression.
    CannotAnalyze = true;
    return S;
  }

  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
                             unsigned StepMultiplier, unsigned Offset,
                             Loop *TheLoop) {
    /// Bail out if the expression does not contain a UDiv expression.
    /// Uniform values which are not loop invariant require operations to strip
    /// out the lowest bits. For now just look for UDivs and use that to avoid
    /// re-writing UDiv-free expressions for other lanes to limit compile time.
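    /// For example, with VF 4 the scalar IV i takes the values 0, 4, 8, ...
    /// and a vector iteration covers lanes i .. i+3, so (i + K) /u 4 equals
    /// i /u 4 for K < 4 and the UDiv is uniform across the lanes.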
    if (!SCEVExprContains(S,
                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
      return SE.getCouldNotCompute();

    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                             TheLoop);
    const SCEV *Result = Rewriter.visit(S);

    if (Rewriter.canAnalyze())
      return Result;
    return SE.getCouldNotCompute();
  }
};

} // namespace

bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
  if (isInvariant(V))
    return true;
  if (VF.isScalable())
    return false;
  if (VF.isScalar())
    return true;

  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  auto *SE = PSE.getSE();
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);

  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
  // lane 0 matches the expressions for all other lanes.
  unsigned FixedVF = VF.getKnownMinValue();
  const SCEV *FirstLaneExpr =
      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
    return false;

  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
  // lane 0. We check lanes in reverse order for compile-time, as frequently
  // checking the last lane is sufficient to rule out uniformity.
  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
    const SCEV *IthLaneExpr =
        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
    return FirstLaneExpr == IthLaneExpr;
  });
}

bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                               ElementCount VF) const {
  Value *Ptr = getLoadStorePointerOperand(&I);
  if (!Ptr)
    return false;
  // Note: There's nothing inherent which prevents predicated loads and
  // stores from being uniform.  The current lowering simply doesn't handle
  // it; in particular, the cost model distinguishes scatter/gather from
  // scalar w/predication, and we currently rely on the scalar path.
  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}

bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a BranchInst. Any other terminator is
    // not supported yet.
    auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br) {
      reportVectorizationFailure("Unsupported basic block terminator",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the BranchInst is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
    if (Br && Br->isConditional() &&
        !TheLoop->isLoopInvariant(Br->getCondition()) &&
        !LI->isLoopHeader(Br->getSuccessor(0)) &&
        !LI->isLoopHeader(Br->getSuccessor(1))) {
      reportVectorizationFailure("Unsupported conditional branch",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loop scenarios with uniform nested loops.
  if (!isUniformLoopNest(TheLoop /*loop nest*/,
                         TheLoop /*context outer loop*/)) {
    reportVectorizationFailure("Outer loop contains divergent loops",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure("Unsupported outer loop Phi(s)",
                               "Unsupported outer loop Phi(s)",
                               "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be recorded
  // here for ignoring, but it suffices to record only the first (as it is the
  // only one that may be used outside the cast sequence).
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(*Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getDataLayout();

  // Get the widest type.
  if (!PhiTy->isFloatingPointTy()) {
    if (!WidestIndTy)
      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
    else
      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for more
  // details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Phi);
    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}

bool LoopVectorizationLegality::setupOuterLoopInductions() {
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if a given Phi is a supported induction.
  auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
        ID.getKind() == InductionDescriptor::IK_IntInduction) {
      addInductionPhi(&Phi, ID, AllowedExit);
      return true;
    }
    // Bail out for any Phi in the outer loop header that is not a supported
    // induction.
    LLVM_DEBUG(
        dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
    return false;
  };

  return llvm::all_of(Header->phis(), IsSupportedPhi);
}

/// Checks if a function is scalarizable according to the TLI, in
/// the sense that it should be vectorized and then expanded in
/// multiple scalar calls. This is represented in the
/// TLI via mappings that do not specify a vector name, as in the
/// following example:
///
///    const VecDesc VecIntrinsics[] = {
///      {"llvm.phx.abs.i32", "", 4}
///    };
static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
  const StringRef ScalarName = CI.getCalledFunction()->getName();
  bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
  // Check that all known VFs are not associated with a vector
  // function, i.e. the vector name is empty.
  if (Scalarize) {
    ElementCount WidestFixedVF, WidestScalableVF;
    TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);
    for (ElementCount VF = ElementCount::getFixed(2);
         ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    for (ElementCount VF = ElementCount::getScalable(1);
         ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    assert((WidestScalableVF.isZero() || !Scalarize) &&
           "Caller may decide to scalarize a variant using a scalable VF");
  }
  return Scalarize;
}

bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          reportVectorizationFailure("Found a non-int non-pointer PHI",
                                     "loop control flow is not understood by vectorizer",
                                     "CFGNotUnderstood", ORE, TheLoop);
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (BB != Header) {
          // Non-header phi nodes that have outside uses can be vectorized. Add
          // them to the list of allowed exits.
          // Unsafe cyclic dependencies with header phis are identified during
          // legalization for reduction, induction and fixed order
          // recurrences.
          AllowedExit.insert(&I);
          continue;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          reportVectorizationFailure("Found an invalid PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop, Phi);
          return false;
        }

        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
                                                 DT, PSE.getSE())) {
          Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        // We prevent matching non-constant strided pointer IVs to preserve
        // historical vectorizer behavior after a generalization of the
        // IVDescriptor code.  The intent is to remove this check, but we
        // have to fix issues around code quality for such loops first.
        auto IsDisallowedStridedPointerInduction =
            [](const InductionDescriptor &ID) {
              if (AllowStridedPointerIVs)
                return false;
              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
                     ID.getConstIntStepValue() == nullptr;
            };

        // TODO: Instead of recording the AllowedExit, it would be good to
        // record the complementary set: NotAllowedExit. These include (but may
        // not be limited to):
        // 1. Reduction phis as they represent the one-before-last value, which
        // is not available when vectorized
        // 2. Induction phis and increment when SCEV predicates cannot be used
        // outside the loop - see addInductionPhi
        // 3. Non-Phis with outside uses when SCEV predicates cannot be used
        // outside the loop - see call to hasOutsideLoopUser in the non-phi
        // handling below
        // 4. FixedOrderRecurrence phis that can possibly be handled by
        // extraction.
        // By recording these, we can then reason about ways to vectorize each
        // of these NotAllowedExit.
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
            !IsDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          Requirements->addExactFPMathInst(ID.getExactFPMathInst());
          continue;
        }

        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
          AllowedExit.insert(Phi);
          FixedOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to an AddRec expression
        // and retry classifying it as an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
            !IsDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        reportVectorizationFailure("Found an unidentified PHI",
            "value that could not be identified as "
            "reduction is used outside the loop",
            "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
        return false;
      } // end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);

      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            (!VFDatabase::getMappings(*CI).empty() ||
             isTLIScalarize(*TLI, *CI)))) {
        // If the call is a recognized math library call, it is likely that
        // we can vectorize it given loosened floating-point constraints.
        LibFunc Func;
        bool IsMathLibCall =
            TLI && CI->getCalledFunction() &&
            CI->getType()->isFloatingPointTy() &&
            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
            TLI->hasOptimizedCodeGen(Func);

        if (IsMathLibCall) {
          // TODO: Ideally, we should not use clang-specific language here,
          // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be part
          // of the returned info from isFunctionVectorizable()?
          reportVectorizationFailure(
              "Found a non-intrinsic callsite",
              "library call cannot be vectorized. "
              "Try compiling with -fno-math-errno, -ffast-math, "
              "or similar flags",
              "CantVectorizeLibcall", ORE, TheLoop, CI);
        } else {
          reportVectorizationFailure("Found a non-intrinsic callsite",
                                     "call instruction cannot be vectorized",
                                     "CantVectorizeLibcall", ORE, TheLoop, CI);
        }
        return false;
      }

      // Some intrinsics have scalar arguments, which must stay scalar and
      // therefore be loop invariant in order for the call to be vectorized.
      if (CI) {
        auto *SE = PSE.getSE();
        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
        for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx)) {
            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
                                     TheLoop)) {
              reportVectorizationFailure("Found unvectorizable intrinsic",
                  "intrinsic instruction cannot be vectorized",
                  "CantVectorizeIntrinsic", ORE, TheLoop, CI);
              return false;
            }
          }
      }

      // If we found a vectorized variant of a function, note that so LV can
      // make better decisions about maximum VF.
      if (CI && !VFDatabase::getMappings(*CI).empty())
        VecCallVariantsFound = true;

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          isa<ExtractElementInst>(I)) {
        reportVectorizationFailure("Found unvectorizable type",
            "instruction return type cannot be vectorized",
            "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          reportVectorizationFailure("Store instruction cannot be vectorized",
                                     "store instruction cannot be vectorized",
                                     "CantVectorizeStore", ORE, TheLoop, ST);
          return false;
        }

        // For nontemporal stores, check that a nontemporal vector version is
        // supported on the target.
        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
          // Arbitrarily try a vector of 2 elements.
          auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of stored type");
          if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
            reportVectorizationFailure(
                "nontemporal store instruction cannot be vectorized",
                "nontemporal store instruction cannot be vectorized",
                "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
            return false;
          }
        }

      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
          // For nontemporal loads, check that a nontemporal vector version is
          // supported on the target (arbitrarily try a vector of 2 elements).
          auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of load type");
          if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
            reportVectorizationFailure(
                "nontemporal load instruction cannot be vectorized",
                "nontemporal load instruction cannot be vectorized",
                "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
            return false;
          }
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
        // operations, shuffles, or casts, as they don't change precision or
        // semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.isFast()) {
        LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        // We can safely vectorize loops where instructions within the loop are
        // used outside the loop only if the SCEV predicates within the loop
        // are the same as outside the loop. Allowing the exit means reusing
        // the SCEV outside the loop.
        if (PSE.getPredicate().isAlwaysTrue()) {
          AllowedExit.insert(&I);
          continue;
        }
        reportVectorizationFailure("Value cannot be used outside the loop",
                                   "value cannot be used outside the loop",
                                   "ValueUsedOutsideLoop", ORE, TheLoop, &I);
        return false;
      }
    } // next instr.
  }

  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure("Did not find one integer induction var",
          "loop induction variable could not be identified",
          "NoInductionVariable", ORE, TheLoop);
      return false;
    }
    if (!WidestIndTy) {
      reportVectorizationFailure("Did not find one integer induction var",
          "integer loop induction variable could not be identified",
          "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    }
    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return true;
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(*TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory())
    return false;

  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
    reportVectorizationFailure("We don't allow storing to uniform addresses",
                               "write to a loop invariant address could not "
                               "be vectorized",
                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
                               TheLoop);
    return false;
  }

  // We can vectorize stores to an invariant address when the final reduction
  // value is guaranteed to be stored at the end of the loop. Also, if the
  // decision to vectorize the loop is made, runtime checks are added to make
  // sure that the invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check if the last stored value is
    // unconditional and the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      if (blockNeedsPredication(SI->getParent())) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // Invariant address should be defined outside of the loop. The LICM pass
      // usually makes sure this happens, but in rare cases it does not; we do
      // not want to overcomplicate vectorization to support this case.
      if (Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) {
        if (TheLoop->contains(Ptr)) {
          reportVectorizationFailure(
              "Invariant address is calculated inside the loop",
              "write to a loop invariant address could not "
              "be vectorized",
              "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check its last stored value is the result
      // of one of our reductions.
      //
      // We do not check if dependence with loads exists because that is already
      // checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively dead code.
          // With opaque pointers it is possible for one pointer to be used with
          // different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latest store doesn't completely overwrite the first one in the
          // example. That is why we have to make sure that types of stored
          // values are the same.
          // TODO: Check that the bitwidth of an unhandled store is smaller
          // than the one that overwrites it and add a test.
          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, SI, I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write to a loop invariant address could not "
            "be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  PSE.addPredicate(LAI->getPSE().getPredicate());
  return true;
}

bool LoopVectorizationLegality::canVectorizeFPMath(
    bool EnableStrictReductions) {

  // First check if there is any ExactFP math or if we allow reassociations
  if (!Requirements->getExactFPInst() || Hints->allowReordering())
    return true;

  // If the above is false, we have ExactFPMath & do not allow reordering.
  // If the EnableStrictReductions flag is set, first check if we have any
  // Exact FP induction vars, which we cannot vectorize.
  if (!EnableStrictReductions ||
      any_of(getInductionVars(), [&](auto &Induction) -> bool {
        InductionDescriptor IndDesc = Induction.second;
        return IndDesc.getExactFPMathInst();
      }))
    return false;

  // We can now only vectorize if all reductions with Exact FP math also
  // have the isOrdered flag set, which indicates that we can move the
  // reduction operations in-loop.
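  // For example, a strict FP reduction like "sum += a[i]" can still be
  // vectorized as an in-loop, ordered reduction that accumulates the vector
  // lanes in source order (illustrative sketch).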
1173   return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
1174     const RecurrenceDescriptor &RdxDesc = Reduction.second;
1175     return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
1176   }));
1177 }
1178 
1179 bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
1180   return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
1181     const RecurrenceDescriptor &RdxDesc = Reduction.second;
1182     return RdxDesc.IntermediateStore == SI;
1183   });
1184 }
1185 
1186 bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
1187   return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
1188     const RecurrenceDescriptor &RdxDesc = Reduction.second;
1189     if (!RdxDesc.IntermediateStore)
1190       return false;
1191 
1192     ScalarEvolution *SE = PSE.getSE();
1193     Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
1194     return V == InvariantAddress ||
1195            SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
1196   });
1197 }
1198 
1199 bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
1200   Value *In0 = const_cast<Value *>(V);
1201   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
1202   if (!PN)
1203     return false;
1204 
1205   return Inductions.count(PN);
1206 }
1207 
1208 const InductionDescriptor *
1209 LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
1210   if (!isInductionPhi(Phi))
1211     return nullptr;
1212   auto &ID = getInductionVars().find(Phi)->second;
1213   if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
1214       ID.getKind() == InductionDescriptor::IK_FpInduction)
1215     return &ID;
1216   return nullptr;
1217 }
1218 
1219 const InductionDescriptor *
1220 LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
1221   if (!isInductionPhi(Phi))
1222     return nullptr;
1223   auto &ID = getInductionVars().find(Phi)->second;
1224   if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
1225     return &ID;
1226   return nullptr;
1227 }
1228 
1229 bool LoopVectorizationLegality::isCastedInductionVariable(
1230     const Value *V) const {
1231   auto *Inst = dyn_cast<Instruction>(V);
1232   return (Inst && InductionCastsToIgnore.count(Inst));
1233 }
1234 
1235 bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
1236   return isInductionPhi(V) || isCastedInductionVariable(V);
1237 }
1238 
1239 bool LoopVectorizationLegality::isFixedOrderRecurrence(
1240     const PHINode *Phi) const {
1241   return FixedOrderRecurrences.count(Phi);
1242 }
1243 
1244 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
1245   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1246 }
1247 
bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
    SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
  for (Instruction &I : *BB) {
    // We can predicate blocks with calls to assume, as long as we drop them in
    // case we flatten the CFG via predication.
    if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
      MaskedOp.insert(&I);
      continue;
    }

    // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
    // TODO: there might be cases where it should block the vectorization. Let's
    // ignore those for now.
    if (isa<NoAliasScopeDeclInst>(&I))
      continue;

    // We can allow masked calls if there's at least one vector variant, even
    // if we end up scalarizing due to the cost model calculations.
    // TODO: Allow other calls if they have appropriate attributes... readonly
    // and argmemonly?
    if (CallInst *CI = dyn_cast<CallInst>(&I))
      if (VFDatabase::hasMaskedVariant(*CI)) {
        MaskedOp.insert(CI);
        continue;
      }

    // Loads are handled via masking (or speculated if it is safe to do so).
    if (auto *LI = dyn_cast<LoadInst>(&I)) {
      if (!SafePtrs.count(LI->getPointerOperand()))
        MaskedOp.insert(LI);
      continue;
    }

    // A predicated store requires some form of masking:
    // 1) a masked store HW instruction,
    // 2) emulation via load-blend-store (only if safe and legal to do so;
    //    be aware of the race conditions), or
    // 3) an element-by-element predicate check and scalar store.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      MaskedOp.insert(SI);
      continue;
    }

    if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
      return false;
  }

  return true;
}

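// As an illustrative (hypothetical) example, if-conversion turns control flow
// such as
//
//   for (int i = 0; i < n; ++i)
//     if (c[i])
//       x += a[i];
//
// into straight-line code in which the update of x is selected or masked by
// the condition, leaving a loop body the vectorizer can treat as a single
// block.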
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    reportVectorizationFailure("If-conversion is disabled",
                               "if-conversion is disabled",
                               "IfConversionDisabled",
                               ORE, TheLoop);
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers which are known to be dereferenceable within the scope
  // of the loop body on each iteration of the loop that executes. That is,
  // the memory pointed to can be dereferenced (with the access size implied by
  // the value's type) unconditionally within the loop header without
  // introducing a new fault.
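  // For example (illustrative only): if a loop unconditionally executes a
  // load of a[i] on every iteration that runs, as in
  //
  //   for (int i = 0; i < n; ++i)
  //     sum += a[i];
  //
  // then the pointer feeding that load belongs in this set.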
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB)) {
      for (Instruction &I : *BB)
        if (auto *Ptr = getLoadStorePointerOperand(&I))
          SafePointers.insert(Ptr);
      continue;
    }

    // For a block which requires predication, an address may be safe to access
    // in the loop without predication if we can prove dereferenceability facts
    // sufficient to ensure it'll never fault within the loop. For the moment,
    // we restrict this to loads; stores are more complicated due to
    // concurrency restrictions.
    ScalarEvolution &SE = *PSE.getSE();
    for (Instruction &I : *BB) {
      LoadInst *LI = dyn_cast<LoadInst>(&I);
      if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC))
        SafePointers.insert(LI->getPointerOperand());
    }
  }

  // Collect the blocks that need predication.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We don't support switch statements inside loops.
    if (!isa<BranchInst>(BB->getTerminator())) {
      reportVectorizationFailure("Loop contains a switch statement",
                                 "loop contains a switch statement",
                                 "LoopContainsSwitch", ORE, TheLoop,
                                 BB->getTerminator());
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) &&
        !blockCanBePredicated(BB, SafePointers, MaskedOp)) {
      reportVectorizationFailure(
          "Control flow cannot be substituted for a select",
          "control flow cannot be substituted for a select", "NoCFGForSelect",
          ORE, TheLoop, BB->getTerminator());
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}

// Helper function to canVectorizeLoopNestCFG.
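// As an illustrative sketch (block names hypothetical), a loop in the
// canonical form the checks below expect has a dedicated preheader and a
// single backedge:
//
//   preheader:
//     br label %header
//   header:
//     ...
//     br i1 %cond, label %header, label %exit   ; the only backedge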
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                    bool UseVPlanNativePath) {
  assert((UseVPlanNativePath || Lp->isInnermost()) &&
         "VPlan-native path is not enabled.");

  // TODO: ORE should be improved to show more accurate information when an
  // outer loop can't be vectorized because a nested loop is not understood or
  // legal. Something like: "outer_loop_location: loop not vectorized:
  // (inner_loop_location) loop control flow is not understood by vectorizer".

  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  // We must have a loop in canonical form. Loops with indirectbr in them cannot
  // be canonicalized.
  if (!Lp->getLoopPreheader()) {
    reportVectorizationFailure("Loop doesn't have a legal pre-header",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We must have a single backedge.
  if (Lp->getNumBackEdges() != 1) {
    reportVectorizationFailure("The loop must have a single backedge",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

1407 
1408 bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1409     Loop *Lp, bool UseVPlanNativePath) {
1410   // Store the result and return it at the end instead of exiting early, in case
1411   // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1412   bool Result = true;
1413   bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1414   if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1415     if (DoExtraAnalysis)
1416       Result = false;
1417     else
1418       return false;
1419   }
1420 
1421   // Recursively check whether the loop control flow of nested loops is
1422   // understood.
1423   for (Loop *SubLp : *Lp)
1424     if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
1425       if (DoExtraAnalysis)
1426         Result = false;
1427       else
1428         return false;
1429     }
1430 
1431   return Result;
1432 }
1433 
bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;

  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  // Check whether the loop-related control flow in the loop nest is expected
  // by the vectorizer.
  if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
                    << '\n');

  // Specific checks for outer loops. We skip the remaining legal checks at this
  // point because they don't support outer loops.
  if (!TheLoop->isInnermost()) {
    assert(UseVPlanNativePath && "VPlan-native path is not enabled.");

    if (!canVectorizeOuterLoop()) {
      reportVectorizationFailure("Unsupported outer loop",
                                 "unsupported outer loop",
                                 "UnsupportedOuterLoop",
                                 ORE, TheLoop);
      // TODO: Implement DoExtraAnalysis when subsequent legal checks support
      // outer loops.
      return false;
    }

    LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
    return Result;
  }

  assert(TheLoop->isInnermost() && "Inner loop expected.");
  // Check if we can if-convert non-single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

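  // As an illustrative (hypothetical) example, the backedge-taken count is
  // uncomputable for a pointer-chasing loop such as
  //
  //   while (p) p = p->next;
  //
  // because the exit condition does not evolve in a way SCEV can model.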
  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    reportVectorizationFailure("could not determine number of loop iterations",
                               "could not determine number of loop iterations",
                               "CantComputeNumberOfIterations", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                    << (LAI->getRuntimePointerChecking()->Need
                            ? " (with a runtime bound check)"
                            : "")
                    << "!\n");

  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

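  // As an illustrative (hypothetical) example of such a predicate: for a loop
  // like
  //
  //   for (short i = 0; i != n; i += 4)
  //     a[i] = 0;
  //
  // PSE may have assumed that the narrow induction variable does not wrap, and
  // that assumption must be guarded by a runtime overflow check; each such
  // assumption adds to the complexity counted here.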
  if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
    reportVectorizationFailure("Too many SCEV checks needed",
        "Too many SCEV assumptions need to be made and checked at runtime",
        "TooManySCEVRunTimeChecks", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Okay! We've done all the tests. If any have failed, return false. Otherwise
  // we can vectorize, and at this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return Result;
}

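// As an illustrative (hypothetical) example of tail folding: a loop with trip
// count 10 vectorized at VF = 4 would normally need a scalar epilogue for the
// last 2 iterations. Folding the tail by masking instead runs 3 masked vector
// iterations, with the lanes past iteration 9 disabled by the mask, so no
// epilogue is needed; this is only sound when every block can be predicated
// and no value other than a reduction is live out of the loop.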
bool LoopVectorizationLegality::canFoldTailByMasking() const {
  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");

  SmallPtrSet<const Value *, 8> ReductionLiveOuts;

  for (const auto &Reduction : getReductionVars())
    ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());

  // TODO: handle non-reduction outside users when tail is folded by masking.
  for (auto *AE : AllowedExit) {
    // Check that all users of allowed exit values are inside the loop or
    // are the live-out of a reduction.
    if (ReductionLiveOuts.count(AE))
      continue;
    for (User *U : AE->users()) {
      Instruction *UI = cast<Instruction>(U);
      if (TheLoop->contains(UI))
        continue;
      LLVM_DEBUG(
          dbgs()
          << "LV: Cannot fold tail by masking, loop has an outside user for "
          << *UI << "\n");
      return false;
    }
  }

  for (const auto &Entry : getInductionVars()) {
    PHINode *OrigPhi = Entry.first;
    for (User *U : OrigPhi->users()) {
      auto *UI = cast<Instruction>(U);
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
                             "outside user for "
                          << *UI << "\n");
        return false;
      }
    }
  }

  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Check all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

  return true;
}

void LoopVectorizationLegality::prepareToFoldTailByMasking() {
  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Mark all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
    assert(R && "Must be able to predicate block when tail-folding.");
  }
}

} // namespace llvm