//===- LoopVectorizationLegality.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides loop vectorization legality analysis. Original code
// resided in LoopVectorize.cpp for a long time.
//
// At this point, it is implemented as a utility class, not as an analysis
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;
using namespace PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

static cl::opt<bool>
    AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false),
                           cl::Hidden,
                           cl::desc("Enable recognition of non-constant strided "
                                    "pointer induction variables."));

namespace llvm {
cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));
} // namespace llvm

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
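// These thresholds are regular cl::opt flags, so they can be overridden when
// running the pass manually, e.g. (illustrative invocation):
//   opt -passes=loop-vectorize -vectorize-scev-check-threshold=32 input.ll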
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

static cl::opt<LoopVectorizeHints::ScalableForceKind>
    ForceScalableVectorization(
        "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
        cl::Hidden,
        cl::desc("Control whether the compiler can use scalable vectors to "
                 "vectorize a loop"),
        cl::values(
            clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
                       "Scalable vectorization is disabled."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "preferred",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "on",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive.")));

static cl::opt<bool> EnableHistogramVectorization(
    "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
    cl::desc("Enables autovectorization of some loops containing histograms"));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

namespace llvm {

bool LoopVectorizeHints::Hint::validate(unsigned Val) {
  switch (Kind) {
  case HK_WIDTH:
    return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
  case HK_INTERLEAVE:
    return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
  case HK_FORCE:
    return (Val <= 1);
  case HK_ISVECTORIZED:
  case HK_PREDICATE:
  case HK_SCALABLE:
    return (Val == 0 || Val == 1);
  }
  return false;
}

LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  //  - Target default
  //  - Metadata width
  //  - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
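      // For example (illustrative), "#pragma clang loop vectorize_width(8)"
      // typically emits only "llvm.loop.vectorize.width" = 8, so the user VF
      // is treated as fixed-width here.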
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}

void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      {MDString::get(Context, "llvm.loop.isvectorized"),
       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, LoopID,
                                     {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);

  // Update internal cache.
  IsVectorized.Value = 1;
}

bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass and errors
    // to differentiate between disabled vectorization and a width of 1.
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                        "AllDisabled", L->getStartLoc(),
                                        L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}

void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit([&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";

    OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
                               TheLoop->getHeader());
    R << "loop not vectorized";
    if (Force.Value == LoopVectorizeHints::FK_Enabled) {
      R << " (Force=" << NV("Force", true);
      if (Width.Value != 0)
        R << ", Vector Width=" << NV("VectorWidth", getWidth());
      if (getInterleave() != 0)
        R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
      R << ")";
    }
    return R;
  });
}

const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
  if (getWidth() == ElementCount::getFixed(1))
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Disabled)
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
    return LV_NAME;
  return OptimizationRemarkAnalysis::AlwaysPrint;
}

bool LoopVectorizeHints::allowReordering() const {
  // Allow the vectorizer to change the order of operations if loop hints that
  // enable vectorization are provided.
  ElementCount EC = getWidth();
  return HintsAllowReordering &&
         (getForce() == LoopVectorizeHints::FK_Enabled ||
          EC.getKnownMinValue() > 1);
}

void LoopVectorizeHints::getHintsFromMetadata() {
  MDNode *LoopID = TheLoop->getLoopID();
  if (!LoopID)
    return;

  // First operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    const MDString *S = nullptr;
    SmallVector<Metadata *, 4> Args;

    // The expected hint is either a MDString or a MDNode with the first
    // operand a MDString.
    if (const MDNode *MD = dyn_cast<MDNode>(MDO)) {
      if (!MD || MD->getNumOperands() == 0)
        continue;
      S = dyn_cast<MDString>(MD->getOperand(0));
      for (unsigned Idx = 1; Idx < MD->getNumOperands(); ++Idx)
        Args.push_back(MD->getOperand(Idx));
    } else {
      S = dyn_cast<MDString>(MDO);
      assert(Args.size() == 0 && "too many arguments for MDString");
    }

    if (!S)
      continue;

    // Check if the hint starts with the loop metadata prefix.
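    // Typical hint strings handled here include "llvm.loop.vectorize.width",
    // "llvm.loop.interleave.count" and "llvm.loop.vectorize.enable"
    // (illustrative list; see the Hints array in setHint for the full set).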
    StringRef Name = S->getString();
    if (Args.size() == 1)
      setHint(Name, Args[0]);
  }
}

void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  if (!Name.starts_with(Prefix()))
    return;
  Name = Name.substr(Prefix().size(), StringRef::npos);

  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
  if (!C)
    return;
  unsigned Val = C->getZExtValue();

  Hint *Hints[] = {&Width,        &Interleave, &Force,
                   &IsVectorized, &Predicate,  &Scalable};
  for (auto *H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
        H->Value = Val;
      else
        LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
      break;
    }
  }
}

// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
// very constrained for now but it will be relaxed in the future. \p Lp is
// considered uniform if it meets all the following conditions:
//   1) it has a canonical IV (starting from 0 and with stride 1),
//   2) its latch terminator is a conditional branch and,
//   3) its latch condition is a compare instruction whose operands are the
//      canonical IV and an OuterLp invariant.
// This check doesn't take into account the uniformity of other conditions not
// related to the loop latch because they don't affect the loop uniformity.
//
// NOTE: We decided to keep all these checks and their associated documentation
// together so that we can easily have a picture of the currently supported
// loop nests. However, some of the current checks don't depend on \p OuterLp
// and would be redundantly executed for each \p Lp if we invoked this function
// for different candidate outer loops. This is not the case for now because we
// don't currently have the infrastructure to evaluate multiple candidate outer
// loops and \p OuterLp will be a fixed parameter while we only support
// explicit outer loop vectorization. It's also very likely that these checks
// go away before introducing the aforementioned infrastructure. However, if
// this is not the case, we should move the \p OuterLp independent checks to a
// separate function that is only executed once for each \p Lp.
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1.
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBr || LatchBr->isUnconditional()) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3.
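  // For example, an inner loop `for (j = 0; j < M; ++j)` where M is defined
  // outside OuterLp satisfies this: the latch compares the IV update against
  // an OuterLp-invariant bound (illustrative C-level sketch).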
  auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  Value *CondOp0 = LatchCmp->getOperand(0);
  Value *CondOp1 = LatchCmp->getOperand(1);
  Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}

// Return true if \p Lp and all its nested loops are uniform with regard to \p
// OuterLp.
static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
  if (!isUniformLoop(Lp, OuterLp))
    return false;

  // Check if nested loops are uniform.
  for (Loop *SubLp : *Lp)
    if (!isUniformLoopNest(SubLp, OuterLp))
      return false;

  return true;
}

static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that char's or short's overflow when we ask for the loop's
  // trip count, work around this by changing the type size.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}

static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}

/// Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &AllowedExit) {
  // Reductions, inductions and non-header phis are allowed to have exit users.
  // All other instructions must not have external users.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the loop are inside the BB.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}

/// Returns true if A and B have the same pointer operand or their pointer
/// operands have the same SCEV.
static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
                               StoreInst *B) {
  // Compare the stores themselves.
  if (A == B)
    return true;

  // Otherwise compare the pointer operands.
  Value *APtr = A->getPointerOperand();
  Value *BPtr = B->getPointerOperand();
  if (APtr == BPtr)
    return true;

  // Otherwise compare the address SCEVs.
  return SE->getSCEV(APtr) == SE->getSCEV(BPtr);
}

int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                Value *Ptr) const {
  // FIXME: Currently, the set of symbolic strides is sometimes queried before
  // it's collected. This happens from canVectorizeWithIfConvert, when the
  // pointer is checked to reference consecutive elements suitable for a
  // masked access.
  const auto &Strides =
      LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();

  Function *F = TheLoop->getHeader()->getParent();
  bool OptForSize = F->hasOptSize() ||
                    llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
                                                PGSOQueryType::IRPass);
  bool CanAddPredicate = !OptForSize;
  int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
                            CanAddPredicate, false).value_or(0);
  if (Stride == 1 || Stride == -1)
    return Stride;
  return 0;
}

bool LoopVectorizationLegality::isInvariant(Value *V) const {
  return LAI->isInvariant(V);
}

namespace {
/// A rewriter to build the SCEVs for each of the VF lanes in the expected
/// vectorized loop, which can then be compared to detect their uniformity.
/// This is done by replacing the AddRec SCEVs of the original scalar loop
/// (TheLoop) with new AddRecs where the step is multiplied by StepMultiplier
/// and Offset * Step is added. Also checks if all sub-expressions are
/// analyzable w.r.t. uniformity.
class SCEVAddRecForUniformityRewriter
    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
  /// Multiplier to be applied to the step of AddRecs in TheLoop.
  unsigned StepMultiplier;

  /// Offset to be added to the AddRecs in TheLoop.
  unsigned Offset;

  /// Loop for which to rewrite AddRecs.
  Loop *TheLoop;

  /// Is any sub-expression not analyzable w.r.t. uniformity?
  bool CannotAnalyze = false;

  bool canAnalyze() const { return !CannotAnalyze; }

public:
  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
                                  unsigned Offset, Loop *TheLoop)
      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
        TheLoop(TheLoop) {}

  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
    assert(Expr->getLoop() == TheLoop &&
           "addrec outside of TheLoop must be invariant and should have been "
           "handled earlier");
    // Build a new AddRec by multiplying the step by StepMultiplier and
    // incrementing the start by Offset * step.
    Type *Ty = Expr->getType();
    const SCEV *Step = Expr->getStepRecurrence(SE);
    if (!SE.isLoopInvariant(Step, TheLoop)) {
      CannotAnalyze = true;
      return Expr;
    }
    const SCEV *NewStep =
        SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
    const SCEV *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
    const SCEV *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
  }

  const SCEV *visit(const SCEV *S) {
    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
      return S;
    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
  }

  const SCEV *visitUnknown(const SCEVUnknown *S) {
    if (SE.isLoopInvariant(S, TheLoop))
      return S;
    // The value could vary across iterations.
    CannotAnalyze = true;
    return S;
  }

  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
    // Could not analyze the expression.
    CannotAnalyze = true;
    return S;
  }
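  // Illustrative example (not from the source): for VF = 4, an AddRec
  // {0,+,2} in TheLoop is rewritten to {0,+,8} for lane 0 and {2,+,8} for
  // lane 1; since the lane expressions differ, the value is not uniform.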
  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
                             unsigned StepMultiplier, unsigned Offset,
                             Loop *TheLoop) {
    /// Bail out if the expression does not contain a UDiv expression.
    /// Uniform values which are not loop invariant require operations to strip
    /// out the lowest bits. For now just look for UDivs and use that to avoid
    /// re-writing UDiv-free expressions for other lanes to limit compile time.
    if (!SCEVExprContains(S,
                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
      return SE.getCouldNotCompute();

    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                             TheLoop);
    const SCEV *Result = Rewriter.visit(S);

    if (Rewriter.canAnalyze())
      return Result;
    return SE.getCouldNotCompute();
  }
};

} // namespace

bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
  if (isInvariant(V))
    return true;
  if (VF.isScalable())
    return false;
  if (VF.isScalar())
    return true;

  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  auto *SE = PSE.getSE();
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);

  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
  // lane 0 matches the expressions for all other lanes.
  unsigned FixedVF = VF.getKnownMinValue();
  const SCEV *FirstLaneExpr =
      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
    return false;

  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
  // lane 0. We check lanes in reverse order for compile-time, as frequently
  // checking the last lane is sufficient to rule out uniformity.
  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
    const SCEV *IthLaneExpr =
        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
    return FirstLaneExpr == IthLaneExpr;
  });
}

bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                               ElementCount VF) const {
  Value *Ptr = getLoadStorePointerOperand(&I);
  if (!Ptr)
    return false;
  // Note: There's nothing inherent which prevents predicated loads and
  // stores from being uniform. The current lowering simply doesn't handle
  // it; in particular, the cost model distinguishes scatter/gather from
  // scalar w/predication, and we currently rely on the scalar path.
  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}

bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a BranchInst. Any other terminator is
    // not supported yet.
    auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br) {
      reportVectorizationFailure("Unsupported basic block terminator",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the BranchInst is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
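    // For example, a branch on `N > 0` where N is defined outside the loop
    // nest is outer-loop invariant and acceptable, while a branch on a value
    // loaded inside the loop is divergent and rejected below (illustrative).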
    if (Br && Br->isConditional() &&
        !TheLoop->isLoopInvariant(Br->getCondition()) &&
        !LI->isLoopHeader(Br->getSuccessor(0)) &&
        !LI->isLoopHeader(Br->getSuccessor(1))) {
      reportVectorizationFailure("Unsupported conditional branch",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loop scenarios with uniform nested loops.
  if (!isUniformLoopNest(TheLoop /*loop nest*/,
                         TheLoop /*context outer loop*/)) {
    reportVectorizationFailure("Outer loop contains divergent loops",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure("Unsupported outer loop Phi(s)",
                               "Unsupported outer loop Phi(s)",
                               "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be recorded
  // here for ignoring, but it suffices to record only the first one (as it is
  // the only one that may be used outside the cast sequence).
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(*Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getDataLayout();

  // Get the widest type.
  if (!PhiTy->isFloatingPointTy()) {
    if (!WidestIndTy)
      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
    else
      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for more
  // details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Phi);
    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}

bool LoopVectorizationLegality::setupOuterLoopInductions() {
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if a given Phi is a supported induction.
  auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
        ID.getKind() == InductionDescriptor::IK_IntInduction) {
      addInductionPhi(&Phi, ID, AllowedExit);
      return true;
    }
    // Bail out for any Phi in the outer loop header that is not a supported
    // induction.
    LLVM_DEBUG(
        dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
    return false;
  };

  return llvm::all_of(Header->phis(), IsSupportedPhi);
}

/// Checks if a function is scalarizable according to the TLI, in
/// the sense that it should be vectorized and then expanded in
/// multiple scalar calls. This is represented in the
/// TLI via mappings that do not specify a vector name, as in the
/// following example:
///
///    const VecDesc VecIntrinsics[] = {
///      {"llvm.phx.abs.i32", "", 4}
///    };
static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
  const StringRef ScalarName = CI.getCalledFunction()->getName();
  bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
  // Check that all known VFs are not associated with a vector
  // function, i.e. the vector name is empty.
  if (Scalarize) {
    ElementCount WidestFixedVF, WidestScalableVF;
    TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);
    for (ElementCount VF = ElementCount::getFixed(2);
         ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    for (ElementCount VF = ElementCount::getScalable(1);
         ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    assert((WidestScalableVF.isZero() || !Scalarize) &&
           "Caller may decide to scalarize a variant using a scalable VF");
  }
  return Scalarize;
}

bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          reportVectorizationFailure("Found a non-int non-pointer PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop);
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (BB != Header) {
          // Non-header phi nodes that have outside uses can be vectorized. Add
          // them to the list of allowed exits.
          // Unsafe cyclic dependencies with header phis are identified during
          // legalization for reduction, induction and fixed order
          // recurrences.
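          // For example, a phi merging the two arms of an `if` in the loop
          // body becomes a select once the CFG is flattened, and its value may
          // still be used after the loop (illustrative).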
          AllowedExit.insert(&I);
          continue;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          reportVectorizationFailure("Found an invalid PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop, Phi);
          return false;
        }

        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
                                                 DT, PSE.getSE())) {
          Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        // We prevent matching non-constant strided pointer IVs to preserve
        // historical vectorizer behavior after a generalization of the
        // IVDescriptor code. The intent is to remove this check, but we
        // have to fix issues around code quality for such loops first.
        auto IsDisallowedStridedPointerInduction =
            [](const InductionDescriptor &ID) {
              if (AllowStridedPointerIVs)
                return false;
              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
                     ID.getConstIntStepValue() == nullptr;
            };

        // TODO: Instead of recording the AllowedExit, it would be good to
        // record the complementary set: NotAllowedExit. These include (but may
        // not be limited to):
        // 1. Reduction phis as they represent the one-before-last value, which
        //    is not available when vectorized
        // 2. Induction phis and increment when SCEV predicates cannot be used
        //    outside the loop - see addInductionPhi
        // 3. Non-Phis with outside uses when SCEV predicates cannot be used
        //    outside the loop - see call to hasOutsideLoopUser in the non-phi
        //    handling below
        // 4. FixedOrderRecurrence phis that can possibly be handled by
        //    extraction.
        // By recording these, we can then reason about ways to vectorize each
        // of these NotAllowedExit.
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
            !IsDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          Requirements->addExactFPMathInst(ID.getExactFPMathInst());
          continue;
        }

        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
          AllowedExit.insert(Phi);
          FixedOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to an AddRec expression
        // and re-try classifying it as an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
            !IsDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        reportVectorizationFailure("Found an unidentified PHI",
            "value that could not be identified as "
            "reduction is used outside the loop",
            "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
        return false;
      } // end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);

      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            (!VFDatabase::getMappings(*CI).empty() ||
             isTLIScalarize(*TLI, *CI)))) {
        // If the call is a recognized math library call, it is likely that
        // we can vectorize it given loosened floating-point constraints.
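        // For example (illustrative), a call to `sinf` compiled without
        // -fno-math-errno usually has no vector mapping here, so it reaches
        // this path and vectorization is blocked with the remark below.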
        LibFunc Func;
        bool IsMathLibCall =
            TLI && CI->getCalledFunction() &&
            CI->getType()->isFloatingPointTy() &&
            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
            TLI->hasOptimizedCodeGen(Func);

        if (IsMathLibCall) {
          // TODO: Ideally, we should not use clang-specific language here,
          // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be part
          // of the returned info from isFunctionVectorizable()?
          reportVectorizationFailure(
              "Found a non-intrinsic callsite",
              "library call cannot be vectorized. "
              "Try compiling with -fno-math-errno, -ffast-math, "
              "or similar flags",
              "CantVectorizeLibcall", ORE, TheLoop, CI);
        } else {
          reportVectorizationFailure("Found a non-intrinsic callsite",
                                     "call instruction cannot be vectorized",
                                     "CantVectorizeLibcall", ORE, TheLoop, CI);
        }
        return false;
      }

      // Some intrinsics have scalar arguments which must stay the same for
      // every iteration (i.e. be loop invariant) in order for the call to be
      // vectorized.
      if (CI) {
        auto *SE = PSE.getSE();
        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
        for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx)) {
            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
                                     TheLoop)) {
              reportVectorizationFailure("Found unvectorizable intrinsic",
                  "intrinsic instruction cannot be vectorized",
                  "CantVectorizeIntrinsic", ORE, TheLoop, CI);
              return false;
            }
          }
      }

      // If we found a vectorized variant of a function, note that so LV can
      // make better decisions about maximum VF.
      if (CI && !VFDatabase::getMappings(*CI).empty())
        VecCallVariantsFound = true;

      // Check that the instruction return type is vectorizable.
      // We can't vectorize casts from vector type to scalar type.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          (isa<CastInst>(I) &&
           !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
          isa<ExtractElementInst>(I)) {
        reportVectorizationFailure("Found unvectorizable type",
            "instruction return type cannot be vectorized",
            "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          reportVectorizationFailure("Store instruction cannot be vectorized",
                                     "store instruction cannot be vectorized",
                                     "CantVectorizeStore", ORE, TheLoop, ST);
          return false;
        }

        // For nontemporal stores, check that a nontemporal vector version is
        // supported on the target.
        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
          // Arbitrarily try a vector of 2 elements.
          auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of stored type");
          if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
            reportVectorizationFailure(
                "nontemporal store instruction cannot be vectorized",
                "nontemporal store instruction cannot be vectorized",
                "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
            return false;
          }
        }

      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
          // For nontemporal loads, check that a nontemporal vector version is
          // supported on the target (arbitrarily try a vector of 2 elements).
          auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of load type");
          if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
            reportVectorizationFailure(
                "nontemporal load instruction cannot be vectorized",
                "nontemporal load instruction cannot be vectorized",
                "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
            return false;
          }
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
        // operations, shuffles, or casts, as they don't change precision or
        // semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.isFast()) {
        LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        // We can safely vectorize loops where instructions within the loop are
        // used outside the loop only if the SCEV predicates within the loop
        // are the same as outside the loop. Allowing the exit means reusing
        // the SCEV outside the loop.
        if (PSE.getPredicate().isAlwaysTrue()) {
          AllowedExit.insert(&I);
          continue;
        }
        reportVectorizationFailure("Value cannot be used outside the loop",
                                   "value cannot be used outside the loop",
                                   "ValueUsedOutsideLoop", ORE, TheLoop, &I);
        return false;
      }
    } // next instr.
  }

  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure("Did not find one integer induction var",
          "loop induction variable could not be identified",
          "NoInductionVariable", ORE, TheLoop);
      return false;
    }
    if (!WidestIndTy) {
      reportVectorizationFailure("Did not find one integer induction var",
          "integer loop induction variable could not be identified",
          "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    }
    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return true;
}

/// Find histogram operations that match high-level code in loops:
/// \code
/// buckets[indices[i]]+=step;
/// \endcode
///
/// It matches a pattern starting from \p HSt, which stores the computed
/// histogram value into the 'buckets' array. The count is updated by a BinOp
/// whose bucket is selected via a loop-varying index loaded from the 'indices'
/// input array.
///
/// On successful matches it updates the STATISTIC 'HistogramsDetected',
/// regardless of hardware support. When there is support, it additionally
/// stores the BinOp/Load pair, together with the store, as a HistogramInfo
/// entry in \p Histograms.
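/// Illustrative IR shape matched below (names invented for exposition):
/// \code
///   %idx  = load i32, ptr %gep.indices        ; indices[i]
///   %ext  = zext i32 %idx to i64
///   %gepb = getelementptr i32, ptr %buckets, i64 %ext
///   %old  = load i32, ptr %gepb
///   %inc  = add i32 %old, 1
///   store i32 %inc, ptr %gepb                 ; buckets[indices[i]] += 1
/// \endcode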
static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
                          const PredicatedScalarEvolution &PSE,
                          SmallVectorImpl<HistogramInfo> &Histograms) {

  // Store value must come from a Binary Operation.
  Instruction *HPtrInstr = nullptr;
  BinaryOperator *HBinOp = nullptr;
  if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr))))
    return false;

  // BinOp must be an Add or a Sub modifying the bucket value by a
  // loop invariant amount.
  // FIXME: We assume the loop invariant term is on the RHS.
  //        Fine for an immediate/constant, but maybe not a generic value?
  Value *HIncVal = nullptr;
  if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
      !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))))
    return false;

  // Make sure the increment value is loop invariant.
  if (!TheLoop->isLoopInvariant(HIncVal))
    return false;

  // The address to store is calculated through a GEP Instruction.
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(HPtrInstr);
  if (!GEP)
    return false;

  // Restrict address calculation to constant indices except for the last term.
  Value *HIdx = nullptr;
  for (Value *Index : GEP->indices()) {
    if (HIdx)
      return false;
    if (!isa<ConstantInt>(Index))
      HIdx = Index;
  }

  if (!HIdx)
    return false;

  // Check that the index is calculated by loading from another array. Ignore
  // any extensions.
  // FIXME: Support indices from other sources than a linear load from memory?
  //        We're currently trying to match an operation looping over an array
  //        of indices, but there could be additional levels of indirection
  //        in place, or possibly some additional calculation to form the index
  //        from the loaded data.
  Value *VPtrVal;
  if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Load(m_Value(VPtrVal)))))
    return false;

  // Make sure the index address varies in this loop, not an outer loop.
  const auto *AR = dyn_cast<SCEVAddRecExpr>(PSE.getSE()->getSCEV(VPtrVal));
  if (!AR || AR->getLoop() != TheLoop)
    return false;

  // Ensure we'll have the same mask by checking that all parts of the
  // histogram (gather load, update, scatter store) are in the same block.
  LoadInst *IndexedLoad = cast<LoadInst>(HBinOp->getOperand(0));
  BasicBlock *LdBB = IndexedLoad->getParent();
  if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
    return false;

  LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");

  // Store the operations that make up the histogram.
  Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
  return true;
}

bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
  // For now, we only support an IndirectUnsafe dependency that calculates
  // a histogram.
  if (!EnableHistogramVectorization)
    return false;

  // Find a single IndirectUnsafe dependency.
  const MemoryDepChecker::Dependence *IUDep = nullptr;
  const MemoryDepChecker &DepChecker = LAI->getDepChecker();
  const auto *Deps = DepChecker.getDependences();
  // If there were too many dependences, LAA abandons recording them. We can't
  // proceed safely if we don't know what the dependences are.
  if (!Deps)
    return false;

  for (const MemoryDepChecker::Dependence &Dep : *Deps) {
    // Ignore dependencies that are either known to be safe or can be
    // checked at runtime.
    if (MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) !=
        MemoryDepChecker::VectorizationSafetyStatus::Unsafe)
      continue;

    // We're only interested in IndirectUnsafe dependencies here, where the
    // address might come from a load from memory. We also only want to handle
    // one such dependency, at least for now.
    if (Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe || IUDep)
      return false;

    IUDep = &Dep;
  }
  if (!IUDep)
    return false;

  // For now only normal loads and stores are supported.
  LoadInst *LI = dyn_cast<LoadInst>(IUDep->getSource(DepChecker));
  StoreInst *SI = dyn_cast<StoreInst>(IUDep->getDestination(DepChecker));

  if (!LI || !SI)
    return false;

  LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
  return findHistogram(LI, SI, TheLoop, LAI->getPSE(), Histograms);
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(*TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory())
    return canVectorizeIndirectUnsafeDependences();

  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
    reportVectorizationFailure("We don't allow storing to uniform addresses",
                               "write to a loop invariant address could not "
                               "be vectorized",
                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
                               TheLoop);
    return false;
  }

  // We can vectorize stores to an invariant address when the final reduction
  // value is guaranteed to be stored at the end of the loop. Also, if the
  // decision to vectorize the loop is made, runtime checks are added so as to
  // make sure that the invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check if the last stored value is
    // unconditional and the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      if (blockNeedsPredication(SI->getParent())) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // The invariant address should be defined outside of the loop. The LICM
      // pass usually makes sure this happens, but in rare cases it does not;
      // we do not want to overcomplicate vectorization to support this case.
      if (Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) {
        if (TheLoop->contains(Ptr)) {
          reportVectorizationFailure(
              "Invariant address is calculated inside the loop",
              "write to a loop invariant address could not "
              "be vectorized",
              "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check its last stored value is the result
      // of one of our reductions.
      //
      // We do not check if dependence with loads exists because that is
      // already checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively dead code.
          // With opaque pointers it is possible for one pointer to be used
          // with different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latest store doesn't completely overwrite the first one in
          // the example. That is why we have to make sure that the types of
          // the stored values are the same.
          // TODO: Check that the bitwidth of an unhandled store is smaller
          // than the one that overwrites it and add a test.
          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, SI, I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write to a loop invariant address could not "
            "be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  PSE.addPredicate(LAI->getPSE().getPredicate());
  return true;
}

bool LoopVectorizationLegality::canVectorizeFPMath(
    bool EnableStrictReductions) {

  // First check if there is any ExactFP math or if we allow reassociations.
  if (!Requirements->getExactFPInst() || Hints->allowReordering())
    return true;

  // If the above is false, we have ExactFPMath & do not allow reordering.
  // If the EnableStrictReductions flag is set, first check if we have any
  // Exact FP induction vars, which we cannot vectorize.
  if (!EnableStrictReductions ||
      any_of(getInductionVars(), [&](auto &Induction) -> bool {
        InductionDescriptor IndDesc = Induction.second;
        return IndDesc.getExactFPMathInst();
      }))
    return false;

  // We can now only vectorize if all reductions with Exact FP math also
  // have the isOrdered flag set, which indicates that we can move the
  // reduction operations in-loop.
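  // For example (illustrative), a `float` sum `s += a[i]` without fast-math
  // can still be vectorized when the target supports ordered (in-loop)
  // reductions, since the sequential association of the adds is preserved.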
  return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
  }));
}

bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return RdxDesc.IntermediateStore == SI;
  });
}

bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    if (!RdxDesc.IntermediateStore)
      return false;

    ScalarEvolution *SE = PSE.getSE();
    Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
    return V == InvariantAddress ||
           SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
  });
}

bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
  Value *In0 = const_cast<Value *>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

const InductionDescriptor *
LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
      ID.getKind() == InductionDescriptor::IK_FpInduction)
    return &ID;
  return nullptr;
}

const InductionDescriptor *
LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
    return &ID;
  return nullptr;
}

bool LoopVectorizationLegality::isCastedInductionVariable(
    const Value *V) const {
  auto *Inst = dyn_cast<Instruction>(V);
  return (Inst && InductionCastsToIgnore.count(Inst));
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
  return isInductionPhi(V) || isCastedInductionVariable(V);
}

bool LoopVectorizationLegality::isFixedOrderRecurrence(
    const PHINode *Phi) const {
  return FixedOrderRecurrences.count(Phi);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}

bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
    SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
  for (Instruction &I : *BB) {
    // We can predicate blocks with calls to assume, as long as we drop them in
    // case we flatten the CFG via predication.
    if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
      MaskedOp.insert(&I);
      continue;
    }

    // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
    // TODO: there might be cases that it should block the vectorization. Let's
    // ignore those for now.
    if (isa<NoAliasScopeDeclInst>(&I))
      continue;

    // We can allow masked calls if there's at least one vector variant, even
    // if we end up scalarizing due to the cost model calculations.
    // TODO: Allow other calls if they have appropriate attributes... readonly
    // and argmemonly?
    if (CallInst *CI = dyn_cast<CallInst>(&I))
      if (VFDatabase::hasMaskedVariant(*CI)) {
        MaskedOp.insert(CI);
        continue;
      }

    // Loads are handled via masking (or speculated if safe to do so.)
    if (auto *LI = dyn_cast<LoadInst>(&I)) {
      if (!SafePtrs.count(LI->getPointerOperand()))
        MaskedOp.insert(LI);
      continue;
    }

    // Predicated store requires some form of masking:
    // 1) masked store HW instruction,
    // 2) emulation via load-blend-store (only if safe and legal to do so,
    //    be aware of the race conditions), or
    // 3) element-by-element predicate check and scalar store.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      MaskedOp.insert(SI);
      continue;
    }

    if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
      return false;
  }

  return true;
}

bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    reportVectorizationFailure("If-conversion is disabled",
                               "if-conversion is disabled",
                               "IfConversionDisabled",
                               ORE, TheLoop);
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers which are known to be dereferenceable within scope of
  // the loop body for each iteration of the loop which executes. That is,
  // the memory pointed to can be dereferenced (with the access size implied by
  // the value's type) unconditionally within the loop header without
  // introducing a new fault.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB)) {
      for (Instruction &I : *BB)
        if (auto *Ptr = getLoadStorePointerOperand(&I))
          SafePointers.insert(Ptr);
      continue;
    }

    // For a block which requires predication, an address may be safe to access
    // in the loop w/o predication if we can prove dereferenceability facts
    // sufficient to ensure it'll never fault within the loop. For the moment,
    // we restrict this to loads; stores are more complicated due to
    // concurrency restrictions.
    ScalarEvolution &SE = *PSE.getSE();
    SmallVector<const SCEVPredicate *, 4> Predicates;
    for (Instruction &I : *BB) {
      LoadInst *LI = dyn_cast<LoadInst>(&I);
      // Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so
      // that it will consider loops that need guarding by SCEV checks. The
      // vectoriser will generate these checks if we decide to vectorise.
      if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC,
                                            &Predicates))
        SafePointers.insert(LI->getPointerOperand());
      Predicates.clear();
    }
  }

  // Collect the blocks that need predication.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We support only branches and switch statements as terminators inside the
    // loop.
    if (isa<SwitchInst>(BB->getTerminator())) {
      if (TheLoop->isLoopExiting(BB)) {
        reportVectorizationFailure("Loop contains an unsupported switch",
                                   "loop contains an unsupported switch",
                                   "LoopContainsUnsupportedSwitch", ORE,
                                   TheLoop, BB->getTerminator());
        return false;
      }
    } else if (!isa<BranchInst>(BB->getTerminator())) {
      reportVectorizationFailure("Loop contains an unsupported terminator",
                                 "loop contains an unsupported terminator",
                                 "LoopContainsUnsupportedTerminator", ORE,
                                 TheLoop, BB->getTerminator());
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) &&
        !blockCanBePredicated(BB, SafePointers, MaskedOp)) {
      reportVectorizationFailure(
          "Control flow cannot be substituted for a select",
          "control flow cannot be substituted for a select", "NoCFGForSelect",
          ORE, TheLoop, BB->getTerminator());
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}

// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                    bool UseVPlanNativePath) {
  assert((UseVPlanNativePath || Lp->isInnermost()) &&
         "VPlan-native path is not enabled.");

  // TODO: ORE should be improved to show more accurate information when an
  // outer loop can't be vectorized because a nested loop is not understood or
  // legal. Something like: "outer_loop_location: loop not vectorized:
  // (inner_loop_location) loop control flow is not understood by vectorizer".

  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  // We must have a loop in canonical form. Loops with indirectbr in them
  // cannot be canonicalized.
  if (!Lp->getLoopPreheader()) {
    reportVectorizationFailure(
        "Loop doesn't have a legal pre-header",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We must have a single backedge.
  if (Lp->getNumBackEdges() != 1) {
    reportVectorizationFailure(
        "The loop must have a single backedge",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
    Loop *Lp, bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Recursively check whether the loop control flow of nested loops is
  // understood.
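  // Each recursive call verifies its own sub-loops in turn, so the entire
  // nest below Lp is covered.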
  for (Loop *SubLp : *Lp)
    if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

  return Result;
}

bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
  BasicBlock *LatchBB = TheLoop->getLoopLatch();
  if (!LatchBB) {
    reportVectorizationFailure("Loop does not have a latch",
                               "Cannot vectorize early exit loop",
                               "NoLatchEarlyExit", ORE, TheLoop);
    return false;
  }

  if (Reductions.size() || FixedOrderRecurrences.size()) {
    reportVectorizationFailure(
        "Found reductions or recurrences in early-exit loop",
        "Cannot vectorize early exit loop with reductions or recurrences",
        "RecurrencesInEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  SmallVector<BasicBlock *, 8> ExitingBlocks;
  TheLoop->getExitingBlocks(ExitingBlocks);

  // Keep a record of all the exiting blocks.
  SmallVector<const SCEVPredicate *, 4> Predicates;
  for (BasicBlock *BB : ExitingBlocks) {
    const SCEV *EC =
        PSE.getSE()->getPredicatedExitCount(TheLoop, BB, &Predicates);
    if (isa<SCEVCouldNotCompute>(EC)) {
      UncountableExitingBlocks.push_back(BB);

      SmallVector<BasicBlock *, 2> Succs(successors(BB));
      if (Succs.size() != 2) {
        reportVectorizationFailure(
            "Early exiting block does not have exactly two successors",
            "Incorrect number of successors from early exiting block",
            "EarlyExitTooManySuccessors", ORE, TheLoop);
        return false;
      }

      BasicBlock *ExitBlock;
      if (!TheLoop->contains(Succs[0]))
        ExitBlock = Succs[0];
      else {
        assert(!TheLoop->contains(Succs[1]));
        ExitBlock = Succs[1];
      }
      UncountableExitBlocks.push_back(ExitBlock);
    } else
      CountableExitingBlocks.push_back(BB);
  }
  // We can safely ignore the predicates here because when vectorizing the loop
  // the PredicatedScalarEvolution class will keep track of all predicates
  // for each exiting block anyway. This happens when calling
  // PSE.getSymbolicMaxBackedgeTakenCount() below.
  Predicates.clear();

  // We only support one uncountable early exit.
  if (getUncountableExitingBlocks().size() != 1) {
    reportVectorizationFailure(
        "Loop has too many uncountable exits",
        "Cannot vectorize early exit loop with more than one early exit",
        "TooManyUncountableEarlyExits", ORE, TheLoop);
    return false;
  }

  // The only supported early exit loops so far are ones where the early
  // exiting block is a unique predecessor of the latch block.
  BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
  if (LatchPredBB != getUncountableEarlyExitingBlock()) {
    reportVectorizationFailure("Early exit is not the latch predecessor",
                               "Cannot vectorize early exit loop",
                               "EarlyExitNotLatchPredecessor", ORE, TheLoop);
    return false;
  }

  // The latch block must have a countable exit.
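  // That is, SCEV must be able to compute an exact exit count for the latch,
  // possibly guarded by runtime SCEV predicates; otherwise there is no bound
  // for the vector loop to iterate towards.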
  if (isa<SCEVCouldNotCompute>(
          PSE.getSE()->getPredicatedExitCount(TheLoop, LatchBB, &Predicates))) {
    reportVectorizationFailure(
        "Cannot determine exact exit count for latch block",
        "Cannot vectorize early exit loop",
        "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
    return false;
  }
  assert(llvm::is_contained(CountableExitingBlocks, LatchBB) &&
         "Latch block not found in list of countable exits!");

  // Check to see if there are instructions that could potentially generate
  // exceptions or have side-effects.
  auto IsSafeOperation = [](Instruction *I) -> bool {
    switch (I->getOpcode()) {
    case Instruction::Load:
    case Instruction::Store:
    case Instruction::PHI:
    case Instruction::Br:
      // These are checked separately.
      return true;
    default:
      return isSafeToSpeculativelyExecute(I);
    }
  };

  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (I.mayWriteToMemory()) {
        // We don't support writes to memory.
        reportVectorizationFailure(
            "Writes to memory unsupported in early exit loops",
            "Cannot vectorize early exit loop with writes to memory",
            "WritesInEarlyExitLoop", ORE, TheLoop);
        return false;
      } else if (!IsSafeOperation(&I)) {
        reportVectorizationFailure("Early exit loop contains operations that "
                                   "cannot be speculatively executed",
                                   "Early exit loop contains operations that "
                                   "cannot be speculatively executed",
                                   "UnsafeOperationsEarlyExitLoop", ORE,
                                   TheLoop);
        return false;
      }
    }

  // The vectoriser cannot handle loads that occur after the early exit block.
  assert(LatchBB->getUniquePredecessor() ==
             getUncountableEarlyExitingBlock() &&
         "Expected latch predecessor to be the early exiting block");

  // TODO: Handle loops that may fault.
  Predicates.clear();
  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
                                     &Predicates)) {
    reportVectorizationFailure(
        "Loop may fault",
        "Cannot vectorize potentially faulting early exit loop",
        "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  [[maybe_unused]] const SCEV *SymbolicMaxBTC =
      PSE.getSymbolicMaxBackedgeTakenCount();
  // Since we have an exact exit count for the latch and the early exit
  // dominates the latch, this should guarantee a computed SCEV value.
  assert(!isa<SCEVCouldNotCompute>(SymbolicMaxBTC) &&
         "Failed to get symbolic expression for backedge taken count");
  LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
                       "backedge taken count: "
                    << *SymbolicMaxBTC << '\n');
  return true;
}

bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;

  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  // Check whether the loop-related control flow in the loop nest is expected
  // by the vectorizer.
  if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
    if (DoExtraAnalysis) {
      LLVM_DEBUG(dbgs() << "LV: legality check failed: loop nest");
      Result = false;
    } else {
      return false;
    }
  }

  // We need to have a loop header.
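  // LoopInfo guarantees every Loop has a header, so this only emits a debug
  // message rather than a failure.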
  LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
                    << '\n');

  // Specific checks for outer loops. We skip the remaining legal checks at
  // this point because they don't support outer loops.
  if (!TheLoop->isInnermost()) {
    assert(UseVPlanNativePath && "VPlan-native path is not enabled.");

    if (!canVectorizeOuterLoop()) {
      reportVectorizationFailure("Unsupported outer loop",
                                 "unsupported outer loop",
                                 "UnsupportedOuterLoop",
                                 ORE, TheLoop);
      // TODO: Implement DoExtraAnalysis when subsequent legal checks support
      // outer loops.
      return false;
    }

    LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
    return Result;
  }

  assert(TheLoop->isInnermost() && "Inner loop expected.");
  // Check if we can if-convert non-single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  HasUncountableEarlyExit = false;
  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    if (!isVectorizableEarlyExitLoop()) {
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    } else
      HasUncountableEarlyExit = true;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  if (Result) {
    LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                      << (LAI->getRuntimePointerChecking()->Need
                              ? " (with a runtime bound check)"
                              : "")
                      << "!\n");
  }

  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

  if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization not profitable "
                         "due to SCEVThreshold");
    reportVectorizationFailure(
        "Too many SCEV checks needed",
        "Too many SCEV assumptions need to be made and checked at runtime",
        "TooManySCEVRunTimeChecks", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Okay! We've done all the tests. If any have failed, return false.
  // Otherwise we can vectorize, and at this point we don't have any other mem
  // analysis which may limit our maximum vectorization factor, so just return
  // true with no restrictions.
  return Result;
}

bool LoopVectorizationLegality::canFoldTailByMasking() const {
  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");

  SmallPtrSet<const Value *, 8> ReductionLiveOuts;

  for (const auto &Reduction : getReductionVars())
    ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());

  // TODO: handle non-reduction outside users when tail is folded by masking.
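  // With tail folding the final iterations execute under a mask, and only
  // reduction live-outs currently know how to recover the correct scalar
  // value outside the loop, so reject any other value that escapes it.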
  for (auto *AE : AllowedExit) {
    // Check that all users of allowed exit values are inside the loop or
    // are the live-out of a reduction.
    if (ReductionLiveOuts.count(AE))
      continue;
    for (User *U : AE->users()) {
      Instruction *UI = cast<Instruction>(U);
      if (TheLoop->contains(UI))
        continue;
      LLVM_DEBUG(
          dbgs()
          << "LV: Cannot fold tail by masking, loop has an outside user for "
          << *UI << "\n");
      return false;
    }
  }

  for (const auto &Entry : getInductionVars()) {
    PHINode *OrigPhi = Entry.first;
    for (User *U : OrigPhi->users()) {
      auto *UI = cast<Instruction>(U);
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
                             "outside user for "
                          << *UI << "\n");
        return false;
      }
    }
  }

  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Check all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

  return true;
}

void LoopVectorizationLegality::prepareToFoldTailByMasking() {
  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Mark all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
    assert(R && "Must be able to predicate block when tail-folding.");
  }
}

} // namespace llvm