//===- LoopVectorizationLegality.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides loop vectorization legality analysis. Original code
// resided in LoopVectorize.cpp for a long time.
//
// At this point, it is implemented as a utility class, not as an analysis
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;
using namespace PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

static cl::opt<bool>
    AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
                           cl::desc("Enable recognition of non-constant strided "
                                    "pointer induction variables."));

namespace llvm {
cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));
} // namespace llvm

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

static cl::opt<LoopVectorizeHints::ScalableForceKind>
    ForceScalableVectorization(
        "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
        cl::Hidden,
        cl::desc("Control whether the compiler can use scalable vectors to "
                 "vectorize a loop"),
        cl::values(
            clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
                       "Scalable vectorization is disabled."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "preferred",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "on",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive.")));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

namespace llvm {

bool LoopVectorizeHints::Hint::validate(unsigned Val) {
  switch (Kind) {
  case HK_WIDTH:
    return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
  case HK_INTERLEAVE:
    return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
  case HK_FORCE:
    return (Val <= 1);
  case HK_ISVECTORIZED:
  case HK_PREDICATE:
  case HK_SCALABLE:
    return (Val == 0 || Val == 1);
  }
  return false;
}

LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  //  - Target default
  //  - Metadata width
  //  - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
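    // Illustrative example (hypothetical metadata, shape only): a loop
    // annotated with
    //   !0 = distinct !{!0, !1, !2}
    //   !1 = !{!"llvm.loop.vectorize.width", i32 1}
    //   !2 = !{!"llvm.loop.interleave.count", i32 1}
    // would be considered already vectorized by the assignment below.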
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}

void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      {MDString::get(Context, "llvm.loop.isvectorized"),
       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, LoopID,
                                     {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);

  // Update internal cache.
  IsVectorized.Value = 1;
}

bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass, and will let
    // remarks differentiate between disabled vectorization and a width of 1.
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                        "AllDisabled", L->getStartLoc(),
                                        L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}

void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit([&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";

    OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
                               TheLoop->getHeader());
    R << "loop not vectorized";
    if (Force.Value == LoopVectorizeHints::FK_Enabled) {
      R << " (Force=" << NV("Force", true);
      if (Width.Value != 0)
        R << ", Vector Width=" << NV("VectorWidth", getWidth());
      if (getInterleave() != 0)
        R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
      R << ")";
    }
    return R;
  });
}

const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
  if (getWidth() == ElementCount::getFixed(1))
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Disabled)
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
    return LV_NAME;
  return OptimizationRemarkAnalysis::AlwaysPrint;
}

bool LoopVectorizeHints::allowReordering() const {
  // Allow the vectorizer to change the order of operations if hints that
  // enable vectorization are provided.
  ElementCount EC = getWidth();
  return HintsAllowReordering &&
         (getForce() ==
              LoopVectorizeHints::FK_Enabled ||
          EC.getKnownMinValue() > 1);
}

void LoopVectorizeHints::getHintsFromMetadata() {
  MDNode *LoopID = TheLoop->getLoopID();
  if (!LoopID)
    return;

  // First operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    const MDString *S = nullptr;
    SmallVector<Metadata *, 4> Args;

    // The expected hint is either a MDString or a MDNode with the first
    // operand a MDString.
    if (const MDNode *MD = dyn_cast<MDNode>(MDO)) {
      if (!MD || MD->getNumOperands() == 0)
        continue;
      S = dyn_cast<MDString>(MD->getOperand(0));
      for (unsigned Idx = 1; Idx < MD->getNumOperands(); ++Idx)
        Args.push_back(MD->getOperand(Idx));
    } else {
      S = dyn_cast<MDString>(MDO);
      assert(Args.size() == 0 && "too many arguments for MDString");
    }

    if (!S)
      continue;

    // Check if the hint starts with the loop metadata prefix.
    StringRef Name = S->getString();
    if (Args.size() == 1)
      setHint(Name, Args[0]);
  }
}

void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  if (!Name.starts_with(Prefix()))
    return;
  Name = Name.substr(Prefix().size(), StringRef::npos);

  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
  if (!C)
    return;
  unsigned Val = C->getZExtValue();

  Hint *Hints[] = {&Width, &Interleave, &Force,
                   &IsVectorized, &Predicate, &Scalable};
  for (auto *H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
        H->Value = Val;
      else
        LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
      break;
    }
  }
}

// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
// very constrained for now but it will be relaxed in the future. \p Lp is
// considered uniform if it meets all the following conditions:
//   1) it has a canonical IV (starting from 0 and with stride 1),
//   2) its latch terminator is a conditional branch, and
//   3) its latch condition is a compare instruction whose operands are the
//      canonical IV and an OuterLp invariant.
// This check doesn't take into account the uniformity of other conditions not
// related to the loop latch because they don't affect the loop uniformity.
//
// NOTE: We decided to keep all these checks and their associated documentation
// together so that we can easily have a picture of the currently supported
// loop nests. However, some of the current checks don't depend on \p OuterLp
// and would be redundantly executed for each \p Lp if we invoked this function
// for different candidate outer loops. This is not the case for now because we
// don't currently have the infrastructure to evaluate multiple candidate outer
// loops and \p OuterLp will be a fixed parameter while we only support
// explicit outer loop vectorization. It's also very likely that these checks
// go away before introducing the aforementioned infrastructure.
// However, if this is not the case, we should move the \p OuterLp independent
// checks to a separate function that is only executed once for each \p Lp.
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1.
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBr || LatchBr->isUnconditional()) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3.
  auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  Value *CondOp0 = LatchCmp->getOperand(0);
  Value *CondOp1 = LatchCmp->getOperand(1);
  Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}

// Return true if \p Lp and all its nested loops are uniform with regard to \p
// OuterLp.
static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
  if (!isUniformLoop(Lp, OuterLp))
    return false;

  // Check if nested loops are uniform.
  for (Loop *SubLp : *Lp)
    if (!isUniformLoopNest(SubLp, OuterLp))
      return false;

  return true;
}

static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that chars or shorts overflow when we ask for the loop's
  // trip count; work around this by changing the type size.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}

static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}

/// Check whether the instruction has users outside the loop and is not an
/// allowed exit value (e.g. an identified reduction or induction variable).
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &AllowedExit) {
  // Reductions, inductions and non-header phis are allowed to have exit users.
  // All other instructions must not have external users.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the instruction are inside the loop.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}

/// Returns true if A and B have the same pointer operand or the same SCEV
/// address.
static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
                               StoreInst *B) {
  // Compare store instructions.
  if (A == B)
    return true;

  // Otherwise compare pointers.
  Value *APtr = A->getPointerOperand();
  Value *BPtr = B->getPointerOperand();
  if (APtr == BPtr)
    return true;

  // Otherwise compare address SCEVs.
  return SE->getSCEV(APtr) == SE->getSCEV(BPtr);
}

int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                Value *Ptr) const {
  // FIXME: Currently, the set of symbolic strides is sometimes queried before
  // it's collected. This happens from canVectorizeWithIfConvert, when the
  // pointer is checked to reference consecutive elements suitable for a
  // masked access.
  const auto &Strides =
      LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();

  Function *F = TheLoop->getHeader()->getParent();
  bool OptForSize = F->hasOptSize() ||
                    llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
                                                PGSOQueryType::IRPass);
  bool CanAddPredicate = !OptForSize;
  int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
                            CanAddPredicate, false).value_or(0);
  if (Stride == 1 || Stride == -1)
    return Stride;
  return 0;
}

bool LoopVectorizationLegality::isInvariant(Value *V) const {
  return LAI->isInvariant(V);
}

namespace {
/// A rewriter to build the SCEVs for each of the VF lanes in the expected
/// vectorized loop, which can then be compared to detect their uniformity.
/// This is done by replacing the AddRec SCEVs of the original scalar loop
/// (TheLoop) with new AddRecs where the step is multiplied by StepMultiplier
/// and Offset * Step is added. Also checks if all sub-expressions are
/// analyzable w.r.t. uniformity.
class SCEVAddRecForUniformityRewriter
    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
  /// Multiplier to be applied to the step of AddRecs in TheLoop.
  unsigned StepMultiplier;

  /// Offset to be added to the AddRecs in TheLoop.
  unsigned Offset;

  /// Loop for which to rewrite AddRecs.
  Loop *TheLoop;

  /// Is any sub-expression not analyzable w.r.t. uniformity?
  bool CannotAnalyze = false;

  bool canAnalyze() const { return !CannotAnalyze; }

public:
  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
                                  unsigned Offset, Loop *TheLoop)
      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
        TheLoop(TheLoop) {}

  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
    assert(Expr->getLoop() == TheLoop &&
           "addrec outside of TheLoop must be invariant and should have been "
           "handled earlier");
    // Build a new AddRec by multiplying the step by StepMultiplier and
    // incrementing the start by Offset * Step.
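    // For instance (illustrative): with StepMultiplier = 4 and Offset = 2,
    // the AddRec {Start,+,Step} becomes {Start + 2 * Step,+,4 * Step}, i.e.
    // the per-iteration value of lane 2 when stepping by a VF of 4.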
    Type *Ty = Expr->getType();
    const SCEV *Step = Expr->getStepRecurrence(SE);
    if (!SE.isLoopInvariant(Step, TheLoop)) {
      CannotAnalyze = true;
      return Expr;
    }
    const SCEV *NewStep =
        SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
    const SCEV *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
    const SCEV *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
  }

  const SCEV *visit(const SCEV *S) {
    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
      return S;
    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
  }

  const SCEV *visitUnknown(const SCEVUnknown *S) {
    if (SE.isLoopInvariant(S, TheLoop))
      return S;
    // The value could vary across iterations.
    CannotAnalyze = true;
    return S;
  }

  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
    // Could not analyze the expression.
    CannotAnalyze = true;
    return S;
  }

  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
                             unsigned StepMultiplier, unsigned Offset,
                             Loop *TheLoop) {
    /// Bail out if the expression does not contain a UDiv expression.
    /// Uniform values which are not loop invariant require operations to strip
    /// out the lowest bits. For now just look for UDivs and use them to avoid
    /// rewriting UDiv-free expressions for other lanes, to limit compile time.
    if (!SCEVExprContains(S,
                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
      return SE.getCouldNotCompute();

    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                             TheLoop);
    const SCEV *Result = Rewriter.visit(S);

    if (Rewriter.canAnalyze())
      return Result;
    return SE.getCouldNotCompute();
  }
};

} // namespace

bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
  if (isInvariant(V))
    return true;
  if (VF.isScalable())
    return false;
  if (VF.isScalar())
    return true;

  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  auto *SE = PSE.getSE();
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);

  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
  // lane 0 matches the expressions for all other lanes.
  unsigned FixedVF = VF.getKnownMinValue();
  const SCEV *FirstLaneExpr =
      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
    return false;

  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
  // lane 0. We check the lanes in reverse order for compile-time reasons, as
  // checking the last lane first is frequently sufficient to rule out
  // uniformity.
  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
    const SCEV *IthLaneExpr =
        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
    return FirstLaneExpr == IthLaneExpr;
  });
}

bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                               ElementCount VF) const {
  Value *Ptr = getLoadStorePointerOperand(&I);
  if (!Ptr)
    return false;
  // Note: There's nothing inherent which prevents predicated loads and
  // stores from being uniform.
  // The current lowering simply doesn't handle it; in particular, the cost
  // model distinguishes scatter/gather from scalar w/predication, and we
  // currently rely on the scalar path.
  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}

bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a BranchInst. Any other terminator is
    // not supported yet.
    auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br) {
      reportVectorizationFailure(
          "Unsupported basic block terminator",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the BranchInst is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
    if (Br && Br->isConditional() &&
        !TheLoop->isLoopInvariant(Br->getCondition()) &&
        !LI->isLoopHeader(Br->getSuccessor(0)) &&
        !LI->isLoopHeader(Br->getSuccessor(1))) {
      reportVectorizationFailure(
          "Unsupported conditional branch",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loop scenarios with uniform nested loops.
  if (!isUniformLoopNest(TheLoop /*loop nest*/,
                         TheLoop /*context outer loop*/)) {
    reportVectorizationFailure(
        "Outer loop contains divergent loops",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure("Unsupported outer loop Phi(s)",
                               "Unsupported outer loop Phi(s)",
                               "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be recorded
  // here for ignoring, but it suffices to record only the first one (as it is
  // the only one that may be used outside the cast sequence).
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(*Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getDataLayout();

  // Get the widest type.
  if (!PhiTy->isFloatingPointTy()) {
    if (!WidestIndTy)
      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
    else
      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for more
  // details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Phi);
    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}

bool LoopVectorizationLegality::setupOuterLoopInductions() {
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if a given Phi is a supported induction.
  auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
        ID.getKind() == InductionDescriptor::IK_IntInduction) {
      addInductionPhi(&Phi, ID, AllowedExit);
      return true;
    }
    // Bail out for any Phi in the outer loop header that is not a supported
    // induction.
    LLVM_DEBUG(
        dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
    return false;
  };

  return llvm::all_of(Header->phis(), IsSupportedPhi);
}

/// Checks if a function is scalarizable according to the TLI, in
/// the sense that it should be vectorized and then expanded in
/// multiple scalar calls. This is represented in the
/// TLI via mappings that do not specify a vector name, as in the
/// following example:
///
///   const VecDesc VecIntrinsics[] = {
///     {"llvm.phx.abs.i32", "", 4}
///   };
static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
  const StringRef ScalarName = CI.getCalledFunction()->getName();
  bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
  // Check that all known VFs are not associated to a vector
  // function, i.e. the vector name is empty.
  if (Scalarize) {
    ElementCount WidestFixedVF, WidestScalableVF;
    TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);
    for (ElementCount VF = ElementCount::getFixed(2);
         ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    for (ElementCount VF = ElementCount::getScalable(1);
         ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    assert((WidestScalableVF.isZero() || !Scalarize) &&
           "Caller may decide to scalarize a variant using a scalable VF");
  }
  return Scalarize;
}

bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          reportVectorizationFailure(
              "Found a non-int non-pointer PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop);
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to a select during if-conversion. No need to check
        // if the PHIs in this block are induction or reduction variables.
        if (BB != Header) {
          // Non-header phi nodes that have outside uses can be vectorized. Add
          // them to the list of allowed exits.
          // Unsafe cyclic dependencies with header phis are identified during
          // legalization for reduction, induction and fixed order
          // recurrences.
          AllowedExit.insert(&I);
          continue;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          reportVectorizationFailure(
              "Found an invalid PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop, Phi);
          return false;
        }

        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
                                                 DT, PSE.getSE())) {
          Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        // We prevent matching non-constant strided pointer IVs to preserve
        // historical vectorizer behavior after a generalization of the
        // IVDescriptor code. The intent is to remove this check, but we
        // have to fix issues around code quality for such loops first.
        auto IsDisallowedStridedPointerInduction =
            [](const InductionDescriptor &ID) {
              if (AllowStridedPointerIVs)
                return false;
              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
                     ID.getConstIntStepValue() == nullptr;
            };

        // TODO: Instead of recording the AllowedExit, it would be good to
        // record the complementary set: NotAllowedExit. These include (but may
        // not be limited to):
        // 1. Reduction phis as they represent the one-before-last value, which
        //    is not available when vectorized
        // 2. Induction phis and increment when SCEV predicates cannot be used
        //    outside the loop - see addInductionPhi
        // 3. Non-Phis with outside uses when SCEV predicates cannot be used
        //    outside the loop - see call to hasOutsideLoopUser in the non-phi
        //    handling below
        // 4. FixedOrderRecurrence phis that can possibly be handled by
        //    extraction.
        // By recording these, we can then reason about ways to vectorize each
        // of these NotAllowedExit.
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
            !IsDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          Requirements->addExactFPMathInst(ID.getExactFPMathInst());
          continue;
        }

        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
          AllowedExit.insert(Phi);
          FixedOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to an AddRec expression
        // and re-try classifying it as an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
            !IsDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        reportVectorizationFailure(
            "Found an unidentified PHI",
            "value that could not be identified as "
            "reduction is used outside the loop",
            "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
        return false;
      } // end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);

      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            (!VFDatabase::getMappings(*CI).empty() ||
             isTLIScalarize(*TLI, *CI)))) {
        // If the call is a recognized math library call, it is likely that
        // we can vectorize it given loosened floating-point constraints.
        LibFunc Func;
        bool IsMathLibCall =
            TLI && CI->getCalledFunction() &&
            CI->getType()->isFloatingPointTy() &&
            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
            TLI->hasOptimizedCodeGen(Func);

        if (IsMathLibCall) {
          // TODO: Ideally, we should not use clang-specific language here,
          // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be
          // part of the returned info from isFunctionVectorizable()?
          reportVectorizationFailure(
              "Found a non-intrinsic callsite",
              "library call cannot be vectorized. "
              "Try compiling with -fno-math-errno, -ffast-math, "
              "or similar flags",
              "CantVectorizeLibcall", ORE, TheLoop, CI);
        } else {
          reportVectorizationFailure("Found a non-intrinsic callsite",
                                     "call instruction cannot be vectorized",
                                     "CantVectorizeLibcall", ORE, TheLoop, CI);
        }
        return false;
      }

      // Some intrinsics have scalar arguments, which must stay loop invariant
      // for the call to be vectorized.
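      // Illustrative example (assuming the intrinsic is recognized by
      // isVectorIntrinsicWithScalarOpAtArg): for a call to llvm.powi the
      // integer exponent remains a scalar operand of the vectorized call, so
      // the call is only vectorizable if that operand is loop invariant.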
      if (CI) {
        auto *SE = PSE.getSE();
        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
        for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx)) {
            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
                                     TheLoop)) {
              reportVectorizationFailure(
                  "Found unvectorizable intrinsic",
                  "intrinsic instruction cannot be vectorized",
                  "CantVectorizeIntrinsic", ORE, TheLoop, CI);
              return false;
            }
          }
      }

      // If we found a vectorized variant of a function, note that so LV can
      // make better decisions about maximum VF.
      if (CI && !VFDatabase::getMappings(*CI).empty())
        VecCallVariantsFound = true;

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          isa<ExtractElementInst>(I)) {
        reportVectorizationFailure(
            "Found unvectorizable type",
            "instruction return type cannot be vectorized",
            "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          reportVectorizationFailure("Store instruction cannot be vectorized",
                                     "store instruction cannot be vectorized",
                                     "CantVectorizeStore", ORE, TheLoop, ST);
          return false;
        }

        // For nontemporal stores, check that a nontemporal vector version is
        // supported on the target.
        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
          // Arbitrarily try a vector of 2 elements.
          auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of stored type");
          if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
            reportVectorizationFailure(
                "nontemporal store instruction cannot be vectorized",
                "nontemporal store instruction cannot be vectorized",
                "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
            return false;
          }
        }

      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
          // For nontemporal loads, check that a nontemporal vector version is
          // supported on the target (arbitrarily try a vector of 2 elements).
          auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of load type");
          if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
            reportVectorizationFailure(
                "nontemporal load instruction cannot be vectorized",
                "nontemporal load instruction cannot be vectorized",
                "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
            return false;
          }
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
        // operations, shuffles, or casts, as they don't change precision or
        // semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.isFast()) {
        LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        // We can safely vectorize loops where instructions within the loop are
        // used outside the loop only if the SCEV predicates within the loop
        // are the same as outside the loop. Allowing the exit means reusing
        // the SCEV outside the loop.
        if (PSE.getPredicate().isAlwaysTrue()) {
          AllowedExit.insert(&I);
          continue;
        }
        reportVectorizationFailure("Value cannot be used outside the loop",
                                   "value cannot be used outside the loop",
                                   "ValueUsedOutsideLoop", ORE, TheLoop, &I);
        return false;
      }
    } // next instr.
  }

  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure(
          "Did not find one integer induction var",
          "loop induction variable could not be identified",
          "NoInductionVariable", ORE, TheLoop);
      return false;
    }
    if (!WidestIndTy) {
      reportVectorizationFailure(
          "Did not find one integer induction var",
          "integer loop induction variable could not be identified",
          "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    }
    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return true;
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(*TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory())
    return false;

  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
    reportVectorizationFailure("We don't allow storing to uniform addresses",
                               "write to a loop invariant address could not "
                               "be vectorized",
                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
                               TheLoop);
    return false;
  }

  // We can vectorize stores to an invariant address when the final reduction
  // value is guaranteed to be stored at the end of the loop. Also, if the
  // decision to vectorize the loop is made, runtime checks are added to make
  // sure that the invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check if the last stored value is
    // unconditional and the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      if (blockNeedsPredication(SI->getParent())) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // The invariant address should be defined outside of the loop. The LICM
      // pass usually makes sure this happens, but in rare cases it does not,
      // and we do not want to overcomplicate vectorization to support this
      // case.
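      // Illustrative example: a getelementptr whose operands are all loop
      // invariant computes an invariant address, but if the GEP instruction
      // itself sits inside the loop body (e.g. because LICM could not hoist
      // it), the check below rejects the store.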
      if (Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) {
        if (TheLoop->contains(Ptr)) {
          reportVectorizationFailure(
              "Invariant address is calculated inside the loop",
              "write to a loop invariant address could not "
              "be vectorized",
              "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check that its last stored value is the
      // result of one of our reductions.
      //
      // We do not check if a dependence with loads exists because that is
      // already checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively dead code.
          // With opaque pointers it is possible for one pointer to be used
          // with different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latest store doesn't completely overwrite the first one in
          // the example. That is why we have to make sure that the types of
          // the stored values are the same.
          // TODO: Check that the bitwidth of the unhandled store is smaller
          // than the one that overwrites it, and add a test.
          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, SI, I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write to a loop invariant address could not "
            "be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  PSE.addPredicate(LAI->getPSE().getPredicate());
  return true;
}

bool LoopVectorizationLegality::canVectorizeFPMath(
    bool EnableStrictReductions) {

  // First check if there is any ExactFP math or if we allow reassociations.
  if (!Requirements->getExactFPInst() || Hints->allowReordering())
    return true;

  // If the above is false, we have ExactFPMath & do not allow reordering.
  // If the EnableStrictReductions flag is set, first check if we have any
  // Exact FP induction vars, which we cannot vectorize.
  if (!EnableStrictReductions ||
      any_of(getInductionVars(), [&](auto &Induction) -> bool {
        InductionDescriptor IndDesc = Induction.second;
        return IndDesc.getExactFPMathInst();
      }))
    return false;

  // We can now only vectorize if all reductions with Exact FP math also
  // have the isOrdered flag set, which indicates that we can move the
  // reduction operations in-loop.
  return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
  }));
}

bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return RdxDesc.IntermediateStore == SI;
  });
}

bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    if (!RdxDesc.IntermediateStore)
      return false;

    ScalarEvolution *SE = PSE.getSE();
    Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
    return V == InvariantAddress ||
           SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
  });
}

bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
  Value *In0 = const_cast<Value *>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

const InductionDescriptor *
LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
      ID.getKind() == InductionDescriptor::IK_FpInduction)
    return &ID;
  return nullptr;
}

const InductionDescriptor *
LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
    return &ID;
  return nullptr;
}

bool LoopVectorizationLegality::isCastedInductionVariable(
    const Value *V) const {
  auto *Inst = dyn_cast<Instruction>(V);
  return (Inst && InductionCastsToIgnore.count(Inst));
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
  return isInductionPhi(V) || isCastedInductionVariable(V);
}

bool LoopVectorizationLegality::isFixedOrderRecurrence(
    const PHINode *Phi) const {
  return FixedOrderRecurrences.count(Phi);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}

bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
    SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
  for (Instruction &I : *BB) {
    // We can predicate blocks with calls to assume, as long as we drop them in
    // case we flatten the CFG via predication.
    if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
      MaskedOp.insert(&I);
      continue;
    }

    // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
    // TODO: there might be cases where it should block the vectorization.
    // Let's ignore those for now.
    if (isa<NoAliasScopeDeclInst>(&I))
      continue;

    // We can allow masked calls if there's at least one vector variant, even
    // if we end up scalarizing due to the cost model calculations.
    // TODO: Allow other calls if they have appropriate attributes... readonly
    // and argmemonly?
    if (CallInst *CI = dyn_cast<CallInst>(&I))
      if (VFDatabase::hasMaskedVariant(*CI)) {
        MaskedOp.insert(CI);
        continue;
      }

    // Loads are handled via masking (or speculated if safe to do so).
    if (auto *LI = dyn_cast<LoadInst>(&I)) {
      if (!SafePtrs.count(LI->getPointerOperand()))
        MaskedOp.insert(LI);
      continue;
    }

    // A predicated store requires some form of masking:
    //  1) a masked store HW instruction,
    //  2) emulation via load-blend-store (only if safe and legal to do so,
    //     being aware of the race conditions), or
    //  3) an element-by-element predicate check and scalar store.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      MaskedOp.insert(SI);
      continue;
    }

    if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
      return false;
  }

  return true;
}

bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    reportVectorizationFailure("If-conversion is disabled",
                               "if-conversion is disabled",
                               "IfConversionDisabled",
                               ORE, TheLoop);
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers which are known to be dereferenceable within scope of
  // the loop body for each iteration of the loop which executes. That is,
  // the memory pointed to can be dereferenced (with the access size implied by
  // the value's type) unconditionally within the loop header without
  // introducing a new fault.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB)) {
      for (Instruction &I : *BB)
        if (auto *Ptr = getLoadStorePointerOperand(&I))
          SafePointers.insert(Ptr);
      continue;
    }

    // For a block which requires predication, an address may be safe to access
    // in the loop w/o predication if we can prove dereferenceability facts
    // sufficient to ensure it'll never fault within the loop. For the moment,
    // we restrict this to loads; stores are more complicated due to
    // concurrency restrictions.
    ScalarEvolution &SE = *PSE.getSE();
    for (Instruction &I : *BB) {
      LoadInst *LI = dyn_cast<LoadInst>(&I);
      if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC))
        SafePointers.insert(LI->getPointerOperand());
    }
  }

  // Collect the blocks that need predication.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We don't support switch statements inside loops.
    if (!isa<BranchInst>(BB->getTerminator())) {
      reportVectorizationFailure("Loop contains a switch statement",
                                 "loop contains a switch statement",
                                 "LoopContainsSwitch", ORE, TheLoop,
                                 BB->getTerminator());
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) &&
        !blockCanBePredicated(BB, SafePointers, MaskedOp)) {
      reportVectorizationFailure(
          "Control flow cannot be substituted for a select",
          "control flow cannot be substituted for a select", "NoCFGForSelect",
          ORE, TheLoop, BB->getTerminator());
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}

// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                    bool UseVPlanNativePath) {
  assert((UseVPlanNativePath || Lp->isInnermost()) &&
         "VPlan-native path is not enabled.");

  // TODO: ORE should be improved to show more accurate information when an
  // outer loop can't be vectorized because a nested loop is not understood or
  // legal. Something like: "outer_loop_location: loop not vectorized:
  // (inner_loop_location) loop control flow is not understood by vectorizer".

  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  // We must have a loop in canonical form. Loops with indirectbr in them
  // cannot be canonicalized.
  if (!Lp->getLoopPreheader()) {
    reportVectorizationFailure(
        "Loop doesn't have a legal pre-header",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We must have a single backedge.
  if (Lp->getNumBackEdges() != 1) {
    reportVectorizationFailure(
        "The loop must have a single backedge",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
    Loop *Lp, bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Recursively check whether the loop control flow of nested loops is
  // understood.
  for (Loop *SubLp : *Lp)
    if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

  return Result;
}

bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;

  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  // Check whether the loop-related control flow in the loop nest is expected
  // by the vectorizer.
  if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
                    << '\n');

  // Specific checks for outer loops. We skip the remaining legal checks at
  // this point because they don't support outer loops.
  if (!TheLoop->isInnermost()) {
    assert(UseVPlanNativePath && "VPlan-native path is not enabled.");

    if (!canVectorizeOuterLoop()) {
      reportVectorizationFailure("Unsupported outer loop",
                                 "unsupported outer loop",
                                 "UnsupportedOuterLoop",
                                 ORE, TheLoop);
      // TODO: Implement DoExtraAnalysis when subsequent legal checks support
      // outer loops.
      return false;
    }

    LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
    return Result;
  }

  assert(TheLoop->isInnermost() && "Inner loop expected.");
  // Check if we can if-convert non-single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    reportVectorizationFailure(
        "could not determine number of loop iterations",
        "could not determine number of loop iterations",
        "CantComputeNumberOfIterations", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                    << (LAI->getRuntimePointerChecking()->Need
                            ? " (with a runtime bound check)"
                            : "")
                    << "!\n");

  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

  if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
    reportVectorizationFailure(
        "Too many SCEV checks needed",
        "Too many SCEV assumptions need to be made and checked at runtime",
        "TooManySCEVRunTimeChecks", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Okay! We've done all the tests. If any have failed, return false.
  // Otherwise we can vectorize, and at this point we don't have any other
  // mem analysis which may limit our maximum vectorization factor, so just
  // return true with no restrictions.
  return Result;
}

bool LoopVectorizationLegality::canFoldTailByMasking() const {

  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");

  SmallPtrSet<const Value *, 8> ReductionLiveOuts;

  for (const auto &Reduction : getReductionVars())
    ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());

  // TODO: handle non-reduction outside users when tail is folded by masking.
  for (auto *AE : AllowedExit) {
    // Check that all users of allowed exit values are inside the loop or
    // are the live-out of a reduction.
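    // (Rationale sketch: folding the tail removes the scalar epilogue, so a
    // non-reduction value that is live out of the loop would need its last
    // active lane extracted explicitly, which is not handled here; reduction
    // live-outs are produced separately and remain fine.)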
    if (ReductionLiveOuts.count(AE))
      continue;
    for (User *U : AE->users()) {
      Instruction *UI = cast<Instruction>(U);
      if (TheLoop->contains(UI))
        continue;
      LLVM_DEBUG(
          dbgs()
          << "LV: Cannot fold tail by masking, loop has an outside user for "
          << *UI << "\n");
      return false;
    }
  }

  for (const auto &Entry : getInductionVars()) {
    PHINode *OrigPhi = Entry.first;
    for (User *U : OrigPhi->users()) {
      auto *UI = cast<Instruction>(U);
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
                             "outside user for "
                          << *UI << "\n");
        return false;
      }
    }
  }

  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Check all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

  return true;
}

void LoopVectorizationLegality::prepareToFoldTailByMasking() {
  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Mark all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
    assert(R && "Must be able to predicate block when tail-folding.");
  }
}

} // namespace llvm