//===- LoopVectorizationLegality.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides loop vectorization legality analysis. Original code
// resided in LoopVectorize.cpp for a long time.
//
// At this point, it is implemented as a utility class, not as an analysis
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;
using namespace PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

static cl::opt<bool>
    AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false),
                           cl::Hidden,
                           cl::desc("Enable recognition of non-constant strided "
                                    "pointer induction variables."));

namespace llvm {
cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));
} // namespace llvm

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

static cl::opt<LoopVectorizeHints::ScalableForceKind>
    ForceScalableVectorization(
        "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
        cl::Hidden,
        cl::desc("Control whether the compiler can use scalable vectors to "
                 "vectorize a loop"),
        cl::values(
            clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
                       "Scalable vectorization is disabled."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "preferred",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "on",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive.")));

static cl::opt<bool> EnableHistogramVectorization(
    "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
    cl::desc("Enables autovectorization of some loops containing histograms"));
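
// All of the knobs above are standard cl::opt flags, so they can be exercised
// directly from the command line; a hypothetical invocation might look like:
//   opt -passes=loop-vectorize -scalable-vectorization=on \
//       -enable-histogram-loop-vectorization -S input.ll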

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

namespace llvm {

bool LoopVectorizeHints::Hint::validate(unsigned Val) {
  switch (Kind) {
  case HK_WIDTH:
    return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
  case HK_INTERLEAVE:
    return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
  case HK_FORCE:
    return (Val <= 1);
  case HK_ISVECTORIZED:
  case HK_PREDICATE:
  case HK_SCALABLE:
    return (Val == 0 || Val == 1);
  }
  return false;
}

LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  //   - Target default
  //   - Metadata width
  //   - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}
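
// As an illustration of what this class consumes: a loop annotated with
// "#pragma clang loop vectorize_width(4) interleave_count(2)" reaches the
// analysis carrying loop metadata along the lines of:
//   br i1 %cond, label %exit, label %header, !llvm.loop !0
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.width", i32 4}
//   !2 = !{!"llvm.loop.interleave.count", i32 2}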

void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      {MDString::get(Context, "llvm.loop.isvectorized"),
       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, LoopID,
                                     {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);

  // Update internal cache.
  IsVectorized.Value = 1;
}
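
// After this runs, the vectorize/interleave hints have been dropped and the
// loop ID marks the loop as done, conceptually:
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.isvectorized", i32 1}
// which is what getIsVectorized() reads back on a later run.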

bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass and errors
    // to differentiate between disabled vectorization and a width of 1.
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                        "AllDisabled", L->getStartLoc(),
                                        L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}

void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit([&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";
    else {
      OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
                                 TheLoop->getStartLoc(), TheLoop->getHeader());
      R << "loop not vectorized";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=" << NV("Force", true);
        if (Width.Value != 0)
          R << ", Vector Width=" << NV("VectorWidth", getWidth());
        if (getInterleave() != 0)
          R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
        R << ")";
      }
      return R;
    }
  });
}

const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
  if (getWidth() == ElementCount::getFixed(1))
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Disabled)
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
    return LV_NAME;
  return OptimizationRemarkAnalysis::AlwaysPrint;
}

bool LoopVectorizeHints::allowReordering() const {
  // Allow the vectorizer to change the order of operations if loop hints
  // that enable vectorization are provided.
  ElementCount EC = getWidth();
  return HintsAllowReordering &&
         (getForce() == LoopVectorizeHints::FK_Enabled ||
          EC.getKnownMinValue() > 1);
}

void LoopVectorizeHints::getHintsFromMetadata() {
  MDNode *LoopID = TheLoop->getLoopID();
  if (!LoopID)
    return;

  // The first operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
    const MDString *S = nullptr;
    SmallVector<Metadata *, 4> Args;

    // The expected hint is either an MDString or an MDNode whose first
    // operand is an MDString.
    if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
      if (!MD || MD->getNumOperands() == 0)
        continue;
      S = dyn_cast<MDString>(MD->getOperand(0));
      for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
        Args.push_back(MD->getOperand(i));
    } else {
      S = dyn_cast<MDString>(LoopID->getOperand(i));
      assert(Args.size() == 0 && "too many arguments for MDString");
    }

    if (!S)
      continue;

    // Check if the hint starts with the loop metadata prefix.
    StringRef Name = S->getString();
    if (Args.size() == 1)
      setHint(Name, Args[0]);
  }
}

void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  if (!Name.starts_with(Prefix()))
    return;
  Name = Name.substr(Prefix().size(), StringRef::npos);

  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
  if (!C)
    return;
  unsigned Val = C->getZExtValue();

  Hint *Hints[] = {&Width,        &Interleave, &Force,
                   &IsVectorized, &Predicate,  &Scalable};
  for (auto *H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
        H->Value = Val;
      else
        LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
      break;
    }
  }
}

// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
// very constrained for now but it will be relaxed in the future. \p Lp is
// considered uniform if it meets all the following conditions:
//   1) it has a canonical IV (starting from 0 and with stride 1),
//   2) its latch terminator is a conditional branch, and
//   3) its latch condition is a compare instruction whose operands are the
//      canonical IV and an OuterLp invariant.
// This check doesn't take into account the uniformity of other conditions not
// related to the loop latch because they don't affect the loop uniformity.
//
// NOTE: We decided to keep all these checks and their associated documentation
// together so that we can easily have a picture of the currently supported
// loop nests. However, some of the current checks don't depend on \p OuterLp
// and would be redundantly executed for each \p Lp if we invoked this function
// for different candidate outer loops. This is not the case for now because we
// don't currently have the infrastructure to evaluate multiple candidate outer
// loops and \p OuterLp will be a fixed parameter while we only support
// explicit outer loop vectorization. It's also very likely that these checks
// will go away before introducing the aforementioned infrastructure. However,
// if this is not the case, we should move the \p OuterLp independent checks to
// a separate function that is only executed once for each \p Lp.
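//
// As an illustrative shape of a uniform inner loop accepted by these checks:
//   inner.header:
//     %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.header ]
//     ...
//     %j.next = add nuw nsw i64 %j, 1
//     %cmp = icmp ult i64 %j.next, %n   ; %n is OuterLp-invariant
//     br i1 %cmp, label %inner.header, label %outer.latch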
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1.
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBr || LatchBr->isUnconditional()) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3.
  auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  Value *CondOp0 = LatchCmp->getOperand(0);
  Value *CondOp1 = LatchCmp->getOperand(1);
  Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}

// Return true if \p Lp and all its nested loops are uniform with regard to
// \p OuterLp.
static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
  if (!isUniformLoop(Lp, OuterLp))
    return false;

  // Check if nested loops are uniform.
  for (Loop *SubLp : *Lp)
    if (!isUniformLoopNest(SubLp, OuterLp))
      return false;

  return true;
}

static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that chars or shorts overflow when we ask for the loop's
  // trip count; work around this by widening the type.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}

static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}
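
// As a worked example: with a 64-bit data layout, getWiderType(i16, ptr)
// first converts i16 to i32 (so narrow trip counts can't overflow) and ptr
// to i64, and therefore returns i64.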

/// Check whether the instruction has users outside the loop and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &AllowedExit) {
  // Reductions, inductions and non-header phis are allowed to have exit
  // users. All other instructions must not have external users.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the loop are inside the BB.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}

/// Returns true if A and B have the same pointer operand or their pointer
/// operands have the same SCEV.
static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
                               StoreInst *B) {
  // A store is trivially at the same address as itself.
  if (A == B)
    return true;

  // Otherwise compare the pointer operands.
  Value *APtr = A->getPointerOperand();
  Value *BPtr = B->getPointerOperand();
  if (APtr == BPtr)
    return true;

  // Otherwise compare the address SCEVs.
  if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
    return true;

  return false;
}

int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                Value *Ptr) const {
  // FIXME: Currently, the set of symbolic strides is sometimes queried before
  // it's collected. This happens from canVectorizeWithIfConvert, when the
  // pointer is checked to reference consecutive elements suitable for a
  // masked access.
  const auto &Strides =
      LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();

  Function *F = TheLoop->getHeader()->getParent();
  bool OptForSize = F->hasOptSize() ||
                    llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
                                                PGSOQueryType::IRPass);
  bool CanAddPredicate = !OptForSize;
  int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
                            CanAddPredicate, false).value_or(0);
  if (Stride == 1 || Stride == -1)
    return Stride;
  return 0;
}

bool LoopVectorizationLegality::isInvariant(Value *V) const {
  return LAI->isInvariant(V);
}

namespace {
/// A rewriter to build the SCEVs for each of the VF lanes in the expected
/// vectorized loop, which can then be compared to detect their uniformity.
/// This is done by replacing the AddRec SCEVs of the original scalar loop
/// (TheLoop) with new AddRecs where the step is multiplied by StepMultiplier
/// and Offset * Step is added. Also checks if all sub-expressions are
/// analyzable w.r.t. uniformity.
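/// For example, with VF = 4 and lane Offset = 2, an AddRec {%start,+,%step}
/// is rewritten to {(%start + 2 * %step),+,(4 * %step)}; a value is uniform
/// if the rewritten expressions for all lanes are identical.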
class SCEVAddRecForUniformityRewriter
    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
  /// Multiplier to be applied to the step of AddRecs in TheLoop.
  unsigned StepMultiplier;

  /// Offset to be added to the AddRecs in TheLoop.
  unsigned Offset;

  /// Loop for which to rewrite AddRecs.
  Loop *TheLoop;

  /// Set to true if any sub-expression is not analyzable w.r.t. uniformity.
  bool CannotAnalyze = false;

  bool canAnalyze() const { return !CannotAnalyze; }

public:
  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
                                  unsigned Offset, Loop *TheLoop)
      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
        TheLoop(TheLoop) {}

  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
    assert(Expr->getLoop() == TheLoop &&
           "addrec outside of TheLoop must be invariant and should have been "
           "handled earlier");
    // Build a new AddRec by multiplying the step by StepMultiplier and
    // incrementing the start by Offset * step.
    Type *Ty = Expr->getType();
    auto *Step = Expr->getStepRecurrence(SE);
    if (!SE.isLoopInvariant(Step, TheLoop)) {
      CannotAnalyze = true;
      return Expr;
    }
    auto *NewStep = SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
    auto *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
    auto *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
  }

  const SCEV *visit(const SCEV *S) {
    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
      return S;
    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
  }

  const SCEV *visitUnknown(const SCEVUnknown *S) {
    if (SE.isLoopInvariant(S, TheLoop))
      return S;
    // The value could vary across iterations.
    CannotAnalyze = true;
    return S;
  }

  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
    // Could not analyze the expression.
    CannotAnalyze = true;
    return S;
  }

  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
                             unsigned StepMultiplier, unsigned Offset,
                             Loop *TheLoop) {
    // Bail out if the expression does not contain a UDiv expression. Uniform
    // values which are not loop invariant require operations to strip out
    // the lowest bits. For now just look for UDivs, and use that to avoid
    // rewriting UDiv-free expressions for other lanes, to limit compile time.
    if (!SCEVExprContains(S,
                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
      return SE.getCouldNotCompute();

    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                             TheLoop);
    const SCEV *Result = Rewriter.visit(S);

    if (Rewriter.canAnalyze())
      return Result;
    return SE.getCouldNotCompute();
  }
};

} // namespace

bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
  if (isInvariant(V))
    return true;
  if (VF.isScalable())
    return false;
  if (VF.isScalar())
    return true;

  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  auto *SE = PSE.getSE();
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);

  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
  // lane 0 matches the expressions for all other lanes.
  unsigned FixedVF = VF.getKnownMinValue();
  const SCEV *FirstLaneExpr =
      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
    return false;

  // Make sure the expressions for lanes FixedVF-1..1 match the expression
  // for lane 0. We check lanes in reverse order for compile time, as
  // frequently checking the last lane is sufficient to rule out uniformity.
  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
    const SCEV *IthLaneExpr =
        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
    return FirstLaneExpr == IthLaneExpr;
  });
}
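
// For instance, with VF = 4 a value such as "%i udiv 4" (with %i the
// canonical IV) may be uniform: the rewritten per-lane AddRecs feed udivs
// that SCEV can sometimes fold to the same expression for lanes 0..3 of one
// vector iteration. If any lane's expression differs, the value is treated
// as non-uniform.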

bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                               ElementCount VF) const {
  Value *Ptr = getLoadStorePointerOperand(&I);
  if (!Ptr)
    return false;
  // Note: There's nothing inherent which prevents predicated loads and
  // stores from being uniform. The current lowering simply doesn't handle
  // it; in particular, the cost model distinguishes scatter/gather from
  // scalar w/predication, and we currently rely on the scalar path.
  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}

bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a BranchInst. Any other terminator
    // is not supported yet.
    auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br) {
      reportVectorizationFailure(
          "Unsupported basic block terminator",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the BranchInst is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition,
    // or backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
    if (Br && Br->isConditional() &&
        !TheLoop->isLoopInvariant(Br->getCondition()) &&
        !LI->isLoopHeader(Br->getSuccessor(0)) &&
        !LI->isLoopHeader(Br->getSuccessor(1))) {
      reportVectorizationFailure(
          "Unsupported conditional branch",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loop scenarios with uniform nested loops.
  if (!isUniformLoopNest(TheLoop /*loop nest*/,
                         TheLoop /*context outer loop*/)) {
    reportVectorizationFailure(
        "Outer loop contains divergent loops",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure("Unsupported outer loop Phi(s)",
                               "Unsupported outer loop Phi(s)",
                               "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be
  // recorded here for ignoring, but it suffices to record only the first
  // (as it is the only one that may be used outside the cast sequence).
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(*Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getDataLayout();

  // Get the widest type.
  if (!PhiTy->isFloatingPointTy()) {
    if (!WidestIndTy)
      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
    else
      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for
  // more details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Phi);
    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}
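
// For reference, the shape recognized above as the primary (canonical)
// induction is, illustratively:
//   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
//   %iv.next = add nuw nsw i64 %iv, 1
// i.e. an integer induction that starts at zero and steps by one.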

bool LoopVectorizationLegality::setupOuterLoopInductions() {
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if a given Phi is a supported induction.
  auto isSupportedPhi = [&](PHINode &Phi) -> bool {
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
        ID.getKind() == InductionDescriptor::IK_IntInduction) {
      addInductionPhi(&Phi, ID, AllowedExit);
      return true;
    }
    // Bail out for any Phi in the outer loop header that is not a supported
    // induction.
    LLVM_DEBUG(
        dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
    return false;
  };

  return llvm::all_of(Header->phis(), isSupportedPhi);
}

/// Checks if a function is scalarizable according to the TLI, in
/// the sense that it should be vectorized and then expanded in
/// multiple scalar calls. This is represented in the
/// TLI via mappings that do not specify a vector name, as in the
/// following example:
///
///    const VecDesc VecIntrinsics[] = {
///      {"llvm.phx.abs.i32", "", 4}
///    };
static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
  const StringRef ScalarName = CI.getCalledFunction()->getName();
  bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
  // Check that all known VFs are not associated with a vector
  // function, i.e. the vector name is empty.
  if (Scalarize) {
    ElementCount WidestFixedVF, WidestScalableVF;
    TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);
    for (ElementCount VF = ElementCount::getFixed(2);
         ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    for (ElementCount VF = ElementCount::getScalable(1);
         ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    assert((WidestScalableVF.isZero() || !Scalarize) &&
           "Caller may decide to scalarize a variant using a scalable VF");
  }
  return Scalarize;
}

bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          reportVectorizationFailure(
              "Found a non-int non-pointer PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop);
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to a select during if-conversion. There is no need
        // to check whether the PHIs in this block are induction or reduction
        // variables.
        if (BB != Header) {
          // Non-header phi nodes that have outside uses can be vectorized.
          // Add them to the list of allowed exits.
          // Unsafe cyclic dependencies with header phis are identified during
          // legalization for reduction, induction and fixed order
          // recurrences.
          AllowedExit.insert(&I);
          continue;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          reportVectorizationFailure(
              "Found an invalid PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop, Phi);
          return false;
        }

        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
                                                 DT, PSE.getSE())) {
          Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }
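
        // For reference, the canonical integer reduction recognized above
        // has the (illustrative) shape:
        //   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
        //   ...
        //   %sum.next = add i32 %sum, %val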
        // We prevent matching non-constant strided pointer IVs to preserve
        // historical vectorizer behavior after a generalization of the
        // IVDescriptor code. The intent is to remove this check, but we
        // have to fix issues around code quality for such loops first.
        auto isDisallowedStridedPointerInduction =
            [](const InductionDescriptor &ID) {
              if (AllowStridedPointerIVs)
                return false;
              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
                     ID.getConstIntStepValue() == nullptr;
            };

        // TODO: Instead of recording the AllowedExit, it would be good to
        // record the complementary set: NotAllowedExit. These include (but
        // may not be limited to):
        //   1. Reduction phis as they represent the one-before-last value,
        //      which is not available when vectorized
        //   2. Induction phis and increment when SCEV predicates cannot be
        //      used outside the loop - see addInductionPhi
        //   3. Non-Phis with outside uses when SCEV predicates cannot be
        //      used outside the loop - see call to hasOutsideLoopUser in the
        //      non-phi handling below
        //   4. FixedOrderRecurrence phis that can possibly be handled by
        //      extraction.
        // By recording these, we can then reason about ways to vectorize each
        // of these NotAllowedExit.
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
            !isDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          Requirements->addExactFPMathInst(ID.getExactFPMathInst());
          continue;
        }

        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
          AllowedExit.insert(Phi);
          FixedOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to an AddRec expression
        // and retry classifying it as an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
            !isDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        reportVectorizationFailure(
            "Found an unidentified PHI",
            "value that could not be identified as "
            "reduction is used outside the loop",
            "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
        return false;
      } // end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);

      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            (!VFDatabase::getMappings(*CI).empty() ||
             isTLIScalarize(*TLI, *CI)))) {
        // If the call is a recognized math library call, it is likely that
        // we can vectorize it given loosened floating-point constraints.
        LibFunc Func;
        bool IsMathLibCall =
            TLI && CI->getCalledFunction() &&
            CI->getType()->isFloatingPointTy() &&
            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
            TLI->hasOptimizedCodeGen(Func);

        if (IsMathLibCall) {
          // TODO: Ideally, we should not use clang-specific language here,
          // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be
          // part of the returned info from isFunctionVectorizable()?
          reportVectorizationFailure(
              "Found a non-intrinsic callsite",
              "library call cannot be vectorized. "
              "Try compiling with -fno-math-errno, -ffast-math, "
              "or similar flags",
              "CantVectorizeLibcall", ORE, TheLoop, CI);
        } else {
          reportVectorizationFailure("Found a non-intrinsic callsite",
                                     "call instruction cannot be vectorized",
                                     "CantVectorizeLibcall", ORE, TheLoop, CI);
        }
        return false;
      }

      // Some intrinsics have scalar arguments, which must remain scalar (and
      // hence loop invariant) for the call to be vectorized.
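      // llvm.powi is a typical example: its integer exponent operand stays
      // scalar when the call is widened, so it must be loop invariant.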
      if (CI) {
        auto *SE = PSE.getSE();
        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
        for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) {
            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)),
                                     TheLoop)) {
              reportVectorizationFailure(
                  "Found unvectorizable intrinsic",
                  "intrinsic instruction cannot be vectorized",
                  "CantVectorizeIntrinsic", ORE, TheLoop, CI);
              return false;
            }
          }
      }

      // If we found a vectorized variant of a function, note that so LV can
      // make better decisions about maximum VF.
      if (CI && !VFDatabase::getMappings(*CI).empty())
        VecCallVariantsFound = true;

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          isa<ExtractElementInst>(I)) {
        reportVectorizationFailure(
            "Found unvectorizable type",
            "instruction return type cannot be vectorized",
            "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          reportVectorizationFailure("Store instruction cannot be vectorized",
                                     "store instruction cannot be vectorized",
                                     "CantVectorizeStore", ORE, TheLoop, ST);
          return false;
        }

        // For nontemporal stores, check that a nontemporal vector version is
        // supported on the target.
        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
          // Arbitrarily try a vector of 2 elements.
          auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of stored type");
          if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
            reportVectorizationFailure(
                "nontemporal store instruction cannot be vectorized",
                "nontemporal store instruction cannot be vectorized",
                "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
            return false;
          }
        }

      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
          // For nontemporal loads, check that a nontemporal vector version is
          // supported on the target (arbitrarily try a vector of 2 elements).
          auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of load type");
          if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
            reportVectorizationFailure(
                "nontemporal load instruction cannot be vectorized",
                "nontemporal load instruction cannot be vectorized",
                "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
            return false;
          }
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not
        // memory operations, shuffles, or casts, as they don't change
        // precision or semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.isFast()) {
        LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }
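
      // For instance, "fadd fast float %a, %b" carries the fast-math flags
      // that permit reordering, while a plain "fadd float %a, %b" makes the
      // hints report the loop as potentially unsafe to reorder.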

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        // We can safely vectorize loops where instructions within the loop
        // are used outside the loop only if the SCEV predicates within the
        // loop are the same as outside the loop. Allowing the exit means
        // reusing the SCEV outside the loop.
        if (PSE.getPredicate().isAlwaysTrue()) {
          AllowedExit.insert(&I);
          continue;
        }
        reportVectorizationFailure("Value cannot be used outside the loop",
                                   "value cannot be used outside the loop",
                                   "ValueUsedOutsideLoop", ORE, TheLoop, &I);
        return false;
      }
    } // next instr.
  }

  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure(
          "Did not find one integer induction var",
          "loop induction variable could not be identified",
          "NoInductionVariable", ORE, TheLoop);
      return false;
    } else if (!WidestIndTy) {
      reportVectorizationFailure(
          "Did not find one integer induction var",
          "integer loop induction variable could not be identified",
          "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    } else {
      LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    }
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return true;
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(*TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory())
    if (!EnableHistogramVectorization ||
        !LAI->canVectorizeMemoryWithHistogram())
      return false;

  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
    reportVectorizationFailure("We don't allow storing to uniform addresses",
                               "write to a loop invariant address could not "
                               "be vectorized",
                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
                               TheLoop);
    return false;
  }
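
  // The canonical example of a supported invariant store is the running
  // value of a reduction being written out each iteration, e.g.
  // (illustrative C): for (i = 0; i < n; i++) { sum += a[i]; *q = sum; }
  // Only the last store to q is observable after the loop.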
  // We can vectorize stores to an invariant address when the final reduction
  // value is guaranteed to be stored at the end of the loop. Also, if the
  // decision to vectorize the loop is made, runtime checks are added to make
  // sure that the invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check that the last stored value is
    // unconditional and that the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      if (blockNeedsPredication(SI->getParent())) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // The invariant address should be defined outside of the loop. The
      // LICM pass usually makes sure this happens, but in rare cases it does
      // not; we do not want to overcomplicate vectorization to support this
      // case.
      if (Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) {
        if (TheLoop->contains(Ptr)) {
          reportVectorizationFailure(
              "Invariant address is calculated inside the loop",
              "write to a loop invariant address could not "
              "be vectorized",
              "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check that its last stored value is the
      // result of one of our reductions.
      //
      // We do not check if dependence with loads exists because that is
      // already checked via
      // hasLoadStoreDependenceInvolvingLoopInvariantAddress.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively dead code.
          // With opaque pointers it is possible for one pointer to be used
          // with different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latter store doesn't completely overwrite the first one in
          // the example. That is why we have to make sure that the types of
          // the stored values are the same.
          // TODO: Check that the bitwidth of an unhandled store is smaller
          // than the one that overwrites it, and add a test.
          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, SI, I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write to a loop invariant address could not "
            "be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  PSE.addPredicate(LAI->getPSE().getPredicate());
  return true;
}
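
// In-order ("strict") FP reductions referenced below keep the scalar
// association by reducing each vector iteration into a scalar accumulator,
// e.g. via llvm.vector.reduce.fadd with a scalar start operand, instead of
// a single reassociating tree reduction after the loop.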
bool LoopVectorizationLegality::canVectorizeFPMath(
    bool EnableStrictReductions) {

  // First check if there is any ExactFP math or if we allow reassociation.
  if (!Requirements->getExactFPInst() || Hints->allowReordering())
    return true;

  // If the above is false, we have ExactFPMath & do not allow reordering.
  // If the EnableStrictReductions flag is set, first check if we have any
  // Exact FP induction vars, which we cannot vectorize.
  if (!EnableStrictReductions ||
      any_of(getInductionVars(), [&](auto &Induction) -> bool {
        InductionDescriptor IndDesc = Induction.second;
        return IndDesc.getExactFPMathInst();
      }))
    return false;

  // We can now only vectorize if all reductions with Exact FP math also
  // have the isOrdered flag set, which indicates that we can move the
  // reduction operations in-loop.
  return all_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
  });
}

bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return RdxDesc.IntermediateStore == SI;
  });
}

bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    if (!RdxDesc.IntermediateStore)
      return false;

    ScalarEvolution *SE = PSE.getSE();
    Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
    return V == InvariantAddress ||
           SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
  });
}

bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
  Value *In0 = const_cast<Value *>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

const InductionDescriptor *
LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
      ID.getKind() == InductionDescriptor::IK_FpInduction)
    return &ID;
  return nullptr;
}

const InductionDescriptor *
LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
    return &ID;
  return nullptr;
}

bool LoopVectorizationLegality::isCastedInductionVariable(
    const Value *V) const {
  auto *Inst = dyn_cast<Instruction>(V);
  return (Inst && InductionCastsToIgnore.count(Inst));
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
  return isInductionPhi(V) || isCastedInductionVariable(V);
}

bool LoopVectorizationLegality::isFixedOrderRecurrence(
    const PHINode *Phi) const {
  return FixedOrderRecurrences.count(Phi);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}

bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
    SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
  for (Instruction &I : *BB) {
    // We can predicate blocks with calls to assume, as long as we drop them
    // in case we flatten the CFG via predication.
    if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
      MaskedOp.insert(&I);
      continue;
    }

    // Do not let llvm.experimental.noalias.scope.decl block the
    // vectorization.
    // TODO: there might be cases where it should block the vectorization.
    // Let's ignore those for now.
    if (isa<NoAliasScopeDeclInst>(&I))
      continue;

    // We can allow masked calls if there's at least one vector variant, even
    // if we end up scalarizing due to the cost model calculations.
    // TODO: Allow other calls if they have appropriate attributes... readonly
    // and argmemonly?
    if (CallInst *CI = dyn_cast<CallInst>(&I))
      if (VFDatabase::hasMaskedVariant(*CI)) {
        MaskedOp.insert(CI);
        continue;
      }

    // Loads are handled via masking (or speculated if safe to do so).
    if (auto *LI = dyn_cast<LoadInst>(&I)) {
      if (!SafePtrs.count(LI->getPointerOperand()))
        MaskedOp.insert(LI);
      continue;
    }

    // A predicated store requires some form of masking:
    //   1) a masked store HW instruction,
    //   2) emulation via load-blend-store (only if safe and legal to do so;
    //      beware of race conditions), or
    //   3) an element-by-element predicate check and scalar store.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      MaskedOp.insert(SI);
      continue;
    }

    if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
      return false;
  }

  return true;
}
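
// When such a block is flattened, a masked load is conceptually emitted as,
// e.g. for VF = 4 (illustrative):
//   %v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4,
//                                                  <4 x i1> %mask,
//                                                  <4 x i32> poison)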

bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    reportVectorizationFailure("If-conversion is disabled",
                               "if-conversion is disabled",
                               "IfConversionDisabled",
                               ORE, TheLoop);
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers which are known to be dereferenceable within the
  // scope of the loop body for each iteration of the loop that executes.
  // That is, the memory pointed to can be dereferenced (with the access size
  // implied by the value's type) unconditionally within the loop header
  // without introducing a new fault.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB)) {
      for (Instruction &I : *BB)
        if (auto *Ptr = getLoadStorePointerOperand(&I))
          SafePointers.insert(Ptr);
      continue;
    }

    // For a block which requires predication, an address may be safe to
    // access in the loop w/o predication if we can prove dereferenceability
    // facts sufficient to ensure it'll never fault within the loop. For the
    // moment, we restrict this to loads; stores are more complicated due to
    // concurrency restrictions.
    ScalarEvolution &SE = *PSE.getSE();
    for (Instruction &I : *BB) {
      LoadInst *LI = dyn_cast<LoadInst>(&I);
      if (LI && !LI->getType()->isVectorTy() &&
          !mustSuppressSpeculation(*LI) &&
          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC))
        SafePointers.insert(LI->getPointerOperand());
    }
  }

  // Collect the blocks that need predication.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We don't support switch statements inside loops.
    if (!isa<BranchInst>(BB->getTerminator())) {
      reportVectorizationFailure("Loop contains a switch statement",
                                 "loop contains a switch statement",
                                 "LoopContainsSwitch", ORE, TheLoop,
                                 BB->getTerminator());
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) &&
        !blockCanBePredicated(BB, SafePointers, MaskedOp)) {
      reportVectorizationFailure(
          "Control flow cannot be substituted for a select",
          "control flow cannot be substituted for a select", "NoCFGForSelect",
          ORE, TheLoop, BB->getTerminator());
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}
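
// For intuition, if-conversion turns a diamond such as
//   if (c[i]) x = a[i]; else x = b[i];
// into straight-line code where both sides execute (masked where needed)
// and a select picks the result per lane.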

// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                    bool UseVPlanNativePath) {
  assert((UseVPlanNativePath || Lp->isInnermost()) &&
         "VPlan-native path is not enabled.");

  // TODO: ORE should be improved to show more accurate information when an
  // outer loop can't be vectorized because a nested loop is not understood or
  // legal. Something like: "outer_loop_location: loop not vectorized:
  // (inner_loop_location) loop control flow is not understood by vectorizer".

  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  // We must have a loop in canonical form. Loops with indirectbr in them
  // cannot be canonicalized.
  if (!Lp->getLoopPreheader()) {
    reportVectorizationFailure(
        "Loop doesn't have a legal pre-header",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We must have a single backedge.
  if (Lp->getNumBackEdges() != 1) {
    reportVectorizationFailure(
        "The loop must have a single backedge",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
    Loop *Lp, bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Recursively check whether the loop control flow of nested loops is
  // understood.
  for (Loop *SubLp : *Lp)
    if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

  return Result;
}

bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;

  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  // Check whether the loop-related control flow in the loop nest is expected
  // by the vectorizer.
  if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
                    << '\n');

  // Specific checks for outer loops. We skip the remaining legal checks at
  // this point because they don't support outer loops.
  if (!TheLoop->isInnermost()) {
    assert(UseVPlanNativePath && "VPlan-native path is not enabled.");

    if (!canVectorizeOuterLoop()) {
      reportVectorizationFailure("Unsupported outer loop",
                                 "unsupported outer loop",
                                 "UnsupportedOuterLoop",
                                 ORE, TheLoop);
      // TODO: Implement DoExtraAnalysis when subsequent legal checks support
      // outer loops.
      return false;
    }

    LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
    return Result;
  }

  assert(TheLoop->isInnermost() && "Inner loop expected.");
  // Check if we can if-convert non-single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    reportVectorizationFailure(
        "could not determine number of loop iterations",
        "could not determine number of loop iterations",
        "CantComputeNumberOfIterations", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                    << (LAI->getRuntimePointerChecking()->Need
                            ? " (with a runtime bound check)"
                            : "")
                    << "!\n");

  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

  if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
    reportVectorizationFailure(
        "Too many SCEV checks needed",
        "Too many SCEV assumptions need to be made and checked at runtime",
        "TooManySCEVRunTimeChecks", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Okay! We've done all the tests. If any have failed, return false.
  // Otherwise we can vectorize, and at this point we don't have any other
  // mem analysis which may limit our maximum vectorization factor, so just
  // return true with no restrictions.
  return Result;
}
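
// Tail folding by masking executes the remainder iterations inside the
// vector loop under a mask instead of in a scalar epilogue; e.g. with a
// trip count of 10 and VF = 4, the third vector iteration runs with the
// mask <1, 1, 0, 0>.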
bool LoopVectorizationLegality::canFoldTailByMasking() const {

  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");

  SmallPtrSet<const Value *, 8> ReductionLiveOuts;

  for (const auto &Reduction : getReductionVars())
    ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());

  // TODO: handle non-reduction outside users when tail is folded by masking.
  for (auto *AE : AllowedExit) {
    // Check that all users of allowed exit values are inside the loop or
    // are the live-out of a reduction.
    if (ReductionLiveOuts.count(AE))
      continue;
    for (User *U : AE->users()) {
      Instruction *UI = cast<Instruction>(U);
      if (TheLoop->contains(UI))
        continue;
      LLVM_DEBUG(
          dbgs()
          << "LV: Cannot fold tail by masking, loop has an outside user for "
          << *UI << "\n");
      return false;
    }
  }

  for (const auto &Entry : getInductionVars()) {
    PHINode *OrigPhi = Entry.first;
    for (User *U : OrigPhi->users()) {
      auto *UI = cast<Instruction>(U);
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
                             "outside user for "
                          << *UI << "\n");
        return false;
      }
    }
  }

  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Check all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

  return true;
}

void LoopVectorizationLegality::prepareToFoldTailByMasking() {
  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Mark all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
    assert(R && "Must be able to predicate block when tail-folding.");
  }
}

} // namespace llvm