1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanAnalysis.h" 61 #include "VPlanHCFGBuilder.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/STLExtras.h" 70 #include "llvm/ADT/SmallPtrSet.h" 71 #include "llvm/ADT/SmallSet.h" 72 #include "llvm/ADT/SmallVector.h" 73 #include "llvm/ADT/Statistic.h" 74 #include "llvm/ADT/StringRef.h" 75 #include "llvm/ADT/Twine.h" 76 #include "llvm/ADT/iterator_range.h" 77 #include "llvm/Analysis/AssumptionCache.h" 78 #include "llvm/Analysis/BasicAliasAnalysis.h" 79 #include "llvm/Analysis/BlockFrequencyInfo.h" 80 #include "llvm/Analysis/CFG.h" 81 #include "llvm/Analysis/CodeMetrics.h" 82 #include "llvm/Analysis/DemandedBits.h" 83 #include "llvm/Analysis/GlobalsModRef.h" 84 #include "llvm/Analysis/LoopAccessAnalysis.h" 85 #include "llvm/Analysis/LoopAnalysisManager.h" 86 #include "llvm/Analysis/LoopInfo.h" 87 #include "llvm/Analysis/LoopIterator.h" 88 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 89 #include "llvm/Analysis/ProfileSummaryInfo.h" 90 #include "llvm/Analysis/ScalarEvolution.h" 91 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 92 #include "llvm/Analysis/TargetLibraryInfo.h" 93 #include "llvm/Analysis/TargetTransformInfo.h" 94 #include "llvm/Analysis/ValueTracking.h" 95 #include "llvm/Analysis/VectorUtils.h" 96 #include "llvm/IR/Attributes.h" 97 #include "llvm/IR/BasicBlock.h" 98 #include "llvm/IR/CFG.h" 99 #include "llvm/IR/Constant.h" 100 #include "llvm/IR/Constants.h" 101 #include "llvm/IR/DataLayout.h" 102 #include "llvm/IR/DebugInfo.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/MDBuilder.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/ProfDataUtils.h" 121 #include "llvm/IR/Type.h" 122 #include "llvm/IR/Use.h" 123 #include "llvm/IR/User.h" 124 #include "llvm/IR/Value.h" 125 #include "llvm/IR/ValueHandle.h" 126 #include "llvm/IR/Verifier.h" 127 #include "llvm/Support/Casting.h" 128 #include "llvm/Support/CommandLine.h" 129 #include "llvm/Support/Compiler.h" 130 #include "llvm/Support/Debug.h" 131 #include "llvm/Support/ErrorHandling.h" 132 #include "llvm/Support/InstructionCost.h" 133 #include "llvm/Support/MathExtras.h" 134 #include "llvm/Support/raw_ostream.h" 135 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 136 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 137 #include "llvm/Transforms/Utils/LoopSimplify.h" 138 #include "llvm/Transforms/Utils/LoopUtils.h" 139 #include "llvm/Transforms/Utils/LoopVersioning.h" 140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 141 #include "llvm/Transforms/Utils/SizeOpts.h" 142 #include 
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 143 #include <algorithm> 144 #include <cassert> 145 #include <cmath> 146 #include <cstdint> 147 #include <functional> 148 #include <iterator> 149 #include <limits> 150 #include <map> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold( 202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks")); 204 205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 206 // that predication is preferred, and this lists all options. I.e., the 207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 208 // and predicate the instructions accordingly. 
If tail-folding fails, there are 209 // different fallback strategies depending on these values: 210 namespace PreferPredicateTy { 211 enum Option { 212 ScalarEpilogue = 0, 213 PredicateElseScalarEpilogue, 214 PredicateOrDontVectorize 215 }; 216 } // namespace PreferPredicateTy 217 218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 219 "prefer-predicate-over-epilogue", 220 cl::init(PreferPredicateTy::ScalarEpilogue), 221 cl::Hidden, 222 cl::desc("Tail-folding and predication preferences over creating a scalar " 223 "epilogue loop."), 224 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 225 "scalar-epilogue", 226 "Don't tail-predicate loops, create scalar epilogue"), 227 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 228 "predicate-else-scalar-epilogue", 229 "prefer tail-folding, create scalar epilogue if tail " 230 "folding fails."), 231 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 232 "predicate-dont-vectorize", 233 "prefers tail-folding, don't attempt vectorization if " 234 "tail-folding fails."))); 235 236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( 237 "force-tail-folding-style", cl::desc("Force the tail folding style"), 238 cl::init(TailFoldingStyle::None), 239 cl::values( 240 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), 241 clEnumValN( 242 TailFoldingStyle::Data, "data", 243 "Create lane mask for data only, using active.lane.mask intrinsic"), 244 clEnumValN(TailFoldingStyle::DataWithoutLaneMask, 245 "data-without-lane-mask", 246 "Create lane mask with compare/stepvector"), 247 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", 248 "Create lane mask using active.lane.mask intrinsic, and use " 249 "it for both data and control flow"), 250 clEnumValN( 251 TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, 252 "data-and-control-without-rt-check", 253 "Similar to data-and-control, but remove the runtime check"))); 254 255 static cl::opt<bool> MaximizeBandwidth( 256 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 257 cl::desc("Maximize bandwidth when selecting vectorization factor which " 258 "will be determined by the smallest type in loop.")); 259 260 static cl::opt<bool> EnableInterleavedMemAccesses( 261 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 262 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 263 264 /// An interleave-group may need masking if it resides in a block that needs 265 /// predication, or in order to mask away gaps. 
266 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 267 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 268 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 269 270 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 271 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 272 cl::desc("We don't interleave loops with a estimated constant trip count " 273 "below this number")); 274 275 static cl::opt<unsigned> ForceTargetNumScalarRegs( 276 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 277 cl::desc("A flag that overrides the target's number of scalar registers.")); 278 279 static cl::opt<unsigned> ForceTargetNumVectorRegs( 280 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 281 cl::desc("A flag that overrides the target's number of vector registers.")); 282 283 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 284 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 285 cl::desc("A flag that overrides the target's max interleave factor for " 286 "scalar loops.")); 287 288 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 289 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 290 cl::desc("A flag that overrides the target's max interleave factor for " 291 "vectorized loops.")); 292 293 static cl::opt<unsigned> ForceTargetInstructionCost( 294 "force-target-instruction-cost", cl::init(0), cl::Hidden, 295 cl::desc("A flag that overrides the target's expected cost for " 296 "an instruction to a single constant value. Mostly " 297 "useful for getting consistent testing.")); 298 299 static cl::opt<bool> ForceTargetSupportsScalableVectors( 300 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 301 cl::desc( 302 "Pretend that scalable vectors are supported, even if the target does " 303 "not support them. This flag should only be used for testing.")); 304 305 static cl::opt<unsigned> SmallLoopCost( 306 "small-loop-cost", cl::init(20), cl::Hidden, 307 cl::desc( 308 "The cost of a loop that is considered 'small' by the interleaver.")); 309 310 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 311 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 312 cl::desc("Enable the use of the block frequency analysis to access PGO " 313 "heuristics minimizing code growth in cold regions and being more " 314 "aggressive in hot regions.")); 315 316 // Runtime interleave loops for load/store throughput. 317 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 318 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 319 cl::desc( 320 "Enable runtime interleaving until load/store ports are saturated")); 321 322 /// Interleave small loops with scalar reductions. 323 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 324 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 325 cl::desc("Enable interleaving for loops with small iteration counts that " 326 "contain scalar reductions to expose ILP.")); 327 328 /// The number of stores in a loop that are allowed to need predication. 
329 static cl::opt<unsigned> NumberOfStoresToPredicate( 330 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 331 cl::desc("Max number of stores to be predicated behind an if.")); 332 333 static cl::opt<bool> EnableIndVarRegisterHeur( 334 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 335 cl::desc("Count the induction variable only once when interleaving")); 336 337 static cl::opt<bool> EnableCondStoresVectorization( 338 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 339 cl::desc("Enable if predication of stores during vectorization.")); 340 341 static cl::opt<unsigned> MaxNestedScalarReductionIC( 342 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 343 cl::desc("The maximum interleave count to use when interleaving a scalar " 344 "reduction in a nested loop.")); 345 346 static cl::opt<bool> 347 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 348 cl::Hidden, 349 cl::desc("Prefer in-loop vector reductions, " 350 "overriding the targets preference.")); 351 352 static cl::opt<bool> ForceOrderedReductions( 353 "force-ordered-reductions", cl::init(false), cl::Hidden, 354 cl::desc("Enable the vectorisation of loops with in-order (strict) " 355 "FP reductions")); 356 357 static cl::opt<bool> PreferPredicatedReductionSelect( 358 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 359 cl::desc( 360 "Prefer predicating a reduction operation over an after loop select.")); 361 362 namespace llvm { 363 cl::opt<bool> EnableVPlanNativePath( 364 "enable-vplan-native-path", cl::Hidden, 365 cl::desc("Enable VPlan-native vectorization path with " 366 "support for outer loop vectorization.")); 367 } 368 369 // This flag enables the stress testing of the VPlan H-CFG construction in the 370 // VPlan-native vectorization path. It must be used in conjuction with 371 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 372 // verification of the H-CFGs built. 373 static cl::opt<bool> VPlanBuildStressTest( 374 "vplan-build-stress-test", cl::init(false), cl::Hidden, 375 cl::desc( 376 "Build VPlan for every supported loop nest in the function and bail " 377 "out right after the build (stress test the VPlan H-CFG construction " 378 "in the VPlan-native vectorization path).")); 379 380 cl::opt<bool> llvm::EnableLoopInterleaving( 381 "interleave-loops", cl::init(true), cl::Hidden, 382 cl::desc("Enable loop interleaving in Loop vectorization passes")); 383 cl::opt<bool> llvm::EnableLoopVectorization( 384 "vectorize-loops", cl::init(true), cl::Hidden, 385 cl::desc("Run the Loop vectorization passes")); 386 387 static cl::opt<bool> PrintVPlansInDotFormat( 388 "vplan-print-in-dot-format", cl::Hidden, 389 cl::desc("Use dot format instead of plain text when dumping VPlans")); 390 391 static cl::opt<cl::boolOrDefault> ForceSafeDivisor( 392 "force-widen-divrem-via-safe-divisor", cl::Hidden, 393 cl::desc( 394 "Override cost based safe divisor widening for div/rem instructions")); 395 396 static cl::opt<bool> UseWiderVFIfCallVariantsPresent( 397 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), 398 cl::Hidden, 399 cl::desc("Try wider VFs if they enable the use of vector variants")); 400 401 // Likelyhood of bypassing the vectorized loop because assumptions about SCEV 402 // variables not overflowing do not hold. See `emitSCEVChecks`. 403 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; 404 // Likelyhood of bypassing the vectorized loop because pointers overlap. See 405 // `emitMemRuntimeChecks`. 
406 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; 407 // Likelyhood of bypassing the vectorized loop because there are zero trips left 408 // after prolog. See `emitIterationCountCheck`. 409 static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; 410 411 /// A helper function that returns true if the given type is irregular. The 412 /// type is irregular if its allocated size doesn't equal the store size of an 413 /// element of the corresponding vector type. 414 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 415 // Determine if an array of N elements of type Ty is "bitcast compatible" 416 // with a <N x Ty> vector. 417 // This is only true if there is no padding between the array elements. 418 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 419 } 420 421 /// A helper function that returns the reciprocal of the block probability of 422 /// predicated blocks. If we return X, we are assuming the predicated block 423 /// will execute once for every X iterations of the loop header. 424 /// 425 /// TODO: We should use actual block probability here, if available. Currently, 426 /// we always assume predicated blocks have a 50% chance of executing. 427 static unsigned getReciprocalPredBlockProb() { return 2; } 428 429 /// Returns "best known" trip count for the specified loop \p L as defined by 430 /// the following procedure: 431 /// 1) Returns exact trip count if it is known. 432 /// 2) Returns expected trip count according to profile data if any. 433 /// 3) Returns upper bound estimate if it is known. 434 /// 4) Returns std::nullopt if all of the above failed. 435 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, 436 Loop *L) { 437 // Check if exact trip count is known. 438 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 439 return ExpectedTC; 440 441 // Check if there is an expected trip count available from profile data. 442 if (LoopVectorizeWithBlockFrequency) 443 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 444 return *EstimatedTC; 445 446 // Check if upper bound estimate is known. 447 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 448 return ExpectedTC; 449 450 return std::nullopt; 451 } 452 453 /// Return a vector containing interleaved elements from multiple 454 /// smaller input vectors. 455 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 456 const Twine &Name) { 457 unsigned Factor = Vals.size(); 458 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 459 460 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 461 #ifndef NDEBUG 462 for (Value *Val : Vals) 463 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 464 #endif 465 466 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 467 // must use intrinsics to interleave. 468 if (VecTy->isScalableTy()) { 469 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 470 return Builder.CreateIntrinsic( 471 WideVecTy, Intrinsic::experimental_vector_interleave2, Vals, 472 /*FMFSource=*/nullptr, Name); 473 } 474 475 // Fixed length. Start by concatenating all vectors into a wide vector. 476 Value *WideVec = concatenateVectors(Builder, Vals); 477 478 // Interleave the elements into the wide vector. 
479 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 480 return Builder.CreateShuffleVector( 481 WideVec, createInterleaveMask(NumElts, Factor), Name); 482 } 483 484 namespace { 485 // Forward declare GeneratedRTChecks. 486 class GeneratedRTChecks; 487 488 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; 489 } // namespace 490 491 namespace llvm { 492 493 AnalysisKey ShouldRunExtraVectorPasses::Key; 494 495 /// InnerLoopVectorizer vectorizes loops which contain only one basic 496 /// block to a specified vectorization factor (VF). 497 /// This class performs the widening of scalars into vectors, or multiple 498 /// scalars. This class also implements the following features: 499 /// * It inserts an epilogue loop for handling loops that don't have iteration 500 /// counts that are known to be a multiple of the vectorization factor. 501 /// * It handles the code generation for reduction variables. 502 /// * Scalarization (implementation using scalars) of un-vectorizable 503 /// instructions. 504 /// InnerLoopVectorizer does not perform any vectorization-legality 505 /// checks, and relies on the caller to check for the different legality 506 /// aspects. The InnerLoopVectorizer relies on the 507 /// LoopVectorizationLegality class to provide information about the induction 508 /// and reduction variables that were found to a given vectorization factor. 509 class InnerLoopVectorizer { 510 public: 511 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 512 LoopInfo *LI, DominatorTree *DT, 513 const TargetLibraryInfo *TLI, 514 const TargetTransformInfo *TTI, AssumptionCache *AC, 515 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 516 ElementCount MinProfitableTripCount, 517 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 518 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 519 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 520 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 521 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 522 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 523 PSI(PSI), RTChecks(RTChecks) { 524 // Query this against the original loop and save it here because the profile 525 // of the original loop header may change as the transformation happens. 526 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 527 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 528 529 if (MinProfitableTripCount.isZero()) 530 this->MinProfitableTripCount = VecWidth; 531 else 532 this->MinProfitableTripCount = MinProfitableTripCount; 533 } 534 535 virtual ~InnerLoopVectorizer() = default; 536 537 /// Create a new empty loop that will contain vectorized instructions later 538 /// on, while the old loop will be used as the scalar remainder. Control flow 539 /// is generated around the vectorized (and scalar epilogue) loops consisting 540 /// of various checks and bypasses. Return the pre-header block of the new 541 /// loop and the start value for the canonical induction, if it is != 0. The 542 /// latter is the case when vectorizing the epilogue loop. In the case of 543 /// epilogue vectorization, this function is overriden to handle the more 544 /// complex control flow around the loops. \p ExpandedSCEVs is used to 545 /// look up SCEV expansions for expressions needed during skeleton creation. 
546 virtual std::pair<BasicBlock *, Value *> 547 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); 548 549 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 550 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); 551 552 // Return true if any runtime check is added. 553 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 554 555 /// A type for vectorized values in the new loop. Each value from the 556 /// original loop, when vectorized, is represented by UF vector values in the 557 /// new unrolled loop, where UF is the unroll factor. 558 using VectorParts = SmallVector<Value *, 2>; 559 560 /// A helper function to scalarize a single Instruction in the innermost loop. 561 /// Generates a sequence of scalar instances for each lane between \p MinLane 562 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 563 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 564 /// Instr's operands. 565 void scalarizeInstruction(const Instruction *Instr, 566 VPReplicateRecipe *RepRecipe, 567 const VPIteration &Instance, 568 VPTransformState &State); 569 570 /// Try to vectorize interleaved access group \p Group with the base address 571 /// given in \p Addr, optionally masking the vector operations if \p 572 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 573 /// values in the vectorized loop. 574 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 575 ArrayRef<VPValue *> VPDefs, 576 VPTransformState &State, VPValue *Addr, 577 ArrayRef<VPValue *> StoredValues, 578 VPValue *BlockInMask, bool NeedsMaskForGaps); 579 580 /// Fix the non-induction PHIs in \p Plan. 581 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); 582 583 /// Returns true if the reordering of FP operations is not allowed, but we are 584 /// able to vectorize with strict in-order reductions for the given RdxDesc. 585 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); 586 587 /// Create a new phi node for the induction variable \p OrigPhi to resume 588 /// iteration count in the scalar epilogue, from where the vectorized loop 589 /// left off. \p Step is the SCEV-expanded induction step to use. In cases 590 /// where the loop skeleton is more complicated (i.e., epilogue vectorization) 591 /// and the resume values can come from an additional bypass block, the \p 592 /// AdditionalBypass pair provides information about the bypass block and the 593 /// end value on the edge from bypass to this loop. 594 PHINode *createInductionResumeValue( 595 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, 596 ArrayRef<BasicBlock *> BypassBlocks, 597 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 598 599 /// Returns the original loop trip count. 600 Value *getTripCount() const { return TripCount; } 601 602 /// Used to set the trip count after ILV's construction and after the 603 /// preheader block has been executed. Note that this always holds the trip 604 /// count of the original loop for both main loop and epilogue vectorization. 605 void setTripCount(Value *TC) { TripCount = TC; } 606 607 protected: 608 friend class LoopVectorizationPlanner; 609 610 /// A small list of PHINodes. 611 using PhiVector = SmallVector<PHINode *, 4>; 612 613 /// A type for scalarized values in the new loop. 
Each value from the 614 /// original loop, when scalarized, is represented by UF x VF scalar values 615 /// in the new unrolled loop, where UF is the unroll factor and VF is the 616 /// vectorization factor. 617 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 618 619 /// Set up the values of the IVs correctly when exiting the vector loop. 620 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 621 Value *VectorTripCount, Value *EndValue, 622 BasicBlock *MiddleBlock, BasicBlock *VectorHeader, 623 VPlan &Plan, VPTransformState &State); 624 625 /// Create the exit value of first order recurrences in the middle block and 626 /// update their users. 627 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, 628 VPTransformState &State); 629 630 /// Create code for the loop exit value of the reduction. 631 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 632 633 /// Iteratively sink the scalarized operands of a predicated instruction into 634 /// the block that was created for it. 635 void sinkScalarOperands(Instruction *PredInst); 636 637 /// Returns (and creates if needed) the trip count of the widened loop. 638 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); 639 640 /// Returns a bitcasted value to the requested vector type. 641 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 642 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 643 const DataLayout &DL); 644 645 /// Emit a bypass check to see if the vector trip count is zero, including if 646 /// it overflows. 647 void emitIterationCountCheck(BasicBlock *Bypass); 648 649 /// Emit a bypass check to see if all of the SCEV assumptions we've 650 /// had to make are correct. Returns the block containing the checks or 651 /// nullptr if no checks have been added. 652 BasicBlock *emitSCEVChecks(BasicBlock *Bypass); 653 654 /// Emit bypass checks to check any memory assumptions we may have made. 655 /// Returns the block containing the checks or nullptr if no checks have been 656 /// added. 657 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); 658 659 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 660 /// vector loop preheader, middle block and scalar preheader. 661 void createVectorLoopSkeleton(StringRef Prefix); 662 663 /// Create new phi nodes for the induction variables to resume iteration count 664 /// in the scalar epilogue, from where the vectorized loop left off. 665 /// In cases where the loop skeleton is more complicated (eg. epilogue 666 /// vectorization) and the resume values can come from an additional bypass 667 /// block, the \p AdditionalBypass pair provides information about the bypass 668 /// block and the end value on the edge from bypass to this loop. 669 void createInductionResumeValues( 670 const SCEV2ValueTy &ExpandedSCEVs, 671 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 672 673 /// Complete the loop skeleton by adding debug MDs, creating appropriate 674 /// conditional branches in the middle block, preparing the builder and 675 /// running the verifier. Return the preheader of the completed vector loop. 676 BasicBlock *completeLoopSkeleton(); 677 678 /// Collect poison-generating recipes that may generate a poison value that is 679 /// used after vectorization, even when their operands are not poison. 
Those 680 /// recipes meet the following conditions: 681 /// * Contribute to the address computation of a recipe generating a widen 682 /// memory load/store (VPWidenMemoryInstructionRecipe or 683 /// VPInterleaveRecipe). 684 /// * Such a widen memory load/store has at least one underlying Instruction 685 /// that is in a basic block that needs predication and after vectorization 686 /// the generated instruction won't be predicated. 687 void collectPoisonGeneratingRecipes(VPTransformState &State); 688 689 /// Allow subclasses to override and print debug traces before/after vplan 690 /// execution, when trace information is requested. 691 virtual void printDebugTracesAtStart(){}; 692 virtual void printDebugTracesAtEnd(){}; 693 694 /// The original loop. 695 Loop *OrigLoop; 696 697 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 698 /// dynamic knowledge to simplify SCEV expressions and converts them to a 699 /// more usable form. 700 PredicatedScalarEvolution &PSE; 701 702 /// Loop Info. 703 LoopInfo *LI; 704 705 /// Dominator Tree. 706 DominatorTree *DT; 707 708 /// Target Library Info. 709 const TargetLibraryInfo *TLI; 710 711 /// Target Transform Info. 712 const TargetTransformInfo *TTI; 713 714 /// Assumption Cache. 715 AssumptionCache *AC; 716 717 /// Interface to emit optimization remarks. 718 OptimizationRemarkEmitter *ORE; 719 720 /// The vectorization SIMD factor to use. Each vector will have this many 721 /// vector elements. 722 ElementCount VF; 723 724 ElementCount MinProfitableTripCount; 725 726 /// The vectorization unroll factor to use. Each scalar is vectorized to this 727 /// many different vector instructions. 728 unsigned UF; 729 730 /// The builder that we use 731 IRBuilder<> Builder; 732 733 // --- Vectorization state --- 734 735 /// The vector-loop preheader. 736 BasicBlock *LoopVectorPreHeader; 737 738 /// The scalar-loop preheader. 739 BasicBlock *LoopScalarPreHeader; 740 741 /// Middle Block between the vector and the scalar. 742 BasicBlock *LoopMiddleBlock; 743 744 /// The unique ExitBlock of the scalar loop if one exists. Note that 745 /// there can be multiple exiting edges reaching this block. 746 BasicBlock *LoopExitBlock; 747 748 /// The scalar loop body. 749 BasicBlock *LoopScalarBody; 750 751 /// A list of all bypass blocks. The first block is the entry of the loop. 752 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 753 754 /// Store instructions that were predicated. 755 SmallVector<Instruction *, 4> PredicatedInstructions; 756 757 /// Trip count of the original loop. 758 Value *TripCount = nullptr; 759 760 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 761 Value *VectorTripCount = nullptr; 762 763 /// The legality analysis. 764 LoopVectorizationLegality *Legal; 765 766 /// The profitablity analysis. 767 LoopVectorizationCostModel *Cost; 768 769 // Record whether runtime checks are added. 770 bool AddedSafetyChecks = false; 771 772 // Holds the end values for each induction variable. We save the end values 773 // so we can later fix-up the external users of the induction variables. 774 DenseMap<PHINode *, Value *> IVEndValues; 775 776 /// BFI and PSI are used to check for profile guided size optimizations. 777 BlockFrequencyInfo *BFI; 778 ProfileSummaryInfo *PSI; 779 780 // Whether this loop should be optimized for size based on profile guided size 781 // optimizatios. 
782 bool OptForSizeBasedOnProfile; 783 784 /// Structure to hold information about generated runtime checks, responsible 785 /// for cleaning the checks, if vectorization turns out unprofitable. 786 GeneratedRTChecks &RTChecks; 787 788 // Holds the resume values for reductions in the loops, used to set the 789 // correct start value of reduction PHIs when vectorizing the epilogue. 790 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4> 791 ReductionResumeValues; 792 }; 793 794 class InnerLoopUnroller : public InnerLoopVectorizer { 795 public: 796 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 797 LoopInfo *LI, DominatorTree *DT, 798 const TargetLibraryInfo *TLI, 799 const TargetTransformInfo *TTI, AssumptionCache *AC, 800 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 801 LoopVectorizationLegality *LVL, 802 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 803 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 804 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 805 ElementCount::getFixed(1), 806 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 807 BFI, PSI, Check) {} 808 }; 809 810 /// Encapsulate information regarding vectorization of a loop and its epilogue. 811 /// This information is meant to be updated and used across two stages of 812 /// epilogue vectorization. 813 struct EpilogueLoopVectorizationInfo { 814 ElementCount MainLoopVF = ElementCount::getFixed(0); 815 unsigned MainLoopUF = 0; 816 ElementCount EpilogueVF = ElementCount::getFixed(0); 817 unsigned EpilogueUF = 0; 818 BasicBlock *MainLoopIterationCountCheck = nullptr; 819 BasicBlock *EpilogueIterationCountCheck = nullptr; 820 BasicBlock *SCEVSafetyCheck = nullptr; 821 BasicBlock *MemSafetyCheck = nullptr; 822 Value *TripCount = nullptr; 823 Value *VectorTripCount = nullptr; 824 825 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, 826 ElementCount EVF, unsigned EUF) 827 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { 828 assert(EUF == 1 && 829 "A high UF for the epilogue loop is likely not beneficial."); 830 } 831 }; 832 833 /// An extension of the inner loop vectorizer that creates a skeleton for a 834 /// vectorized loop that has its epilogue (residual) also vectorized. 835 /// The idea is to run the vplan on a given loop twice, firstly to setup the 836 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 837 /// from the first step and vectorize the epilogue. This is achieved by 838 /// deriving two concrete strategy classes from this base class and invoking 839 /// them in succession from the loop vectorizer planner. 840 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 841 public: 842 InnerLoopAndEpilogueVectorizer( 843 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 844 DominatorTree *DT, const TargetLibraryInfo *TLI, 845 const TargetTransformInfo *TTI, AssumptionCache *AC, 846 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 847 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 848 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 849 GeneratedRTChecks &Checks) 850 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 851 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, 852 CM, BFI, PSI, Checks), 853 EPI(EPI) {} 854 855 // Override this function to handle the more complex control flow around the 856 // three loops. 
857 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton( 858 const SCEV2ValueTy &ExpandedSCEVs) final { 859 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); 860 } 861 862 /// The interface for creating a vectorized skeleton using one of two 863 /// different strategies, each corresponding to one execution of the vplan 864 /// as described above. 865 virtual std::pair<BasicBlock *, Value *> 866 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0; 867 868 /// Holds and updates state information required to vectorize the main loop 869 /// and its epilogue in two separate passes. This setup helps us avoid 870 /// regenerating and recomputing runtime safety checks. It also helps us to 871 /// shorten the iteration-count-check path length for the cases where the 872 /// iteration count of the loop is so small that the main vector loop is 873 /// completely skipped. 874 EpilogueLoopVectorizationInfo &EPI; 875 }; 876 877 /// A specialized derived class of inner loop vectorizer that performs 878 /// vectorization of *main* loops in the process of vectorizing loops and their 879 /// epilogues. 880 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 881 public: 882 EpilogueVectorizerMainLoop( 883 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 884 DominatorTree *DT, const TargetLibraryInfo *TLI, 885 const TargetTransformInfo *TTI, AssumptionCache *AC, 886 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 887 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 888 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 889 GeneratedRTChecks &Check) 890 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 891 EPI, LVL, CM, BFI, PSI, Check) {} 892 /// Implements the interface for creating a vectorized skeleton using the 893 /// *main loop* strategy (ie the first pass of vplan execution). 894 std::pair<BasicBlock *, Value *> 895 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; 896 897 protected: 898 /// Emits an iteration count bypass check once for the main loop (when \p 899 /// ForEpilogue is false) and once for the epilogue loop (when \p 900 /// ForEpilogue is true). 901 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); 902 void printDebugTracesAtStart() override; 903 void printDebugTracesAtEnd() override; 904 }; 905 906 // A specialized derived class of inner loop vectorizer that performs 907 // vectorization of *epilogue* loops in the process of vectorizing loops and 908 // their epilogues. 909 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 910 public: 911 EpilogueVectorizerEpilogueLoop( 912 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 913 DominatorTree *DT, const TargetLibraryInfo *TLI, 914 const TargetTransformInfo *TTI, AssumptionCache *AC, 915 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 916 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 917 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 918 GeneratedRTChecks &Checks) 919 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 920 EPI, LVL, CM, BFI, PSI, Checks) { 921 TripCount = EPI.TripCount; 922 } 923 /// Implements the interface for creating a vectorized skeleton using the 924 /// *epilogue loop* strategy (ie the second pass of vplan execution). 
925 std::pair<BasicBlock *, Value *> 926 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; 927 928 protected: 929 /// Emits an iteration count bypass check after the main vector loop has 930 /// finished to see if there are any iterations left to execute by either 931 /// the vector epilogue or the scalar epilogue. 932 BasicBlock *emitMinimumVectorEpilogueIterCountCheck( 933 BasicBlock *Bypass, 934 BasicBlock *Insert); 935 void printDebugTracesAtStart() override; 936 void printDebugTracesAtEnd() override; 937 }; 938 } // end namespace llvm 939 940 /// Look for a meaningful debug location on the instruction or it's 941 /// operands. 942 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { 943 if (!I) 944 return DebugLoc(); 945 946 DebugLoc Empty; 947 if (I->getDebugLoc() != Empty) 948 return I->getDebugLoc(); 949 950 for (Use &Op : I->operands()) { 951 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 952 if (OpInst->getDebugLoc() != Empty) 953 return OpInst->getDebugLoc(); 954 } 955 956 return I->getDebugLoc(); 957 } 958 959 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 960 /// is passed, the message relates to that particular instruction. 961 #ifndef NDEBUG 962 static void debugVectorizationMessage(const StringRef Prefix, 963 const StringRef DebugMsg, 964 Instruction *I) { 965 dbgs() << "LV: " << Prefix << DebugMsg; 966 if (I != nullptr) 967 dbgs() << " " << *I; 968 else 969 dbgs() << '.'; 970 dbgs() << '\n'; 971 } 972 #endif 973 974 /// Create an analysis remark that explains why vectorization failed 975 /// 976 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 977 /// RemarkName is the identifier for the remark. If \p I is passed it is an 978 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 979 /// the location of the remark. \return the remark object that can be 980 /// streamed to. 981 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 982 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 983 Value *CodeRegion = TheLoop->getHeader(); 984 DebugLoc DL = TheLoop->getStartLoc(); 985 986 if (I) { 987 CodeRegion = I->getParent(); 988 // If there is no debug location attached to the instruction, revert back to 989 // using the loop's. 990 if (I->getDebugLoc()) 991 DL = I->getDebugLoc(); 992 } 993 994 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 995 } 996 997 namespace llvm { 998 999 /// Return a value for Step multiplied by VF. 1000 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, 1001 int64_t Step) { 1002 assert(Ty->isIntegerTy() && "Expected an integer step"); 1003 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); 1004 } 1005 1006 /// Return the runtime value for VF. 
1007 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { 1008 return B.CreateElementCount(Ty, VF); 1009 } 1010 1011 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, 1012 Loop *OrigLoop) { 1013 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 1014 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count"); 1015 1016 ScalarEvolution &SE = *PSE.getSE(); 1017 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop); 1018 } 1019 1020 void reportVectorizationFailure(const StringRef DebugMsg, 1021 const StringRef OREMsg, const StringRef ORETag, 1022 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1023 Instruction *I) { 1024 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1025 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1026 ORE->emit( 1027 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1028 << "loop not vectorized: " << OREMsg); 1029 } 1030 1031 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1032 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1033 Instruction *I) { 1034 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1035 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1036 ORE->emit( 1037 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1038 << Msg); 1039 } 1040 1041 /// Report successful vectorization of the loop. In case an outer loop is 1042 /// vectorized, prepend "outer" to the vectorization remark. 1043 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1044 VectorizationFactor VF, unsigned IC) { 1045 LLVM_DEBUG(debugVectorizationMessage( 1046 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop", 1047 nullptr)); 1048 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer "; 1049 ORE->emit([&]() { 1050 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(), 1051 TheLoop->getHeader()) 1052 << "vectorized " << LoopType << "loop (vectorization width: " 1053 << ore::NV("VectorizationFactor", VF.Width) 1054 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")"; 1055 }); 1056 } 1057 1058 } // end namespace llvm 1059 1060 #ifndef NDEBUG 1061 /// \return string containing a file name and a line # for the given loop. 1062 static std::string getDebugLocString(const Loop *L) { 1063 std::string Result; 1064 if (L) { 1065 raw_string_ostream OS(Result); 1066 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1067 LoopDbgLoc.print(OS); 1068 else 1069 // Just print the module name. 1070 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1071 OS.flush(); 1072 } 1073 return Result; 1074 } 1075 #endif 1076 1077 void InnerLoopVectorizer::collectPoisonGeneratingRecipes( 1078 VPTransformState &State) { 1079 1080 // Collect recipes in the backward slice of `Root` that may generate a poison 1081 // value that is used after vectorization. 1082 SmallPtrSet<VPRecipeBase *, 16> Visited; 1083 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { 1084 SmallVector<VPRecipeBase *, 16> Worklist; 1085 Worklist.push_back(Root); 1086 1087 // Traverse the backward slice of Root through its use-def chain. 1088 while (!Worklist.empty()) { 1089 VPRecipeBase *CurRec = Worklist.back(); 1090 Worklist.pop_back(); 1091 1092 if (!Visited.insert(CurRec).second) 1093 continue; 1094 1095 // Prune search if we find another recipe generating a widen memory 1096 // instruction. 
Widen memory instructions involved in address computation 1097 // will lead to gather/scatter instructions, which don't need to be 1098 // handled. 1099 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || 1100 isa<VPInterleaveRecipe>(CurRec) || 1101 isa<VPScalarIVStepsRecipe>(CurRec) || 1102 isa<VPCanonicalIVPHIRecipe>(CurRec) || 1103 isa<VPActiveLaneMaskPHIRecipe>(CurRec)) 1104 continue; 1105 1106 // This recipe contributes to the address computation of a widen 1107 // load/store. If the underlying instruction has poison-generating flags, 1108 // drop them directly. 1109 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) { 1110 RecWithFlags->dropPoisonGeneratingFlags(); 1111 } else { 1112 Instruction *Instr = dyn_cast_or_null<Instruction>( 1113 CurRec->getVPSingleValue()->getUnderlyingValue()); 1114 (void)Instr; 1115 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && 1116 "found instruction with poison generating flags not covered by " 1117 "VPRecipeWithIRFlags"); 1118 } 1119 1120 // Add new definitions to the worklist. 1121 for (VPValue *operand : CurRec->operands()) 1122 if (VPRecipeBase *OpDef = operand->getDefiningRecipe()) 1123 Worklist.push_back(OpDef); 1124 } 1125 }); 1126 1127 // Traverse all the recipes in the VPlan and collect the poison-generating 1128 // recipes in the backward slice starting at the address of a VPWidenRecipe or 1129 // VPInterleaveRecipe. 1130 auto Iter = vp_depth_first_deep(State.Plan->getEntry()); 1131 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 1132 for (VPRecipeBase &Recipe : *VPBB) { 1133 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { 1134 Instruction &UnderlyingInstr = WidenRec->getIngredient(); 1135 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); 1136 if (AddrDef && WidenRec->isConsecutive() && 1137 Legal->blockNeedsPredication(UnderlyingInstr.getParent())) 1138 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); 1139 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { 1140 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); 1141 if (AddrDef) { 1142 // Check if any member of the interleave group needs predication. 1143 const InterleaveGroup<Instruction> *InterGroup = 1144 InterleaveRec->getInterleaveGroup(); 1145 bool NeedPredication = false; 1146 for (int I = 0, NumMembers = InterGroup->getNumMembers(); 1147 I < NumMembers; ++I) { 1148 Instruction *Member = InterGroup->getMember(I); 1149 if (Member) 1150 NeedPredication |= 1151 Legal->blockNeedsPredication(Member->getParent()); 1152 } 1153 1154 if (NeedPredication) 1155 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); 1156 } 1157 } 1158 } 1159 } 1160 } 1161 1162 namespace llvm { 1163 1164 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1165 // lowered. 1166 enum ScalarEpilogueLowering { 1167 1168 // The default: allowing scalar epilogues. 1169 CM_ScalarEpilogueAllowed, 1170 1171 // Vectorization with OptForSize: don't allow epilogues. 1172 CM_ScalarEpilogueNotAllowedOptSize, 1173 1174 // A special case of vectorisation with OptForSize: loops with a very small 1175 // trip count are considered for vectorization under OptForSize, thereby 1176 // making sure the cost of their loop body is dominant, free of runtime 1177 // guards and scalar iteration overheads. 1178 CM_ScalarEpilogueNotAllowedLowTripLoop, 1179 1180 // Loop hint predicate indicating an epilogue is undesired. 
1181 CM_ScalarEpilogueNotNeededUsePredicate, 1182 1183 // Directive indicating we must either tail fold or not vectorize 1184 CM_ScalarEpilogueNotAllowedUsePredicate 1185 }; 1186 1187 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1188 1189 /// LoopVectorizationCostModel - estimates the expected speedups due to 1190 /// vectorization. 1191 /// In many cases vectorization is not profitable. This can happen because of 1192 /// a number of reasons. In this class we mainly attempt to predict the 1193 /// expected speedup/slowdowns due to the supported instruction set. We use the 1194 /// TargetTransformInfo to query the different backends for the cost of 1195 /// different operations. 1196 class LoopVectorizationCostModel { 1197 public: 1198 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1199 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1200 LoopVectorizationLegality *Legal, 1201 const TargetTransformInfo &TTI, 1202 const TargetLibraryInfo *TLI, DemandedBits *DB, 1203 AssumptionCache *AC, 1204 OptimizationRemarkEmitter *ORE, const Function *F, 1205 const LoopVectorizeHints *Hints, 1206 InterleavedAccessInfo &IAI) 1207 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1208 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1209 Hints(Hints), InterleaveInfo(IAI) {} 1210 1211 /// \return An upper bound for the vectorization factors (both fixed and 1212 /// scalable). If the factors are 0, vectorization and interleaving should be 1213 /// avoided up front. 1214 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1215 1216 /// \return True if runtime checks are required for vectorization, and false 1217 /// otherwise. 1218 bool runtimeChecksRequired(); 1219 1220 /// Setup cost-based decisions for user vectorization factor. 1221 /// \return true if the UserVF is a feasible VF to be chosen. 1222 bool selectUserVectorizationFactor(ElementCount UserVF) { 1223 collectUniformsAndScalars(UserVF); 1224 collectInstsToScalarize(UserVF); 1225 return expectedCost(UserVF).first.isValid(); 1226 } 1227 1228 /// \return The size (in bits) of the smallest and widest types in the code 1229 /// that needs to be vectorized. We ignore values that remain scalar such as 1230 /// 64 bit loop indices. 1231 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1232 1233 /// \return The desired interleave count. 1234 /// If interleave count has been specified by metadata it will be returned. 1235 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1236 /// are the selected vectorization factor and the cost of the selected VF. 1237 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); 1238 1239 /// Memory access instruction may be vectorized in more than one way. 1240 /// Form of instruction after vectorization depends on cost. 1241 /// This function takes cost-based decisions for Load/Store instructions 1242 /// and collects them in a map. This decisions map is used for building 1243 /// the lists of loop-uniform and loop-scalar instructions. 1244 /// The calculated cost is saved with widening decision in order to 1245 /// avoid redundant calculations. 1246 void setCostBasedWideningDecision(ElementCount VF); 1247 1248 /// A call may be vectorized in different ways depending on whether we have 1249 /// vectorized variants available and whether the target supports masking. 
1250 /// This function analyzes all calls in the function at the supplied VF, 1251 /// makes a decision based on the costs of available options, and stores that 1252 /// decision in a map for use in planning and plan execution. 1253 void setVectorizedCallDecision(ElementCount VF); 1254 1255 /// A struct that represents some properties of the register usage 1256 /// of a loop. 1257 struct RegisterUsage { 1258 /// Holds the number of loop invariant values that are used in the loop. 1259 /// The key is ClassID of target-provided register class. 1260 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1261 /// Holds the maximum number of concurrent live intervals in the loop. 1262 /// The key is ClassID of target-provided register class. 1263 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1264 }; 1265 1266 /// \return Returns information about the register usages of the loop for the 1267 /// given vectorization factors. 1268 SmallVector<RegisterUsage, 8> 1269 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1270 1271 /// Collect values we want to ignore in the cost model. 1272 void collectValuesToIgnore(); 1273 1274 /// Collect all element types in the loop for which widening is needed. 1275 void collectElementTypesForWidening(); 1276 1277 /// Split reductions into those that happen in the loop, and those that happen 1278 /// outside. In loop reductions are collected into InLoopReductions. 1279 void collectInLoopReductions(); 1280 1281 /// Returns true if we should use strict in-order reductions for the given 1282 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1283 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1284 /// of FP operations. 1285 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { 1286 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1287 } 1288 1289 /// \returns The smallest bitwidth each instruction can be represented with. 1290 /// The vector equivalents of these instructions should be truncated to this 1291 /// type. 1292 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1293 return MinBWs; 1294 } 1295 1296 /// \returns True if it is more profitable to scalarize instruction \p I for 1297 /// vectorization factor \p VF. 1298 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1299 assert(VF.isVector() && 1300 "Profitable to scalarize relevant only for VF > 1."); 1301 1302 // Cost model is not run in the VPlan-native path - return conservative 1303 // result until this changes. 1304 if (EnableVPlanNativePath) 1305 return false; 1306 1307 auto Scalars = InstsToScalarize.find(VF); 1308 assert(Scalars != InstsToScalarize.end() && 1309 "VF not yet analyzed for scalarization profitability"); 1310 return Scalars->second.contains(I); 1311 } 1312 1313 /// Returns true if \p I is known to be uniform after vectorization. 1314 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1315 // Pseudo probe needs to be duplicated for each unrolled iteration and 1316 // vector lane so that profiled loop trip count can be accurately 1317 // accumulated instead of being under counted. 1318 if (isa<PseudoProbeInst>(I)) 1319 return false; 1320 1321 if (VF.isScalar()) 1322 return true; 1323 1324 // Cost model is not run in the VPlan-native path - return conservative 1325 // result until this changes. 
1326 if (EnableVPlanNativePath) 1327 return false; 1328 1329 auto UniformsPerVF = Uniforms.find(VF); 1330 assert(UniformsPerVF != Uniforms.end() && 1331 "VF not yet analyzed for uniformity"); 1332 return UniformsPerVF->second.count(I); 1333 } 1334 1335 /// Returns true if \p I is known to be scalar after vectorization. 1336 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1337 if (VF.isScalar()) 1338 return true; 1339 1340 // Cost model is not run in the VPlan-native path - return conservative 1341 // result until this changes. 1342 if (EnableVPlanNativePath) 1343 return false; 1344 1345 auto ScalarsPerVF = Scalars.find(VF); 1346 assert(ScalarsPerVF != Scalars.end() && 1347 "Scalar values are not calculated for VF"); 1348 return ScalarsPerVF->second.count(I); 1349 } 1350 1351 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1352 /// for vectorization factor \p VF. 1353 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1354 return VF.isVector() && MinBWs.contains(I) && 1355 !isProfitableToScalarize(I, VF) && 1356 !isScalarAfterVectorization(I, VF); 1357 } 1358 1359 /// Decision that was taken during cost calculation for a memory instruction. 1360 enum InstWidening { 1361 CM_Unknown, 1362 CM_Widen, // For consecutive accesses with stride +1. 1363 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1364 CM_Interleave, 1365 CM_GatherScatter, 1366 CM_Scalarize, 1367 CM_VectorCall, 1368 CM_IntrinsicCall 1369 }; 1370 1371 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1372 /// instruction \p I and vector width \p VF. 1373 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1374 InstructionCost Cost) { 1375 assert(VF.isVector() && "Expected VF >=2"); 1376 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1377 } 1378 1379 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1380 /// interleaving group \p Grp and vector width \p VF. 1381 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1382 ElementCount VF, InstWidening W, 1383 InstructionCost Cost) { 1384 assert(VF.isVector() && "Expected VF >=2"); 1385 // Broadcast this decision to all instructions inside the group. 1386 // But the cost will be assigned to one instruction only. 1387 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1388 if (auto *I = Grp->getMember(i)) { 1389 if (Grp->getInsertPos() == I) 1390 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1391 else 1392 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1393 } 1394 } 1395 } 1396 1397 /// Return the cost model decision for the given instruction \p I and vector 1398 /// width \p VF. Return CM_Unknown if this instruction did not pass 1399 /// through the cost modeling. 1400 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1401 assert(VF.isVector() && "Expected VF to be a vector VF"); 1402 // Cost model is not run in the VPlan-native path - return conservative 1403 // result until this changes. 1404 if (EnableVPlanNativePath) 1405 return CM_GatherScatter; 1406 1407 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1408 auto Itr = WideningDecisions.find(InstOnVF); 1409 if (Itr == WideningDecisions.end()) 1410 return CM_Unknown; 1411 return Itr->second.first; 1412 } 1413 1414 /// Return the vectorization cost for the given instruction \p I and vector 1415 /// width \p VF.
1416 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1417 assert(VF.isVector() && "Expected VF >=2"); 1418 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1419 assert(WideningDecisions.contains(InstOnVF) && 1420 "The cost is not calculated"); 1421 return WideningDecisions[InstOnVF].second; 1422 } 1423 1424 struct CallWideningDecision { 1425 InstWidening Kind; 1426 Function *Variant; 1427 Intrinsic::ID IID; 1428 std::optional<unsigned> MaskPos; 1429 InstructionCost Cost; 1430 }; 1431 1432 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, 1433 Function *Variant, Intrinsic::ID IID, 1434 std::optional<unsigned> MaskPos, 1435 InstructionCost Cost) { 1436 assert(!VF.isScalar() && "Expected vector VF"); 1437 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID, 1438 MaskPos, Cost}; 1439 } 1440 1441 CallWideningDecision getCallWideningDecision(CallInst *CI, 1442 ElementCount VF) const { 1443 assert(!VF.isScalar() && "Expected vector VF"); 1444 return CallWideningDecisions.at(std::make_pair(CI, VF)); 1445 } 1446 1447 /// Return True if instruction \p I is an optimizable truncate whose operand 1448 /// is an induction variable. Such a truncate will be removed by adding a new 1449 /// induction variable with the destination type. 1450 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1451 // If the instruction is not a truncate, return false. 1452 auto *Trunc = dyn_cast<TruncInst>(I); 1453 if (!Trunc) 1454 return false; 1455 1456 // Get the source and destination types of the truncate. 1457 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1458 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1459 1460 // If the truncate is free for the given types, return false. Replacing a 1461 // free truncate with an induction variable would add an induction variable 1462 // update instruction to each iteration of the loop. We exclude from this 1463 // check the primary induction variable since it will need an update 1464 // instruction regardless. 1465 Value *Op = Trunc->getOperand(0); 1466 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1467 return false; 1468 1469 // If the truncated value is not an induction variable, return false. 1470 return Legal->isInductionPhi(Op); 1471 } 1472 1473 /// Collects the instructions to scalarize for each predicated instruction in 1474 /// the loop. 1475 void collectInstsToScalarize(ElementCount VF); 1476 1477 /// Collect Uniform and Scalar values for the given \p VF. 1478 /// The sets depend on CM decision for Load/Store instructions 1479 /// that may be vectorized as interleave, gather-scatter or scalarized. 1480 /// Also make a decision on what to do about call instructions in the loop 1481 /// at that VF -- scalarize, call a known vector routine, or call a 1482 /// vector intrinsic. 1483 void collectUniformsAndScalars(ElementCount VF) { 1484 // Do the analysis once. 1485 if (VF.isScalar() || Uniforms.contains(VF)) 1486 return; 1487 setCostBasedWideningDecision(VF); 1488 setVectorizedCallDecision(VF); 1489 collectLoopUniforms(VF); 1490 collectLoopScalars(VF); 1491 } 1492 1493 /// Returns true if the target machine supports masked store operation 1494 /// for the given \p DataType and kind of access to \p Ptr. 
1495 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1496 return Legal->isConsecutivePtr(DataType, Ptr) && 1497 TTI.isLegalMaskedStore(DataType, Alignment); 1498 } 1499 1500 /// Returns true if the target machine supports masked load operation 1501 /// for the given \p DataType and kind of access to \p Ptr. 1502 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1503 return Legal->isConsecutivePtr(DataType, Ptr) && 1504 TTI.isLegalMaskedLoad(DataType, Alignment); 1505 } 1506 1507 /// Returns true if the target machine can represent \p V as a masked gather 1508 /// or scatter operation. 1509 bool isLegalGatherOrScatter(Value *V, ElementCount VF) { 1510 bool LI = isa<LoadInst>(V); 1511 bool SI = isa<StoreInst>(V); 1512 if (!LI && !SI) 1513 return false; 1514 auto *Ty = getLoadStoreType(V); 1515 Align Align = getLoadStoreAlignment(V); 1516 if (VF.isVector()) 1517 Ty = VectorType::get(Ty, VF); 1518 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1519 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1520 } 1521 1522 /// Returns true if the target machine supports all of the reduction 1523 /// variables found for the given VF. 1524 bool canVectorizeReductions(ElementCount VF) const { 1525 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1526 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1527 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1528 })); 1529 } 1530 1531 /// Given costs for both strategies, return true if the scalar predication 1532 /// lowering should be used for div/rem. This incorporates an override 1533 /// option so it is not simply a cost comparison. 1534 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1535 InstructionCost SafeDivisorCost) const { 1536 switch (ForceSafeDivisor) { 1537 case cl::BOU_UNSET: 1538 return ScalarCost < SafeDivisorCost; 1539 case cl::BOU_TRUE: 1540 return false; 1541 case cl::BOU_FALSE: 1542 return true; 1543 }; 1544 llvm_unreachable("impossible case value"); 1545 } 1546 1547 /// Returns true if \p I is an instruction which requires predication and 1548 /// for which our chosen predication strategy is scalarization (i.e. we 1549 /// don't have an alternate strategy such as masking available). 1550 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1551 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1552 1553 /// Returns true if \p I is an instruction that needs to be predicated 1554 /// at runtime. The result is independent of the predication mechanism. 1555 /// Superset of instructions that return true for isScalarWithPredication. 1556 bool isPredicatedInst(Instruction *I) const; 1557 1558 /// Return the costs for our two available strategies for lowering a 1559 /// div/rem operation which requires speculating at least one lane. 1560 /// First result is for scalarization (will be invalid for scalable 1561 /// vectors); second is for the safe-divisor strategy. 1562 std::pair<InstructionCost, InstructionCost> 1563 getDivRemSpeculationCost(Instruction *I, 1564 ElementCount VF) const; 1565 1566 /// Returns true if \p I is a memory instruction with consecutive memory 1567 /// access that can be widened. 1568 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1569 1570 /// Returns true if \p I is a memory instruction in an interleaved-group 1571 /// of memory accesses that can be vectorized with wide vector loads/stores 1572 /// and shuffles. 
1573 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); 1574 1575 /// Check if \p Instr belongs to any interleaved access group. 1576 bool isAccessInterleaved(Instruction *Instr) { 1577 return InterleaveInfo.isInterleaved(Instr); 1578 } 1579 1580 /// Get the interleaved access group that \p Instr belongs to. 1581 const InterleaveGroup<Instruction> * 1582 getInterleavedAccessGroup(Instruction *Instr) { 1583 return InterleaveInfo.getInterleaveGroup(Instr); 1584 } 1585 1586 /// Returns true if we're required to use a scalar epilogue for at least 1587 /// the final iteration of the original loop. 1588 bool requiresScalarEpilogue(bool IsVectorizing) const { 1589 if (!isScalarEpilogueAllowed()) 1590 return false; 1591 // If we might exit from anywhere but the latch, we must run the exiting 1592 // iteration in scalar form. 1593 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1594 return true; 1595 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); 1596 } 1597 1598 /// Returns true if we're required to use a scalar epilogue for at least 1599 /// the final iteration of the original loop for all VFs in \p Range. 1600 /// A scalar epilogue must either be required for all VFs in \p Range or for 1601 /// none. 1602 bool requiresScalarEpilogue(VFRange Range) const { 1603 auto RequiresScalarEpilogue = [this](ElementCount VF) { 1604 return requiresScalarEpilogue(VF.isVector()); 1605 }; 1606 bool IsRequired = all_of(Range, RequiresScalarEpilogue); 1607 assert( 1608 (IsRequired || none_of(Range, RequiresScalarEpilogue)) && 1609 "all VFs in range must agree on whether a scalar epilogue is required"); 1610 return IsRequired; 1611 } 1612 1613 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1614 /// loop hint annotation. 1615 bool isScalarEpilogueAllowed() const { 1616 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1617 } 1618 1619 /// Returns the TailFoldingStyle that is best for the current loop. 1620 TailFoldingStyle 1621 getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { 1622 if (!CanFoldTailByMasking) 1623 return TailFoldingStyle::None; 1624 1625 if (ForceTailFoldingStyle.getNumOccurrences()) 1626 return ForceTailFoldingStyle; 1627 1628 return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); 1629 } 1630 1631 /// Returns true if all loop blocks should be masked to fold the loop tail. 1632 bool foldTailByMasking() const { 1633 return getTailFoldingStyle() != TailFoldingStyle::None; 1634 } 1635 1636 /// Returns true if the instructions in block \p BB require predication 1637 /// for any reason, e.g. because tail folding now requires a predicate 1638 /// or because the block in the original loop was predicated. 1639 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1640 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1641 } 1642 1643 /// Returns true if the Phi is part of an inloop reduction. 1644 bool isInLoopReduction(PHINode *Phi) const { 1645 return InLoopReductions.contains(Phi); 1646 } 1647 1648 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1649 /// with factor VF. Return the cost of the instruction, including 1650 /// scalarization overhead if it's needed. 1651 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1652 1653 /// Estimate cost of a call instruction CI if it were vectorized with factor 1654 /// VF. Return the cost of the instruction, including scalarization overhead 1655 /// if it's needed.
1656 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; 1657 1658 /// Invalidates decisions already taken by the cost model. 1659 void invalidateCostModelingDecisions() { 1660 WideningDecisions.clear(); 1661 CallWideningDecisions.clear(); 1662 Uniforms.clear(); 1663 Scalars.clear(); 1664 } 1665 1666 /// The vectorization cost is a combination of the cost itself and a boolean 1667 /// indicating whether any of the contributing operations will actually 1668 /// operate on vector values after type legalization in the backend. If this 1669 /// latter value is false, then all operations will be scalarized (i.e. no 1670 /// vectorization has actually taken place). 1671 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1672 1673 /// Returns the expected execution cost. The unit of the cost does 1674 /// not matter because we use the 'cost' units to compare different 1675 /// vector widths. The cost that is returned is *not* normalized by 1676 /// the factor width. If \p Invalid is not nullptr, this function 1677 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1678 /// each instruction that has an Invalid cost for the given VF. 1679 VectorizationCostTy 1680 expectedCost(ElementCount VF, 1681 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1682 1683 bool hasPredStores() const { return NumPredStores > 0; } 1684 1685 /// Returns true if epilogue vectorization is considered profitable, and 1686 /// false otherwise. 1687 /// \p VF is the vectorization factor chosen for the original loop. 1688 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1689 1690 private: 1691 unsigned NumPredStores = 0; 1692 1693 /// \return An upper bound for the vectorization factors for both 1694 /// fixed and scalable vectorization, where the minimum-known number of 1695 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1696 /// disabled or unsupported, then the scalable part will be equal to 1697 /// ElementCount::getScalable(0). 1698 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, 1699 ElementCount UserVF, 1700 bool FoldTailByMasking); 1701 1702 /// \return the maximized element count based on the targets vector 1703 /// registers and the loop trip-count, but limited to a maximum safe VF. 1704 /// This is a helper function of computeFeasibleMaxVF. 1705 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, 1706 unsigned SmallestType, 1707 unsigned WidestType, 1708 ElementCount MaxSafeVF, 1709 bool FoldTailByMasking); 1710 1711 /// \return the maximum legal scalable VF, based on the safe max number 1712 /// of elements. 1713 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1714 1715 /// Returns the execution time cost of an instruction for a given vector 1716 /// width. Vector width of one means scalar. 1717 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1718 1719 /// The cost-computation logic from getInstructionCost which provides 1720 /// the vector type as an output parameter. 1721 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1722 Type *&VectorTy); 1723 1724 /// Return the cost of instructions in an inloop reduction pattern, if I is 1725 /// part of that pattern. 1726 std::optional<InstructionCost> 1727 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1728 TTI::TargetCostKind CostKind) const; 1729 1730 /// Calculate vectorization cost of memory instruction \p I. 
1731 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1732 1733 /// The cost computation for a scalarized memory instruction. 1734 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1735 1736 /// The cost computation for an interleaving group of memory instructions. 1737 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1738 1739 /// The cost computation for a Gather/Scatter instruction. 1740 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1741 1742 /// The cost computation for widening instruction \p I with consecutive 1743 /// memory access. 1744 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1745 1746 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1747 /// Load: scalar load + broadcast. 1748 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1749 /// element) 1750 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1751 1752 /// Estimate the overhead of scalarizing an instruction. This is a 1753 /// convenience wrapper for the type-based getScalarizationOverhead API. 1754 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, 1755 TTI::TargetCostKind CostKind) const; 1756 1757 /// Returns true if an artificially high cost for emulated masked memrefs 1758 /// should be used. 1759 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1760 1761 /// Map of scalar integer values to the smallest bitwidth they can be legally 1762 /// represented as. The vector equivalents of these values should be truncated 1763 /// to this type. 1764 MapVector<Instruction *, uint64_t> MinBWs; 1765 1766 /// A type representing the costs for instructions if they were to be 1767 /// scalarized rather than vectorized. The entries are Instruction-Cost 1768 /// pairs. 1769 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1770 1771 /// A map from VF to the BasicBlocks that are known to be present after 1772 /// vectorization as predicated blocks. 1773 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> 1774 PredicatedBBsAfterVectorization; 1775 1776 /// Records whether it is allowed to have the original scalar loop execute at 1777 /// least once. This may be needed as a fallback loop in case runtime 1778 /// aliasing/dependence checks fail, or to handle the tail/remainder 1779 /// iterations when the trip count is unknown or isn't evenly divisible by the VF, 1780 /// or as a peel-loop to handle gaps in interleave-groups. 1781 /// Under optsize and when the trip count is very small we don't allow any 1782 /// iterations to execute in the scalar loop. 1783 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1784 1785 /// All blocks of the loop are to be masked to fold the tail of scalar iterations. 1786 bool CanFoldTailByMasking = false; 1787 1788 /// A map holding scalar costs for different vectorization factors. The 1789 /// presence of a cost for an instruction in the mapping indicates that the 1790 /// instruction will be scalarized when vectorizing with the associated 1791 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1792 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1793 1794 /// Holds the instructions known to be uniform after vectorization. 1795 /// The data is collected per VF. 1796 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1797 1798 /// Holds the instructions known to be scalar after vectorization.
1799 /// The data is collected per VF. 1800 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1801 1802 /// Holds the instructions (address computations) that are forced to be 1803 /// scalarized. 1804 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1805 1806 /// PHINodes of the reductions that should be expanded in-loop. 1807 SmallPtrSet<PHINode *, 4> InLoopReductions; 1808 1809 /// A Map of inloop reduction operations and their immediate chain operand. 1810 /// FIXME: This can be removed once reductions can be costed correctly in 1811 /// VPlan. This was added to allow quick lookup of the inloop operations. 1812 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1813 1814 /// Returns the expected difference in cost from scalarizing the expression 1815 /// feeding a predicated instruction \p PredInst. The instructions to 1816 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1817 /// non-negative return value implies the expression will be scalarized. 1818 /// Currently, only single-use chains are considered for scalarization. 1819 InstructionCost computePredInstDiscount(Instruction *PredInst, 1820 ScalarCostsTy &ScalarCosts, 1821 ElementCount VF); 1822 1823 /// Collect the instructions that are uniform after vectorization. An 1824 /// instruction is uniform if we represent it with a single scalar value in 1825 /// the vectorized loop corresponding to each vector iteration. Examples of 1826 /// uniform instructions include pointer operands of consecutive or 1827 /// interleaved memory accesses. Note that although uniformity implies an 1828 /// instruction will be scalar, the reverse is not true. In general, a 1829 /// scalarized instruction will be represented by VF scalar values in the 1830 /// vectorized loop, each corresponding to an iteration of the original 1831 /// scalar loop. 1832 void collectLoopUniforms(ElementCount VF); 1833 1834 /// Collect the instructions that are scalar after vectorization. An 1835 /// instruction is scalar if it is known to be uniform or will be scalarized 1836 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1837 /// to the list if they are used by a load/store instruction that is marked as 1838 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1839 /// VF values in the vectorized loop, each corresponding to an iteration of 1840 /// the original scalar loop. 1841 void collectLoopScalars(ElementCount VF); 1842 1843 /// Keeps cost model vectorization decision and cost for instructions. 1844 /// Right now it is used for memory instructions only. 1845 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1846 std::pair<InstWidening, InstructionCost>>; 1847 1848 DecisionList WideningDecisions; 1849 1850 using CallDecisionList = 1851 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; 1852 1853 CallDecisionList CallWideningDecisions; 1854 1855 /// Returns true if \p V is expected to be vectorized and it needs to be 1856 /// extracted. 1857 bool needsExtract(Value *V, ElementCount VF) const { 1858 Instruction *I = dyn_cast<Instruction>(V); 1859 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1860 TheLoop->isLoopInvariant(I)) 1861 return false; 1862 1863 // Assume we can vectorize V (and hence we need extraction) if the 1864 // scalars are not computed yet. 
This can happen, because it is called 1865 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1866 // the scalars are collected. That should be a safe assumption in most 1867 // cases, because we check if the operands have vectorizable types 1868 // beforehand in LoopVectorizationLegality. 1869 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); 1870 }; 1871 1872 /// Returns a range containing only operands needing to be extracted. 1873 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1874 ElementCount VF) const { 1875 return SmallVector<Value *, 4>(make_filter_range( 1876 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1877 } 1878 1879 public: 1880 /// The loop that we evaluate. 1881 Loop *TheLoop; 1882 1883 /// Predicated scalar evolution analysis. 1884 PredicatedScalarEvolution &PSE; 1885 1886 /// Loop Info analysis. 1887 LoopInfo *LI; 1888 1889 /// Vectorization legality. 1890 LoopVectorizationLegality *Legal; 1891 1892 /// Vector target information. 1893 const TargetTransformInfo &TTI; 1894 1895 /// Target Library Info. 1896 const TargetLibraryInfo *TLI; 1897 1898 /// Demanded bits analysis. 1899 DemandedBits *DB; 1900 1901 /// Assumption cache. 1902 AssumptionCache *AC; 1903 1904 /// Interface to emit optimization remarks. 1905 OptimizationRemarkEmitter *ORE; 1906 1907 const Function *TheFunction; 1908 1909 /// Loop Vectorize Hint. 1910 const LoopVectorizeHints *Hints; 1911 1912 /// The interleave access information contains groups of interleaved accesses 1913 /// with the same stride and close to each other. 1914 InterleavedAccessInfo &InterleaveInfo; 1915 1916 /// Values to ignore in the cost model. 1917 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1918 1919 /// Values to ignore in the cost model when VF > 1. 1920 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1921 1922 /// All element types found in the loop. 1923 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1924 }; 1925 } // end namespace llvm 1926 1927 namespace { 1928 /// Helper struct to manage generating runtime checks for vectorization. 1929 /// 1930 /// The runtime checks are created up-front in temporary blocks to allow better 1931 /// estimating the cost and un-linked from the existing IR. After deciding to 1932 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1933 /// temporary blocks are completely removed. 1934 class GeneratedRTChecks { 1935 /// Basic block which contains the generated SCEV checks, if any. 1936 BasicBlock *SCEVCheckBlock = nullptr; 1937 1938 /// The value representing the result of the generated SCEV checks. If it is 1939 /// nullptr, either no SCEV checks have been generated or they have been used. 1940 Value *SCEVCheckCond = nullptr; 1941 1942 /// Basic block which contains the generated memory runtime checks, if any. 1943 BasicBlock *MemCheckBlock = nullptr; 1944 1945 /// The value representing the result of the generated memory runtime checks. 1946 /// If it is nullptr, either no memory runtime checks have been generated or 1947 /// they have been used. 
1948 Value *MemRuntimeCheckCond = nullptr; 1949 1950 DominatorTree *DT; 1951 LoopInfo *LI; 1952 TargetTransformInfo *TTI; 1953 1954 SCEVExpander SCEVExp; 1955 SCEVExpander MemCheckExp; 1956 1957 bool CostTooHigh = false; 1958 const bool AddBranchWeights; 1959 1960 public: 1961 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1962 TargetTransformInfo *TTI, const DataLayout &DL, 1963 bool AddBranchWeights) 1964 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), 1965 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} 1966 1967 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1968 /// accurately estimate the cost of the runtime checks. The blocks are 1969 /// un-linked from the IR and are added back during vector code generation. If 1970 /// there is no vector code generation, the check blocks are removed 1971 /// completely. 1972 void Create(Loop *L, const LoopAccessInfo &LAI, 1973 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1974 1975 // Hard cutoff to limit compile-time increase in case a very large number of 1976 // runtime checks needs to be generated. 1977 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1978 // profile info. 1979 CostTooHigh = 1980 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1981 if (CostTooHigh) 1982 return; 1983 1984 BasicBlock *LoopHeader = L->getHeader(); 1985 BasicBlock *Preheader = L->getLoopPreheader(); 1986 1987 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1988 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1989 // may be used by SCEVExpander. The blocks will be un-linked from their 1990 // predecessors and removed from LI & DT at the end of the function. 1991 if (!UnionPred.isAlwaysTrue()) { 1992 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1993 nullptr, "vector.scevcheck"); 1994 1995 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1996 &UnionPred, SCEVCheckBlock->getTerminator()); 1997 } 1998 1999 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2000 if (RtPtrChecking.Need) { 2001 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 2002 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2003 "vector.memcheck"); 2004 2005 auto DiffChecks = RtPtrChecking.getDiffChecks(); 2006 if (DiffChecks) { 2007 Value *RuntimeVF = nullptr; 2008 MemRuntimeCheckCond = addDiffRuntimeChecks( 2009 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 2010 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 2011 if (!RuntimeVF) 2012 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 2013 return RuntimeVF; 2014 }, 2015 IC); 2016 } else { 2017 MemRuntimeCheckCond = addRuntimeChecks( 2018 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), 2019 MemCheckExp, VectorizerParams::HoistRuntimeChecks); 2020 } 2021 assert(MemRuntimeCheckCond && 2022 "no RT checks generated although RtPtrChecking " 2023 "claimed checks are required"); 2024 } 2025 2026 if (!MemCheckBlock && !SCEVCheckBlock) 2027 return; 2028 2029 // Unhook the temporary blocks with the checks and update various places 2030 // accordingly.
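// After this, the check blocks have no predecessors and terminate in an
// 'unreachable' instruction; they are kept only so getCost() below can walk
// their instructions, and emitSCEVChecks()/emitMemRuntimeChecks() re-link
// them into the CFG if vector code is actually emitted.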
2031 if (SCEVCheckBlock) 2032 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2033 if (MemCheckBlock) 2034 MemCheckBlock->replaceAllUsesWith(Preheader); 2035 2036 if (SCEVCheckBlock) { 2037 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2038 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2039 Preheader->getTerminator()->eraseFromParent(); 2040 } 2041 if (MemCheckBlock) { 2042 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2043 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2044 Preheader->getTerminator()->eraseFromParent(); 2045 } 2046 2047 DT->changeImmediateDominator(LoopHeader, Preheader); 2048 if (MemCheckBlock) { 2049 DT->eraseNode(MemCheckBlock); 2050 LI->removeBlock(MemCheckBlock); 2051 } 2052 if (SCEVCheckBlock) { 2053 DT->eraseNode(SCEVCheckBlock); 2054 LI->removeBlock(SCEVCheckBlock); 2055 } 2056 } 2057 2058 InstructionCost getCost() { 2059 if (SCEVCheckBlock || MemCheckBlock) 2060 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 2061 2062 if (CostTooHigh) { 2063 InstructionCost Cost; 2064 Cost.setInvalid(); 2065 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 2066 return Cost; 2067 } 2068 2069 InstructionCost RTCheckCost = 0; 2070 if (SCEVCheckBlock) 2071 for (Instruction &I : *SCEVCheckBlock) { 2072 if (SCEVCheckBlock->getTerminator() == &I) 2073 continue; 2074 InstructionCost C = 2075 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2076 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2077 RTCheckCost += C; 2078 } 2079 if (MemCheckBlock) 2080 for (Instruction &I : *MemCheckBlock) { 2081 if (MemCheckBlock->getTerminator() == &I) 2082 continue; 2083 InstructionCost C = 2084 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2085 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2086 RTCheckCost += C; 2087 } 2088 2089 if (SCEVCheckBlock || MemCheckBlock) 2090 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2091 << "\n"); 2092 2093 return RTCheckCost; 2094 } 2095 2096 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2097 /// unused. 2098 ~GeneratedRTChecks() { 2099 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2100 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2101 if (!SCEVCheckCond) 2102 SCEVCleaner.markResultUsed(); 2103 2104 if (!MemRuntimeCheckCond) 2105 MemCheckCleaner.markResultUsed(); 2106 2107 if (MemRuntimeCheckCond) { 2108 auto &SE = *MemCheckExp.getSE(); 2109 // Memory runtime check generation creates compares that use expanded 2110 // values. Remove them before running the SCEVExpanderCleaners. 2111 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2112 if (MemCheckExp.isInsertedInstruction(&I)) 2113 continue; 2114 SE.forgetValue(&I); 2115 I.eraseFromParent(); 2116 } 2117 } 2118 MemCheckCleaner.cleanup(); 2119 SCEVCleaner.cleanup(); 2120 2121 if (SCEVCheckCond) 2122 SCEVCheckBlock->eraseFromParent(); 2123 if (MemRuntimeCheckCond) 2124 MemCheckBlock->eraseFromParent(); 2125 } 2126 2127 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2128 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2129 /// depending on the generated condition. 2130 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2131 BasicBlock *LoopVectorPreHeader, 2132 BasicBlock *LoopExitBlock) { 2133 if (!SCEVCheckCond) 2134 return nullptr; 2135 2136 Value *Cond = SCEVCheckCond; 2137 // Mark the check as used, to prevent it from being removed during cleanup. 
2138 SCEVCheckCond = nullptr; 2139 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2140 if (C->isZero()) 2141 return nullptr; 2142 2143 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2144 2145 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2146 // Create new preheader for vector loop. 2147 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2148 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2149 2150 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2151 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2152 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2153 SCEVCheckBlock); 2154 2155 DT->addNewBlock(SCEVCheckBlock, Pred); 2156 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2157 2158 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); 2159 if (AddBranchWeights) 2160 setBranchWeights(BI, SCEVCheckBypassWeights); 2161 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); 2162 return SCEVCheckBlock; 2163 } 2164 2165 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2166 /// the branches to branch to the vector preheader or \p Bypass, depending on 2167 /// the generated condition. 2168 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2169 BasicBlock *LoopVectorPreHeader) { 2170 // Check if we generated code that checks in runtime if arrays overlap. 2171 if (!MemRuntimeCheckCond) 2172 return nullptr; 2173 2174 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2175 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2176 MemCheckBlock); 2177 2178 DT->addNewBlock(MemCheckBlock, Pred); 2179 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2180 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2181 2182 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2183 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2184 2185 BranchInst &BI = 2186 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); 2187 if (AddBranchWeights) { 2188 setBranchWeights(BI, MemCheckBypassWeights); 2189 } 2190 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); 2191 MemCheckBlock->getTerminator()->setDebugLoc( 2192 Pred->getTerminator()->getDebugLoc()); 2193 2194 // Mark the check as used, to prevent it from being removed during cleanup. 2195 MemRuntimeCheckCond = nullptr; 2196 return MemCheckBlock; 2197 } 2198 }; 2199 } // namespace 2200 2201 static bool useActiveLaneMask(TailFoldingStyle Style) { 2202 return Style == TailFoldingStyle::Data || 2203 Style == TailFoldingStyle::DataAndControlFlow || 2204 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2205 } 2206 2207 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2208 return Style == TailFoldingStyle::DataAndControlFlow || 2209 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2210 } 2211 2212 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2213 // vectorization. The loop needs to be annotated with #pragma omp simd 2214 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2215 // vector length information is not provided, vectorization is not considered 2216 // explicit. Interleave hints are not allowed either. These limitations will be 2217 // relaxed in the future. 2218 // Please, note that we are currently forced to abuse the pragma 'clang 2219 // vectorize' semantics. 
This pragma provides *auto-vectorization hints* 2220 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2221 // provides *explicit vectorization hints* (LV can bypass legal checks and 2222 // assume that vectorization is legal). However, both hints are implemented 2223 // using the same metadata (llvm.loop.vectorize, processed by 2224 // LoopVectorizeHints). This will be fixed in the future when the native IR 2225 // representation for pragma 'omp simd' is introduced. 2226 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2227 OptimizationRemarkEmitter *ORE) { 2228 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2229 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2230 2231 // Only outer loops with an explicit vectorization hint are supported. 2232 // Unannotated outer loops are ignored. 2233 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2234 return false; 2235 2236 Function *Fn = OuterLp->getHeader()->getParent(); 2237 if (!Hints.allowVectorization(Fn, OuterLp, 2238 true /*VectorizeOnlyWhenForced*/)) { 2239 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2240 return false; 2241 } 2242 2243 if (Hints.getInterleave() > 1) { 2244 // TODO: Interleave support is future work. 2245 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2246 "outer loops.\n"); 2247 Hints.emitRemarkWithHints(); 2248 return false; 2249 } 2250 2251 return true; 2252 } 2253 2254 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2255 OptimizationRemarkEmitter *ORE, 2256 SmallVectorImpl<Loop *> &V) { 2257 // Collect inner loops and outer loops without irreducible control flow. For 2258 // now, only collect outer loops that have explicit vectorization hints. If we 2259 // are stress testing the VPlan H-CFG construction, we collect the outermost 2260 // loop of every loop nest. 2261 if (L.isInnermost() || VPlanBuildStressTest || 2262 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2263 LoopBlocksRPO RPOT(&L); 2264 RPOT.perform(LI); 2265 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2266 V.push_back(&L); 2267 // TODO: Collect inner loops inside marked outer loops in case 2268 // vectorization fails for the outer loop. Do not invoke 2269 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2270 // already known to be reducible. We can use an inherited attribute for 2271 // that. 2272 return; 2273 } 2274 } 2275 for (Loop *InnerL : L) 2276 collectSupportedLoops(*InnerL, LI, ORE, V); 2277 } 2278 2279 //===----------------------------------------------------------------------===// 2280 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2281 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2282 //===----------------------------------------------------------------------===// 2283 2284 /// Compute the transformed value of Index at offset StartValue using step 2285 /// StepValue. 2286 /// For integer induction, returns StartValue + Index * StepValue. 2287 /// For pointer induction, returns StartValue[Index * StepValue]. 2288 /// FIXME: The newly created binary instructions should contain nsw/nuw 2289 /// flags, which can be found from the original scalar operations. 
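/// As a worked example of the integer case above: with StartValue 8 and a
/// StepValue of 4, an Index of 3 is transformed to 8 + 3 * 4 = 20; a constant
/// StepValue of -1 instead takes the dedicated StartValue - Index fast path
/// below.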
2290 static Value * 2291 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, 2292 Value *Step, 2293 InductionDescriptor::InductionKind InductionKind, 2294 const BinaryOperator *InductionBinOp) { 2295 Type *StepTy = Step->getType(); 2296 Value *CastedIndex = StepTy->isIntegerTy() 2297 ? B.CreateSExtOrTrunc(Index, StepTy) 2298 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2299 if (CastedIndex != Index) { 2300 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2301 Index = CastedIndex; 2302 } 2303 2304 // Note: the IR at this point is broken. We cannot use SE to create any new 2305 // SCEV and then expand it, hoping that SCEV's simplification will give us 2306 // more optimal code. Unfortunately, attempting to do so on invalid IR may 2307 // lead to various SCEV crashes. So all we can do is use the builder and rely 2308 // on InstCombine for future simplifications. Here we handle only some trivial 2309 // cases. 2310 auto CreateAdd = [&B](Value *X, Value *Y) { 2311 assert(X->getType() == Y->getType() && "Types don't match!"); 2312 if (auto *CX = dyn_cast<ConstantInt>(X)) 2313 if (CX->isZero()) 2314 return Y; 2315 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2316 if (CY->isZero()) 2317 return X; 2318 return B.CreateAdd(X, Y); 2319 }; 2320 2321 // We allow X to be a vector type, in which case Y will potentially be 2322 // splatted into a vector with the same element count. 2323 auto CreateMul = [&B](Value *X, Value *Y) { 2324 assert(X->getType()->getScalarType() == Y->getType() && 2325 "Types don't match!"); 2326 if (auto *CX = dyn_cast<ConstantInt>(X)) 2327 if (CX->isOne()) 2328 return Y; 2329 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2330 if (CY->isOne()) 2331 return X; 2332 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2333 if (XVTy && !isa<VectorType>(Y->getType())) 2334 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2335 return B.CreateMul(X, Y); 2336 }; 2337 2338 switch (InductionKind) { 2339 case InductionDescriptor::IK_IntInduction: { 2340 assert(!isa<VectorType>(Index->getType()) && 2341 "Vector indices not supported for integer inductions yet"); 2342 assert(Index->getType() == StartValue->getType() && 2343 "Index type does not match StartValue type"); 2344 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2345 return B.CreateSub(StartValue, Index); 2346 auto *Offset = CreateMul(Index, Step); 2347 return CreateAdd(StartValue, Offset); 2348 } 2349 case InductionDescriptor::IK_PtrInduction: 2350 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step)); 2351 case InductionDescriptor::IK_FpInduction: { 2352 assert(!isa<VectorType>(Index->getType()) && 2353 "Vector indices not supported for FP inductions yet"); 2354 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2355 assert(InductionBinOp && 2356 (InductionBinOp->getOpcode() == Instruction::FAdd || 2357 InductionBinOp->getOpcode() == Instruction::FSub) && 2358 "Original bin op should be defined for FP induction"); 2359 2360 Value *MulExp = B.CreateFMul(Step, Index); 2361 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2362 "induction"); 2363 } 2364 case InductionDescriptor::IK_NoInduction: 2365 return nullptr; 2366 } 2367 llvm_unreachable("invalid enum"); 2368 } 2369 2370 std::optional<unsigned> getMaxVScale(const Function &F, 2371 const TargetTransformInfo &TTI) { 2372 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) 2373 return MaxVScale; 2374 2375 if (F.hasFnAttribute(Attribute::VScaleRange)) 2376 return
F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2377 2378 return std::nullopt; 2379 } 2380 2381 /// For the given VF and UF and maximum trip count computed for the loop, return 2382 /// whether the induction variable might overflow in the vectorized loop. If not, 2383 /// then we know a runtime overflow check always evaluates to false and can be 2384 /// removed. 2385 static bool isIndvarOverflowCheckKnownFalse( 2386 const LoopVectorizationCostModel *Cost, 2387 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2388 // Always be conservative if we don't know the exact unroll factor. 2389 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); 2390 2391 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2392 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2393 2394 // We know the runtime overflow check is known false iff the (max) trip-count 2395 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2396 // the vector loop induction variable. 2397 if (unsigned TC = 2398 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { 2399 uint64_t MaxVF = VF.getKnownMinValue(); 2400 if (VF.isScalable()) { 2401 std::optional<unsigned> MaxVScale = 2402 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2403 if (!MaxVScale) 2404 return false; 2405 MaxVF *= *MaxVScale; 2406 } 2407 2408 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2409 } 2410 2411 return false; 2412 } 2413 2414 // Return whether we allow using masked interleave-groups (for dealing with 2415 // strided loads/stores that reside in predicated blocks, or for dealing 2416 // with gaps). 2417 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2418 // If an override option has been passed in for interleaved accesses, use it. 2419 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2420 return EnableMaskedInterleavedMemAccesses; 2421 2422 return TTI.enableMaskedInterleavedAccessVectorization(); 2423 } 2424 2425 // Try to vectorize the interleave group that \p Instr belongs to. 2426 // 2427 // E.g. Translate following interleaved load group (factor = 3): 2428 // for (i = 0; i < N; i+=3) { 2429 // R = Pic[i]; // Member of index 0 2430 // G = Pic[i+1]; // Member of index 1 2431 // B = Pic[i+2]; // Member of index 2 2432 // ... // do something to R, G, B 2433 // } 2434 // To: 2435 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2436 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2437 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2438 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2439 // 2440 // Or translate following interleaved store group (factor = 3): 2441 // for (i = 0; i < N; i+=3) { 2442 // ... 
do something to R, G, B 2443 // Pic[i] = R; // Member of index 0 2444 // Pic[i+1] = G; // Member of index 1 2445 // Pic[i+2] = B; // Member of index 2 2446 // } 2447 // To: 2448 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2449 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2450 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2451 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2452 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2453 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2454 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2455 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2456 VPValue *BlockInMask, bool NeedsMaskForGaps) { 2457 Instruction *Instr = Group->getInsertPos(); 2458 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2459 2460 // Prepare for the vector type of the interleaved load/store. 2461 Type *ScalarTy = getLoadStoreType(Instr); 2462 unsigned InterleaveFactor = Group->getFactor(); 2463 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2464 2465 // Prepare for the new pointers. 2466 SmallVector<Value *, 2> AddrParts; 2467 unsigned Index = Group->getIndex(Instr); 2468 2469 // TODO: extend the masked interleaved-group support to reversed access. 2470 assert((!BlockInMask || !Group->isReverse()) && 2471 "Reversed masked interleave-group not supported."); 2472 2473 Value *Idx; 2474 // If the group is reverse, adjust the index to refer to the last vector lane 2475 // instead of the first. We adjust the index from the first vector lane, 2476 // rather than directly getting the pointer for lane VF - 1, because the 2477 // pointer operand of the interleaved access is supposed to be uniform. For 2478 // uniform instructions, we're only required to generate a value for the 2479 // first vector lane in each unroll iteration. 2480 if (Group->isReverse()) { 2481 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2482 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); 2483 Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); 2484 Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); 2485 Idx = Builder.CreateNeg(Idx); 2486 } else 2487 Idx = Builder.getInt32(-Index); 2488 2489 for (unsigned Part = 0; Part < UF; Part++) { 2490 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2491 if (auto *I = dyn_cast<Instruction>(AddrPart)) 2492 State.setDebugLocFrom(I->getDebugLoc()); 2493 2494 // Note that the current instruction could be at any member index. We need to 2495 // adjust the address to the member of index 0. 2496 // 2497 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2498 // b = A[i]; // Member of index 0 2499 // The current pointer points to A[i+1]; adjust it to A[i]. 2500 // 2501 // E.g. A[i+1] = a; // Member of index 1 2502 // A[i] = b; // Member of index 0 2503 // A[i+2] = c; // Member of index 2 (Current instruction) 2504 // The current pointer points to A[i+2]; adjust it to A[i].
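// Concretely, the adjustment is carried by Idx computed above: for a forward
// group Idx is simply -Index, stepping the pointer back to member 0 of the
// current tuple, while for a reversed group Idx additionally steps back
// (VF-1) full tuples so that the wide access below starts at the lowest
// address it covers.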
2505 2506 bool InBounds = false; 2507 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2508 InBounds = gep->isInBounds(); 2509 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); 2510 AddrParts.push_back(AddrPart); 2511 } 2512 2513 State.setDebugLocFrom(Instr->getDebugLoc()); 2514 Value *PoisonVec = PoisonValue::get(VecTy); 2515 2516 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( 2517 unsigned Part, Value *MaskForGaps) -> Value * { 2518 if (VF.isScalable()) { 2519 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2520 assert(InterleaveFactor == 2 && 2521 "Unsupported deinterleave factor for scalable vectors"); 2522 auto *BlockInMaskPart = State.get(BlockInMask, Part); 2523 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; 2524 auto *MaskTy = 2525 VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); 2526 return Builder.CreateIntrinsic( 2527 MaskTy, Intrinsic::experimental_vector_interleave2, Ops, 2528 /*FMFSource=*/nullptr, "interleaved.mask"); 2529 } 2530 2531 if (!BlockInMask) 2532 return MaskForGaps; 2533 2534 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2535 Value *ShuffledMask = Builder.CreateShuffleVector( 2536 BlockInMaskPart, 2537 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2538 "interleaved.mask"); 2539 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2540 MaskForGaps) 2541 : ShuffledMask; 2542 }; 2543 2544 // Vectorize the interleaved load group. 2545 if (isa<LoadInst>(Instr)) { 2546 Value *MaskForGaps = nullptr; 2547 if (NeedsMaskForGaps) { 2548 MaskForGaps = 2549 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2550 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2551 } 2552 2553 // For each unroll part, create a wide load for the group. 2554 SmallVector<Value *, 2> NewLoads; 2555 for (unsigned Part = 0; Part < UF; Part++) { 2556 Instruction *NewLoad; 2557 if (BlockInMask || MaskForGaps) { 2558 assert(useMaskedInterleavedAccesses(*TTI) && 2559 "masked interleaved groups are not allowed."); 2560 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2561 NewLoad = 2562 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2563 GroupMask, PoisonVec, "wide.masked.vec"); 2564 } 2565 else 2566 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2567 Group->getAlign(), "wide.vec"); 2568 Group->addMetadata(NewLoad); 2569 NewLoads.push_back(NewLoad); 2570 } 2571 2572 if (VecTy->isScalableTy()) { 2573 assert(InterleaveFactor == 2 && 2574 "Unsupported deinterleave factor for scalable vectors"); 2575 2576 for (unsigned Part = 0; Part < UF; ++Part) { 2577 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2578 // so must use intrinsics to deinterleave. 2579 Value *DI = Builder.CreateIntrinsic( 2580 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], 2581 /*FMFSource=*/nullptr, "strided.vec"); 2582 unsigned J = 0; 2583 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2584 Instruction *Member = Group->getMember(I); 2585 2586 if (!Member) 2587 continue; 2588 2589 Value *StridedVec = Builder.CreateExtractValue(DI, I); 2590 // If this member has different type, cast the result type. 
2591 if (Member->getType() != ScalarTy) { 2592 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2593 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2594 } 2595 2596 if (Group->isReverse()) 2597 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2598 2599 State.set(VPDefs[J], StridedVec, Part); 2600 ++J; 2601 } 2602 } 2603 2604 return; 2605 } 2606 2607 // For each member in the group, shuffle out the appropriate data from the 2608 // wide loads. 2609 unsigned J = 0; 2610 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2611 Instruction *Member = Group->getMember(I); 2612 2613 // Skip the gaps in the group. 2614 if (!Member) 2615 continue; 2616 2617 auto StrideMask = 2618 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2619 for (unsigned Part = 0; Part < UF; Part++) { 2620 Value *StridedVec = Builder.CreateShuffleVector( 2621 NewLoads[Part], StrideMask, "strided.vec"); 2622 2623 // If this member has different type, cast the result type. 2624 if (Member->getType() != ScalarTy) { 2625 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2626 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2627 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2628 } 2629 2630 if (Group->isReverse()) 2631 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2632 2633 State.set(VPDefs[J], StridedVec, Part); 2634 } 2635 ++J; 2636 } 2637 return; 2638 } 2639 2640 // The sub vector type for current instruction. 2641 auto *SubVT = VectorType::get(ScalarTy, VF); 2642 2643 // Vectorize the interleaved store group. 2644 Value *MaskForGaps = 2645 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2646 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2647 "masked interleaved groups are not allowed."); 2648 assert((!MaskForGaps || !VF.isScalable()) && 2649 "masking gaps for scalable vectors is not yet supported."); 2650 for (unsigned Part = 0; Part < UF; Part++) { 2651 // Collect the stored vector from each member. 2652 SmallVector<Value *, 4> StoredVecs; 2653 unsigned StoredIdx = 0; 2654 for (unsigned i = 0; i < InterleaveFactor; i++) { 2655 assert((Group->getMember(i) || MaskForGaps) && 2656 "Fail to get a member from an interleaved store group"); 2657 Instruction *Member = Group->getMember(i); 2658 2659 // Skip the gaps in the group. 2660 if (!Member) { 2661 Value *Undef = PoisonValue::get(SubVT); 2662 StoredVecs.push_back(Undef); 2663 continue; 2664 } 2665 2666 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2667 ++StoredIdx; 2668 2669 if (Group->isReverse()) 2670 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2671 2672 // If this member has different type, cast it to a unified type. 2673 2674 if (StoredVec->getType() != SubVT) 2675 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2676 2677 StoredVecs.push_back(StoredVec); 2678 } 2679 2680 // Interleave all the smaller vectors into one wider vector. 
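// For example, with factor 3 and a fixed VF of 4, the R, G and B sub-vectors
// of 4 elements each become a single <12 x Ty> value laid out as
// R0,G0,B0,R1,G1,B1,..., matching the interleaved store shown in the example
// at the top of this function.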
2681 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); 2682 Instruction *NewStoreInstr; 2683 if (BlockInMask || MaskForGaps) { 2684 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2685 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2686 Group->getAlign(), GroupMask); 2687 } else 2688 NewStoreInstr = 2689 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2690 2691 Group->addMetadata(NewStoreInstr); 2692 } 2693 } 2694 2695 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2696 VPReplicateRecipe *RepRecipe, 2697 const VPIteration &Instance, 2698 VPTransformState &State) { 2699 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2700 2701 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2702 // the first lane and part. 2703 if (isa<NoAliasScopeDeclInst>(Instr)) 2704 if (!Instance.isFirstIteration()) 2705 return; 2706 2707 // Does this instruction return a value? 2708 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2709 2710 Instruction *Cloned = Instr->clone(); 2711 if (!IsVoidRetTy) { 2712 Cloned->setName(Instr->getName() + ".cloned"); 2713 #if !defined(NDEBUG) 2714 // Verify that VPlan type inference results agree with the type of the 2715 // generated values. 2716 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && 2717 "inferred type and type from generated instructions do not match"); 2718 #endif 2719 } 2720 2721 RepRecipe->setFlags(Cloned); 2722 2723 if (auto DL = Instr->getDebugLoc()) 2724 State.setDebugLocFrom(DL); 2725 2726 // Replace the operands of the cloned instruction with their scalar 2727 // equivalents in the new loop. 2728 for (const auto &I : enumerate(RepRecipe->operands())) { 2729 auto InputInstance = Instance; 2730 VPValue *Operand = I.value(); 2731 if (vputils::isUniformAfterVectorization(Operand)) 2732 InputInstance.Lane = VPLane::getFirstLane(); 2733 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2734 } 2735 State.addNewMetadata(Cloned, Instr); 2736 2737 // Place the cloned scalar in the new loop. 2738 State.Builder.Insert(Cloned); 2739 2740 State.set(RepRecipe, Cloned, Instance); 2741 2742 // If we just cloned a new assumption, add it to the assumption cache. 2743 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2744 AC->registerAssumption(II); 2745 2746 // End if-block. 2747 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); 2748 if (IfPredicateInstr) 2749 PredicatedInstructions.push_back(Cloned); 2750 } 2751 2752 Value * 2753 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2754 if (VectorTripCount) 2755 return VectorTripCount; 2756 2757 Value *TC = getTripCount(); 2758 IRBuilder<> Builder(InsertBlock->getTerminator()); 2759 2760 Type *Ty = TC->getType(); 2761 // This is where we can make the step a runtime constant. 2762 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2763 2764 // If the tail is to be folded by masking, round the number of iterations N 2765 // up to a multiple of Step instead of rounding down. This is done by first 2766 // adding Step-1 and then rounding down. Note that it's ok if this addition 2767 // overflows: the vector induction variable will eventually wrap to zero given 2768 // that it starts at zero and its Step is a power of two; the loop will then 2769 // exit, with the last early-exit vector comparison also producing all-true.
2770 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2771 // is accounted for in emitIterationCountCheck that adds an overflow check. 2772 if (Cost->foldTailByMasking()) { 2773 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2774 "VF*UF must be a power of 2 when folding tail by masking"); 2775 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2776 TC = Builder.CreateAdd( 2777 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2778 } 2779 2780 // Now we need to generate the expression for the part of the loop that the 2781 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2782 // iterations are not required for correctness, or N - Step, otherwise. Step 2783 // is equal to the vectorization factor (number of SIMD elements) times the 2784 // unroll factor (number of SIMD instructions). 2785 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2786 2787 // There are cases where we *must* run at least one iteration in the remainder 2788 // loop. See the cost model for when this can happen. If the step evenly 2789 // divides the trip count, we set the remainder to be equal to the step. If 2790 // the step does not evenly divide the trip count, no adjustment is necessary 2791 // since there will already be scalar iterations. Note that the minimum 2792 // iterations check ensures that N >= Step. 2793 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2794 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2795 R = Builder.CreateSelect(IsZero, Step, R); 2796 } 2797 2798 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2799 2800 return VectorTripCount; 2801 } 2802 2803 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2804 const DataLayout &DL) { 2805 // Verify that V is a vector type with same number of elements as DstVTy. 2806 auto *DstFVTy = cast<VectorType>(DstVTy); 2807 auto VF = DstFVTy->getElementCount(); 2808 auto *SrcVecTy = cast<VectorType>(V->getType()); 2809 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2810 Type *SrcElemTy = SrcVecTy->getElementType(); 2811 Type *DstElemTy = DstFVTy->getElementType(); 2812 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2813 "Vector elements must have same size"); 2814 2815 // Do a direct cast if element types are castable. 2816 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2817 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2818 } 2819 // V cannot be directly casted to desired vector type. 2820 // May happen when V is a floating point vector but DstVTy is a vector of 2821 // pointers or vice-versa. Handle this using a two-step bitcast using an 2822 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2823 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2824 "Only one type should be a pointer type"); 2825 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2826 "Only one type should be a floating point type"); 2827 Type *IntTy = 2828 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2829 auto *VecIntTy = VectorType::get(IntTy, VF); 2830 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2831 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2832 } 2833 2834 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2835 Value *Count = getTripCount(); 2836 // Reuse existing vector loop preheader for TC checks. 
2837 // Note that new preheader block is generated for vector loop. 2838 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2839 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2840 2841 // Generate code to check if the loop's trip count is less than VF * UF, or 2842 // equal to it in case a scalar epilogue is required; this implies that the 2843 // vector trip count is zero. This check also covers the case where adding one 2844 // to the backedge-taken count overflowed leading to an incorrect trip count 2845 // of zero. In this case we will also jump to the scalar loop. 2846 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2847 : ICmpInst::ICMP_ULT; 2848 2849 // If tail is to be folded, vector loop takes care of all iterations. 2850 Type *CountTy = Count->getType(); 2851 Value *CheckMinIters = Builder.getFalse(); 2852 auto CreateStep = [&]() -> Value * { 2853 // Create step with max(MinProTripCount, UF * VF). 2854 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2855 return createStepForVF(Builder, CountTy, VF, UF); 2856 2857 Value *MinProfTC = 2858 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2859 if (!VF.isScalable()) 2860 return MinProfTC; 2861 return Builder.CreateBinaryIntrinsic( 2862 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2863 }; 2864 2865 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2866 if (Style == TailFoldingStyle::None) 2867 CheckMinIters = 2868 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2869 else if (VF.isScalable() && 2870 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2871 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2872 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2873 // an overflow to zero when updating induction variables and so an 2874 // additional overflow check is required before entering the vector loop. 2875 2876 // Get the maximum unsigned value for the type. 2877 Value *MaxUIntTripCount = 2878 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2879 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2880 2881 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2882 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2883 } 2884 2885 // Create new preheader for vector loop. 2886 LoopVectorPreHeader = 2887 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2888 "vector.ph"); 2889 2890 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2891 DT->getNode(Bypass)->getIDom()) && 2892 "TC check is expected to dominate Bypass"); 2893 2894 // Update dominator for Bypass & LoopExit (if needed). 2895 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2896 if (!Cost->requiresScalarEpilogue(VF.isVector())) 2897 // If there is an epilogue which must run, there's no edge from the 2898 // middle block to exit blocks and thus no need to update the immediate 2899 // dominator of the exit blocks. 
2900 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2901 2902 BranchInst &BI = 2903 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 2904 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 2905 setBranchWeights(BI, MinItersBypassWeights); 2906 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 2907 LoopBypassBlocks.push_back(TCCheckBlock); 2908 } 2909 2910 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2911 BasicBlock *const SCEVCheckBlock = 2912 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 2913 if (!SCEVCheckBlock) 2914 return nullptr; 2915 2916 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2917 (OptForSizeBasedOnProfile && 2918 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2919 "Cannot SCEV check stride or overflow when optimizing for size"); 2920 2921 2922 // Update dominator only if this is first RT check. 2923 if (LoopBypassBlocks.empty()) { 2924 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2925 if (!Cost->requiresScalarEpilogue(VF.isVector())) 2926 // If there is an epilogue which must run, there's no edge from the 2927 // middle block to exit blocks and thus no need to update the immediate 2928 // dominator of the exit blocks. 2929 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2930 } 2931 2932 LoopBypassBlocks.push_back(SCEVCheckBlock); 2933 AddedSafetyChecks = true; 2934 return SCEVCheckBlock; 2935 } 2936 2937 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2938 // VPlan-native path does not do any analysis for runtime checks currently. 2939 if (EnableVPlanNativePath) 2940 return nullptr; 2941 2942 BasicBlock *const MemCheckBlock = 2943 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2944 2945 // Check if we generated code that checks in runtime if arrays overlap. We put 2946 // the checks into a separate block to make the more common case of few 2947 // elements faster. 
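// As a rough sketch of what such a check looks like (simplified; the real
// checks come from LoopAccessAnalysis and account for access sizes and
// strides): for two pointers A and B accessed over the whole vectorized range,
// the vector loop is only entered if A.end <= B.start || B.end <= A.start;
// otherwise we fall back to the scalar loop.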
2948 if (!MemCheckBlock)
2949 return nullptr;
2950
2951 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2952 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2953 "Cannot emit memory checks when optimizing for size, unless forced "
2954 "to vectorize.");
2955 ORE->emit([&]() {
2956 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2957 OrigLoop->getStartLoc(),
2958 OrigLoop->getHeader())
2959 << "Code-size may be reduced by not forcing "
2960 "vectorization, or by source-code modifications "
2961 "eliminating the need for runtime checks "
2962 "(e.g., adding 'restrict').";
2963 });
2964 }
2965
2966 LoopBypassBlocks.push_back(MemCheckBlock);
2967
2968 AddedSafetyChecks = true;
2969
2970 return MemCheckBlock;
2971 }
2972
2973 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2974 LoopScalarBody = OrigLoop->getHeader();
2975 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2976 assert(LoopVectorPreHeader && "Invalid loop structure");
2977 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2978 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2979 "multiple exit loop without required epilogue?");
2980
2981 LoopMiddleBlock =
2982 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2983 LI, nullptr, Twine(Prefix) + "middle.block");
2984 LoopScalarPreHeader =
2985 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2986 nullptr, Twine(Prefix) + "scalar.ph");
2987
2988 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2989
2990 // Set up the middle block terminator. Two cases:
2991 // 1) If we know that we must execute the scalar epilogue, emit an
2992 // unconditional branch.
2993 // 2) Otherwise, we must have a single unique exit block (due to how we
2994 // implement the multiple exit case). In this case, set up a conditional
2995 // branch from the middle block to the loop scalar preheader, and the
2996 // exit block. completeLoopSkeleton will update the condition to use an
2997 // iteration check, if required to decide whether to execute the remainder.
2998 BranchInst *BrInst =
2999 Cost->requiresScalarEpilogue(VF.isVector())
3000 ? BranchInst::Create(LoopScalarPreHeader)
3001 : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3002 Builder.getTrue());
3003 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3004 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3005
3006 // Update dominator for loop exit. During skeleton creation, only the vector
3007 // pre-header and the middle block are created. The vector loop is entirely
3008 // created during VPlan execution.
3009 if (!Cost->requiresScalarEpilogue(VF.isVector()))
3010 // If there is an epilogue which must run, there's no edge from the
3011 // middle block to exit blocks and thus no need to update the immediate
3012 // dominator of the exit blocks.
3013 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3014 } 3015 3016 PHINode *InnerLoopVectorizer::createInductionResumeValue( 3017 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, 3018 ArrayRef<BasicBlock *> BypassBlocks, 3019 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3020 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3021 assert(VectorTripCount && "Expected valid arguments"); 3022 3023 Instruction *OldInduction = Legal->getPrimaryInduction(); 3024 Value *&EndValue = IVEndValues[OrigPhi]; 3025 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3026 if (OrigPhi == OldInduction) { 3027 // We know what the end value is. 3028 EndValue = VectorTripCount; 3029 } else { 3030 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3031 3032 // Fast-math-flags propagate from the original induction instruction. 3033 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3034 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3035 3036 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), 3037 Step, II.getKind(), II.getInductionBinOp()); 3038 EndValue->setName("ind.end"); 3039 3040 // Compute the end value for the additional bypass (if applicable). 3041 if (AdditionalBypass.first) { 3042 B.SetInsertPoint(AdditionalBypass.first, 3043 AdditionalBypass.first->getFirstInsertionPt()); 3044 EndValueFromAdditionalBypass = 3045 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(), 3046 Step, II.getKind(), II.getInductionBinOp()); 3047 EndValueFromAdditionalBypass->setName("ind.end"); 3048 } 3049 } 3050 3051 // Create phi nodes to merge from the backedge-taken check block. 3052 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3053 LoopScalarPreHeader->getTerminator()); 3054 // Copy original phi DL over to the new one. 3055 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3056 3057 // The new PHI merges the original incoming value, in case of a bypass, 3058 // or the value at the end of the vectorized loop. 3059 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3060 3061 // Fix the scalar body counter (PHI node). 3062 // The old induction's phi node in the scalar body needs the truncated 3063 // value. 3064 for (BasicBlock *BB : BypassBlocks) 3065 BCResumeVal->addIncoming(II.getStartValue(), BB); 3066 3067 if (AdditionalBypass.first) 3068 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3069 EndValueFromAdditionalBypass); 3070 return BCResumeVal; 3071 } 3072 3073 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 3074 /// expansion results. 3075 static Value *getExpandedStep(const InductionDescriptor &ID, 3076 const SCEV2ValueTy &ExpandedSCEVs) { 3077 const SCEV *Step = ID.getStep(); 3078 if (auto *C = dyn_cast<SCEVConstant>(Step)) 3079 return C->getValue(); 3080 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 3081 return U->getValue(); 3082 auto I = ExpandedSCEVs.find(Step); 3083 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 3084 return I->second; 3085 } 3086 3087 void InnerLoopVectorizer::createInductionResumeValues( 3088 const SCEV2ValueTy &ExpandedSCEVs, 3089 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3090 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3091 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3092 "Inconsistent information about additional bypass."); 3093 // We are going to resume the execution of the scalar loop. 
3094 // Go over all of the induction variables that we found and fix the 3095 // PHIs that are left in the scalar version of the loop. 3096 // The starting values of PHI nodes depend on the counter of the last 3097 // iteration in the vectorized loop. 3098 // If we come from a bypass edge then we need to start from the original 3099 // start value. 3100 for (const auto &InductionEntry : Legal->getInductionVars()) { 3101 PHINode *OrigPhi = InductionEntry.first; 3102 const InductionDescriptor &II = InductionEntry.second; 3103 PHINode *BCResumeVal = createInductionResumeValue( 3104 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, 3105 AdditionalBypass); 3106 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3107 } 3108 } 3109 3110 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { 3111 // The trip counts should be cached by now. 3112 Value *Count = getTripCount(); 3113 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3114 3115 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3116 3117 // Add a check in the middle block to see if we have completed 3118 // all of the iterations in the first vector loop. Three cases: 3119 // 1) If we require a scalar epilogue, there is no conditional branch as 3120 // we unconditionally branch to the scalar preheader. Do nothing. 3121 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3122 // Thus if tail is to be folded, we know we don't need to run the 3123 // remainder and we can use the previous value for the condition (true). 3124 // 3) Otherwise, construct a runtime check. 3125 if (!Cost->requiresScalarEpilogue(VF.isVector()) && 3126 !Cost->foldTailByMasking()) { 3127 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3128 // of the corresponding compare because they may have ended up with 3129 // different line numbers and we want to avoid awkward line stepping while 3130 // debugging. Eg. if the compare has got a line number inside the loop. 3131 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant 3132 // operands. Perform simplification directly on VPlan once the branch is 3133 // modeled there. 3134 IRBuilder<> B(LoopMiddleBlock->getTerminator()); 3135 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); 3136 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); 3137 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator()); 3138 BI.setCondition(CmpN); 3139 if (hasBranchWeightMD(*ScalarLatchTerm)) { 3140 // Assume that `Count % VectorTripCount` is equally distributed. 3141 unsigned TripCount = UF * VF.getKnownMinValue(); 3142 assert(TripCount > 0 && "trip count should not be zero"); 3143 const uint32_t Weights[] = {1, TripCount - 1}; 3144 setBranchWeights(BI, Weights); 3145 } 3146 } 3147 3148 #ifdef EXPENSIVE_CHECKS 3149 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3150 #endif 3151 3152 return LoopVectorPreHeader; 3153 } 3154 3155 std::pair<BasicBlock *, Value *> 3156 InnerLoopVectorizer::createVectorizedLoopSkeleton( 3157 const SCEV2ValueTy &ExpandedSCEVs) { 3158 /* 3159 In this function we generate a new loop. The new loop will contain 3160 the vectorized instructions while the old loop will continue to run the 3161 scalar remainder. 3162 3163 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 3164 / | preheader are expanded here. Eventually all required SCEV 3165 / | expansion should happen here. 
3166 / v 3167 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3168 | / | 3169 | / v 3170 || [ ] <-- vector pre header. 3171 |/ | 3172 | v 3173 | [ ] \ 3174 | [ ]_| <-- vector loop (created during VPlan execution). 3175 | | 3176 | v 3177 \ -[ ] <--- middle-block. 3178 \/ | 3179 /\ v 3180 | ->[ ] <--- new preheader. 3181 | | 3182 (opt) v <-- edge from middle to exit iff epilogue is not required. 3183 | [ ] \ 3184 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3185 \ | 3186 \ v 3187 >[ ] <-- exit block(s). 3188 ... 3189 */ 3190 3191 // Create an empty vector loop, and prepare basic blocks for the runtime 3192 // checks. 3193 createVectorLoopSkeleton(""); 3194 3195 // Now, compare the new count to zero. If it is zero skip the vector loop and 3196 // jump to the scalar loop. This check also covers the case where the 3197 // backedge-taken count is uint##_max: adding one to it will overflow leading 3198 // to an incorrect trip count of zero. In this (rare) case we will also jump 3199 // to the scalar loop. 3200 emitIterationCountCheck(LoopScalarPreHeader); 3201 3202 // Generate the code to check any assumptions that we've made for SCEV 3203 // expressions. 3204 emitSCEVChecks(LoopScalarPreHeader); 3205 3206 // Generate the code that checks in runtime if arrays overlap. We put the 3207 // checks into a separate block to make the more common case of few elements 3208 // faster. 3209 emitMemRuntimeChecks(LoopScalarPreHeader); 3210 3211 // Emit phis for the new starting index of the scalar loop. 3212 createInductionResumeValues(ExpandedSCEVs); 3213 3214 return {completeLoopSkeleton(), nullptr}; 3215 } 3216 3217 // Fix up external users of the induction variable. At this point, we are 3218 // in LCSSA form, with all external PHIs that use the IV having one input value, 3219 // coming from the remainder loop. We need those PHIs to also have a correct 3220 // value for the IV when arriving directly from the middle block. 3221 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3222 const InductionDescriptor &II, 3223 Value *VectorTripCount, Value *EndValue, 3224 BasicBlock *MiddleBlock, 3225 BasicBlock *VectorHeader, VPlan &Plan, 3226 VPTransformState &State) { 3227 // There are two kinds of external IV usages - those that use the value 3228 // computed in the last iteration (the PHI) and those that use the penultimate 3229 // value (the value that feeds into the phi from the loop latch). 3230 // We allow both, but they, obviously, have different values. 3231 3232 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3233 3234 DenseMap<Value *, Value *> MissingVals; 3235 3236 // An external user of the last iteration's value should see the value that 3237 // the remainder loop uses to initialize its own IV. 3238 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3239 for (User *U : PostInc->users()) { 3240 Instruction *UI = cast<Instruction>(U); 3241 if (!OrigLoop->contains(UI)) { 3242 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3243 MissingVals[UI] = EndValue; 3244 } 3245 } 3246 3247 // An external user of the penultimate value need to see EndValue - Step. 3248 // The simplest way to get this is to recompute it from the constituent SCEVs, 3249 // that is Start + (Step * (CRD - 1)). 
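// For example (illustrative only): for an induction starting at 0 with step 1
// and a vector trip count of 8, a user of the penultimate value should see
// 0 + 1 * (8 - 1) = 7, i.e. the value the phi held in the last vector
// iteration, while a user of the last value (the phi update) sees 8.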
3250 for (User *U : OrigPhi->users()) { 3251 auto *UI = cast<Instruction>(U); 3252 if (!OrigLoop->contains(UI)) { 3253 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3254 IRBuilder<> B(MiddleBlock->getTerminator()); 3255 3256 // Fast-math-flags propagate from the original induction instruction. 3257 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3258 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3259 3260 Value *CountMinusOne = B.CreateSub( 3261 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3262 CountMinusOne->setName("cmo"); 3263 3264 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); 3265 assert(StepVPV && "step must have been expanded during VPlan execution"); 3266 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() 3267 : State.get(StepVPV, {0, 0}); 3268 Value *Escape = 3269 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, 3270 II.getKind(), II.getInductionBinOp()); 3271 Escape->setName("ind.escape"); 3272 MissingVals[UI] = Escape; 3273 } 3274 } 3275 3276 for (auto &I : MissingVals) { 3277 PHINode *PHI = cast<PHINode>(I.first); 3278 // One corner case we have to handle is two IVs "chasing" each-other, 3279 // that is %IV2 = phi [...], [ %IV1, %latch ] 3280 // In this case, if IV1 has an external use, we need to avoid adding both 3281 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3282 // don't already have an incoming value for the middle block. 3283 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3284 PHI->addIncoming(I.second, MiddleBlock); 3285 Plan.removeLiveOut(PHI); 3286 } 3287 } 3288 } 3289 3290 namespace { 3291 3292 struct CSEDenseMapInfo { 3293 static bool canHandle(const Instruction *I) { 3294 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3295 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3296 } 3297 3298 static inline Instruction *getEmptyKey() { 3299 return DenseMapInfo<Instruction *>::getEmptyKey(); 3300 } 3301 3302 static inline Instruction *getTombstoneKey() { 3303 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3304 } 3305 3306 static unsigned getHashValue(const Instruction *I) { 3307 assert(canHandle(I) && "Unknown instruction!"); 3308 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3309 I->value_op_end())); 3310 } 3311 3312 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3313 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3314 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3315 return LHS == RHS; 3316 return LHS->isIdenticalTo(RHS); 3317 } 3318 }; 3319 3320 } // end anonymous namespace 3321 3322 ///Perform cse of induction variable instructions. 3323 static void cse(BasicBlock *BB) { 3324 // Perform simple cse. 3325 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3326 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3327 if (!CSEDenseMapInfo::canHandle(&In)) 3328 continue; 3329 3330 // Check if we can replace this instruction with any of the 3331 // visited instructions. 3332 if (Instruction *V = CSEMap.lookup(&In)) { 3333 In.replaceAllUsesWith(V); 3334 In.eraseFromParent(); 3335 continue; 3336 } 3337 3338 CSEMap[&In] = &In; 3339 } 3340 } 3341 3342 InstructionCost 3343 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3344 ElementCount VF) const { 3345 // We only need to calculate a cost if the VF is scalar; for actual vectors 3346 // we should already have a pre-calculated cost at each VF. 
3347 if (!VF.isScalar()) 3348 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; 3349 3350 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3351 Type *RetTy = CI->getType(); 3352 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 3353 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) 3354 return *RedCost; 3355 3356 SmallVector<Type *, 4> Tys; 3357 for (auto &ArgOp : CI->args()) 3358 Tys.push_back(ArgOp->getType()); 3359 3360 InstructionCost ScalarCallCost = 3361 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); 3362 3363 // If this is an intrinsic we may have a lower cost for it. 3364 if (getVectorIntrinsicIDForCall(CI, TLI)) { 3365 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 3366 return std::min(ScalarCallCost, IntrinsicCost); 3367 } 3368 return ScalarCallCost; 3369 } 3370 3371 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3372 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3373 return Elt; 3374 return VectorType::get(Elt, VF); 3375 } 3376 3377 InstructionCost 3378 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3379 ElementCount VF) const { 3380 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3381 assert(ID && "Expected intrinsic call!"); 3382 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3383 FastMathFlags FMF; 3384 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3385 FMF = FPMO->getFastMathFlags(); 3386 3387 SmallVector<const Value *> Arguments(CI->args()); 3388 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3389 SmallVector<Type *> ParamTys; 3390 std::transform(FTy->param_begin(), FTy->param_end(), 3391 std::back_inserter(ParamTys), 3392 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3393 3394 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3395 dyn_cast<IntrinsicInst>(CI)); 3396 return TTI.getIntrinsicInstrCost(CostAttrs, 3397 TargetTransformInfo::TCK_RecipThroughput); 3398 } 3399 3400 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3401 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3402 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3403 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3404 } 3405 3406 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3407 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3408 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3409 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3410 } 3411 3412 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3413 VPlan &Plan) { 3414 // Fix widened non-induction PHIs by setting up the PHI operands. 3415 if (EnableVPlanNativePath) 3416 fixNonInductionPHIs(Plan, State); 3417 3418 // At this point every instruction in the original loop is widened to a 3419 // vector form. Now we need to fix the recurrences in the loop. These PHI 3420 // nodes are currently empty because we did not want to introduce cycles. 3421 // This is the second stage of vectorizing recurrences. Note that fixing 3422 // reduction phis are already modeled in VPlan. 3423 // TODO: Also model fixing fixed-order recurrence phis in VPlan. 
3424 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3425 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3426 for (VPRecipeBase &R : HeaderVPBB->phis()) {
3427 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3428 fixFixedOrderRecurrence(FOR, State);
3429 }
3430
3431 // Forget the original basic block.
3432 PSE.getSE()->forgetLoop(OrigLoop);
3433 PSE.getSE()->forgetBlockAndLoopDispositions();
3434
3435 // After vectorization, the exit blocks of the original loop will have
3436 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3437 // looked through single-entry phis.
3438 SmallVector<BasicBlock *> ExitBlocks;
3439 OrigLoop->getExitBlocks(ExitBlocks);
3440 for (BasicBlock *Exit : ExitBlocks)
3441 for (PHINode &PN : Exit->phis())
3442 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
3443
3444 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3445 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3446 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3447 // No edge from the middle block to the unique exit block has been inserted
3448 // and there is nothing to fix from vector loop; phis should have incoming
3449 // from scalar loop only.
3450 } else {
3451 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3452 // the cost model.
3453
3454 // If we inserted an edge from the middle block to the unique exit block,
3455 // update uses outside the loop (phis) to account for the newly inserted
3456 // edge.
3457
3458 // Fix-up external users of the induction variables.
3459 for (const auto &Entry : Legal->getInductionVars())
3460 fixupIVUsers(Entry.first, Entry.second,
3461 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3462 IVEndValues[Entry.first], LoopMiddleBlock,
3463 VectorLoop->getHeader(), Plan, State);
3464 }
3465
3466 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3467 // in the exit block, so update the builder.
3468 State.Builder.SetInsertPoint(State.CFG.ExitBB,
3469 State.CFG.ExitBB->getFirstNonPHIIt());
3470 for (const auto &KV : Plan.getLiveOuts())
3471 KV.second->fixPhi(Plan, State);
3472
3473 for (Instruction *PI : PredicatedInstructions)
3474 sinkScalarOperands(&*PI);
3475
3476 // Remove redundant induction instructions.
3477 cse(VectorLoop->getHeader());
3478
3479 // Set/update profile weights for the vector and remainder loops as the
3480 // original loop iterations are now distributed among them. Note that the
3481 // original loop represented by LoopScalarBody becomes the remainder loop
3482 // after vectorization.
3483 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3484 // end up with a slightly less precise result, but that should be OK since
3485 // the profile is not inherently precise anyway. Note also that a possible
3486 // bypass of the vector code caused by legality checks is ignored, assigning
3487 // all the weight to the vector loop, optimistically.
3488 //
3489 // For scalable vectorization we can't know at compile time how many iterations
3490 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3491 // vscale of '1'.
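// As a rough illustration (assumed numbers): an original estimated trip count
// of 100 with VF * UF = 8 corresponds to roughly 12 vector iterations plus 4
// scalar remainder iterations, and the branch weights are scaled accordingly.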
3492 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3493 LI->getLoopFor(LoopScalarBody), 3494 VF.getKnownMinValue() * UF); 3495 } 3496 3497 void InnerLoopVectorizer::fixFixedOrderRecurrence( 3498 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3499 // This is the second phase of vectorizing first-order recurrences. An 3500 // overview of the transformation is described below. Suppose we have the 3501 // following loop. 3502 // 3503 // for (int i = 0; i < n; ++i) 3504 // b[i] = a[i] - a[i - 1]; 3505 // 3506 // There is a first-order recurrence on "a". For this loop, the shorthand 3507 // scalar IR looks like: 3508 // 3509 // scalar.ph: 3510 // s_init = a[-1] 3511 // br scalar.body 3512 // 3513 // scalar.body: 3514 // i = phi [0, scalar.ph], [i+1, scalar.body] 3515 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3516 // s2 = a[i] 3517 // b[i] = s2 - s1 3518 // br cond, scalar.body, ... 3519 // 3520 // In this example, s1 is a recurrence because it's value depends on the 3521 // previous iteration. In the first phase of vectorization, we created a 3522 // vector phi v1 for s1. We now complete the vectorization and produce the 3523 // shorthand vector IR shown below (for VF = 4, UF = 1). 3524 // 3525 // vector.ph: 3526 // v_init = vector(..., ..., ..., a[-1]) 3527 // br vector.body 3528 // 3529 // vector.body 3530 // i = phi [0, vector.ph], [i+4, vector.body] 3531 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3532 // v2 = a[i, i+1, i+2, i+3]; 3533 // v3 = vector(v1(3), v2(0, 1, 2)) 3534 // b[i, i+1, i+2, i+3] = v2 - v3 3535 // br cond, vector.body, middle.block 3536 // 3537 // middle.block: 3538 // x = v2(3) 3539 // br scalar.ph 3540 // 3541 // scalar.ph: 3542 // s_init = phi [x, middle.block], [a[-1], otherwise] 3543 // br scalar.body 3544 // 3545 // After execution completes the vector loop, we extract the next value of 3546 // the recurrence (x) to use as the initial value in the scalar loop. 3547 3548 // Extract the last vector element in the middle block. This will be the 3549 // initial value for the recurrence when jumping to the scalar loop. 3550 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3551 Value *Incoming = State.get(PreviousDef, UF - 1); 3552 auto *ExtractForScalar = Incoming; 3553 auto *IdxTy = Builder.getInt32Ty(); 3554 Value *RuntimeVF = nullptr; 3555 if (VF.isVector()) { 3556 auto *One = ConstantInt::get(IdxTy, 1); 3557 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3558 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3559 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3560 ExtractForScalar = 3561 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); 3562 } 3563 3564 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin()); 3565 assert(PhiR->getNumUsers() == 1 && 3566 RecurSplice->getOpcode() == 3567 VPInstruction::FirstOrderRecurrenceSplice && 3568 "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); 3569 SmallVector<VPLiveOut *> LiveOuts; 3570 for (VPUser *U : RecurSplice->users()) 3571 if (auto *LiveOut = dyn_cast<VPLiveOut>(U)) 3572 LiveOuts.push_back(LiveOut); 3573 3574 if (!LiveOuts.empty()) { 3575 // Extract the second last element in the middle block if the 3576 // Phi is used outside the loop. We need to extract the phi itself 3577 // and not the last element (the phi update in the current iteration). This 3578 // will be the value when jumping to the exit block from the 3579 // LoopMiddleBlock, when the scalar loop is not run at all. 
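// Illustrative sketch (VF = 4, UF = 1, symbolic values): if the recurrence's
// last wide update is <s1, s2, s3, s4>, the scalar loop is re-seeded with s4
// (the ExtractForScalar value computed above), whereas an out-of-loop user of
// the phi itself needs s3, the value the phi held in the final vector
// iteration.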
3580 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3581 if (VF.isVector()) { 3582 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3583 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3584 Incoming, Idx, "vector.recur.extract.for.phi"); 3585 } else { 3586 assert(UF > 1 && "VF and UF cannot both be 1"); 3587 // When loop is unrolled without vectorizing, initialize 3588 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled 3589 // value of `Incoming`. This is analogous to the vectorized case above: 3590 // extracting the second last element when VF > 1. 3591 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3592 } 3593 3594 for (VPLiveOut *LiveOut : LiveOuts) { 3595 assert(!Cost->requiresScalarEpilogue(VF.isVector())); 3596 PHINode *LCSSAPhi = LiveOut->getPhi(); 3597 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3598 State.Plan->removeLiveOut(LCSSAPhi); 3599 } 3600 } 3601 3602 // Fix the initial value of the original recurrence in the scalar loop. 3603 Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin()); 3604 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3605 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3606 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3607 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3608 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3609 Start->addIncoming(Incoming, BB); 3610 } 3611 3612 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3613 Phi->setName("scalar.recur"); 3614 } 3615 3616 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3617 // The basic block and loop containing the predicated instruction. 3618 auto *PredBB = PredInst->getParent(); 3619 auto *VectorLoop = LI->getLoopFor(PredBB); 3620 3621 // Initialize a worklist with the operands of the predicated instruction. 3622 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3623 3624 // Holds instructions that we need to analyze again. An instruction may be 3625 // reanalyzed if we don't yet know if we can sink it or not. 3626 SmallVector<Instruction *, 8> InstsToReanalyze; 3627 3628 // Returns true if a given use occurs in the predicated block. Phi nodes use 3629 // their operands in their corresponding predecessor blocks. 3630 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3631 auto *I = cast<Instruction>(U.getUser()); 3632 BasicBlock *BB = I->getParent(); 3633 if (auto *Phi = dyn_cast<PHINode>(I)) 3634 BB = Phi->getIncomingBlock( 3635 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3636 return BB == PredBB; 3637 }; 3638 3639 // Iteratively sink the scalarized operands of the predicated instruction 3640 // into the block we created for it. When an instruction is sunk, it's 3641 // operands are then added to the worklist. The algorithm ends after one pass 3642 // through the worklist doesn't sink a single instruction. 3643 bool Changed; 3644 do { 3645 // Add the instructions that need to be reanalyzed to the worklist, and 3646 // reset the changed indicator. 3647 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3648 InstsToReanalyze.clear(); 3649 Changed = false; 3650 3651 while (!Worklist.empty()) { 3652 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3653 3654 // We can't sink an instruction if it is a phi node, is not in the loop, 3655 // may have side effects or may read from memory. 
3656 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3657 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3658 I->mayHaveSideEffects() || I->mayReadFromMemory())
3659 continue;
3660
3661 // If the instruction is already in PredBB, check if we can sink its
3662 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3663 // sinking the scalar instruction I, hence it appears in PredBB; but it
3664 // may have failed to sink I's operands (recursively), which we try
3665 // (again) here.
3666 if (I->getParent() == PredBB) {
3667 Worklist.insert(I->op_begin(), I->op_end());
3668 continue;
3669 }
3670
3671 // It's legal to sink the instruction if all its uses occur in the
3672 // predicated block. Otherwise, there's nothing to do yet, and we may
3673 // need to reanalyze the instruction.
3674 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3675 InstsToReanalyze.push_back(I);
3676 continue;
3677 }
3678
3679 // Move the instruction to the beginning of the predicated block, and add
3680 // its operands to the worklist.
3681 I->moveBefore(&*PredBB->getFirstInsertionPt());
3682 Worklist.insert(I->op_begin(), I->op_end());
3683
3684 // The sinking may have enabled other instructions to be sunk, so we will
3685 // need to iterate.
3686 Changed = true;
3687 }
3688 } while (Changed);
3689 }
3690
3691 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3692 VPTransformState &State) {
3693 auto Iter = vp_depth_first_deep(Plan.getEntry());
3694 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3695 for (VPRecipeBase &P : VPBB->phis()) {
3696 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3697 if (!VPPhi)
3698 continue;
3699 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3700 // Make sure the builder has a valid insert point.
3701 Builder.SetInsertPoint(NewPhi);
3702 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3703 VPValue *Inc = VPPhi->getIncomingValue(i);
3704 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3705 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3706 }
3707 }
3708 }
3709 }
3710
3711 bool InnerLoopVectorizer::useOrderedReductions(
3712 const RecurrenceDescriptor &RdxDesc) {
3713 return Cost->useOrderedReductions(RdxDesc);
3714 }
3715
3716 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3717 // We should not collect Scalars more than once per VF. Right now, this
3718 // function is called from collectUniformsAndScalars(), which already does
3719 // this check. Collecting Scalars for VF=1 does not make any sense.
3720 assert(VF.isVector() && !Scalars.contains(VF) &&
3721 "This function should not be visited twice for the same VF");
3722
3723 // This avoids any chances of creating a REPLICATE recipe during planning
3724 // since that would result in generation of scalarized code during execution,
3725 // which is not supported for scalable vectors.
3726 if (VF.isScalable()) {
3727 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3728 return;
3729 }
3730
3731 SmallSetVector<Instruction *, 8> Worklist;
3732
3733 // These sets are used to seed the analysis with pointers used by memory
3734 // accesses that will remain scalar.
3735 SmallSetVector<Instruction *, 8> ScalarPtrs;
3736 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3737 auto *Latch = TheLoop->getLoopLatch();
3738
3739 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3740 // The pointer operands of loads and stores will be scalar as long as the
3741 // memory access is not a gather or scatter operation. The value operand of a
3742 // store will remain scalar if the store is scalarized.
3743 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3744 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3745 assert(WideningDecision != CM_Unknown &&
3746 "Widening decision should be ready at this moment");
3747 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3748 if (Ptr == Store->getValueOperand())
3749 return WideningDecision == CM_Scalarize;
3750 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3751 "Ptr is neither a value or pointer operand");
3752 return WideningDecision != CM_GatherScatter;
3753 };
3754
3755 // A helper that returns true if the given value is a bitcast or
3756 // getelementptr instruction contained in the loop.
3757 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3758 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3759 isa<GetElementPtrInst>(V)) &&
3760 !TheLoop->isLoopInvariant(V);
3761 };
3762
3763 // A helper that evaluates a memory access's use of a pointer. If the use will
3764 // be a scalar use and the pointer is only used by memory accesses, we place
3765 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3766 // PossibleNonScalarPtrs.
3767 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3768 // We only care about bitcast and getelementptr instructions contained in
3769 // the loop.
3770 if (!isLoopVaryingBitCastOrGEP(Ptr))
3771 return;
3772
3773 // If the pointer has already been identified as scalar (e.g., if it was
3774 // also identified as uniform), there's nothing to do.
3775 auto *I = cast<Instruction>(Ptr);
3776 if (Worklist.count(I))
3777 return;
3778
3779 // If the use of the pointer will be a scalar use, and all users of the
3780 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3781 // place the pointer in PossibleNonScalarPtrs.
3782 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3783 return isa<LoadInst>(U) || isa<StoreInst>(U);
3784 }))
3785 ScalarPtrs.insert(I);
3786 else
3787 PossibleNonScalarPtrs.insert(I);
3788 };
3789
3790 // We seed the scalars analysis with three classes of instructions: (1)
3791 // instructions marked uniform-after-vectorization, (2) bitcast,
3792 // getelementptr and (pointer) phi instructions used by memory accesses
3793 // requiring a scalar use, and (3) instructions forced to be scalar (see below).
3794 //
3795 // (1) Add to the worklist all instructions that have been identified as
3796 // uniform-after-vectorization.
3797 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3798
3799 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3800 // memory accesses requiring a scalar use. The pointer operands of loads and
3801 // stores will be scalar as long as the memory access is not a gather or
3802 // scatter operation. The value operand of a store will remain scalar if the
3803 // store is scalarized.
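// For example (illustrative): for a consecutive store "a[i] = x", the
// getelementptr computing &a[i] can remain scalar: a widened store needs only
// the address of the first lane, and only a gather/scatter would require a
// full vector of addresses.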
3804 for (auto *BB : TheLoop->blocks()) 3805 for (auto &I : *BB) { 3806 if (auto *Load = dyn_cast<LoadInst>(&I)) { 3807 evaluatePtrUse(Load, Load->getPointerOperand()); 3808 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 3809 evaluatePtrUse(Store, Store->getPointerOperand()); 3810 evaluatePtrUse(Store, Store->getValueOperand()); 3811 } 3812 } 3813 for (auto *I : ScalarPtrs) 3814 if (!PossibleNonScalarPtrs.count(I)) { 3815 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 3816 Worklist.insert(I); 3817 } 3818 3819 // Insert the forced scalars. 3820 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 3821 // induction variable when the PHI user is scalarized. 3822 auto ForcedScalar = ForcedScalars.find(VF); 3823 if (ForcedScalar != ForcedScalars.end()) 3824 for (auto *I : ForcedScalar->second) { 3825 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 3826 Worklist.insert(I); 3827 } 3828 3829 // Expand the worklist by looking through any bitcasts and getelementptr 3830 // instructions we've already identified as scalar. This is similar to the 3831 // expansion step in collectLoopUniforms(); however, here we're only 3832 // expanding to include additional bitcasts and getelementptr instructions. 3833 unsigned Idx = 0; 3834 while (Idx != Worklist.size()) { 3835 Instruction *Dst = Worklist[Idx++]; 3836 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 3837 continue; 3838 auto *Src = cast<Instruction>(Dst->getOperand(0)); 3839 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 3840 auto *J = cast<Instruction>(U); 3841 return !TheLoop->contains(J) || Worklist.count(J) || 3842 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 3843 isScalarUse(J, Src)); 3844 })) { 3845 Worklist.insert(Src); 3846 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 3847 } 3848 } 3849 3850 // An induction variable will remain scalar if all users of the induction 3851 // variable and induction variable update remain scalar. 3852 for (const auto &Induction : Legal->getInductionVars()) { 3853 auto *Ind = Induction.first; 3854 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3855 3856 // If tail-folding is applied, the primary induction variable will be used 3857 // to feed a vector compare. 3858 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 3859 continue; 3860 3861 // Returns true if \p Indvar is a pointer induction that is used directly by 3862 // load/store instruction \p I. 3863 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 3864 Instruction *I) { 3865 return Induction.second.getKind() == 3866 InductionDescriptor::IK_PtrInduction && 3867 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 3868 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 3869 }; 3870 3871 // Determine if all users of the induction variable are scalar after 3872 // vectorization. 3873 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 3874 auto *I = cast<Instruction>(U); 3875 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3876 IsDirectLoadStoreFromPtrIndvar(Ind, I); 3877 }); 3878 if (!ScalarInd) 3879 continue; 3880 3881 // Determine if all users of the induction variable update instruction are 3882 // scalar after vectorization. 
3883 auto ScalarIndUpdate =
3884 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3885 auto *I = cast<Instruction>(U);
3886 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3887 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3888 });
3889 if (!ScalarIndUpdate)
3890 continue;
3891
3892 // The induction variable and its update instruction will remain scalar.
3893 Worklist.insert(Ind);
3894 Worklist.insert(IndUpdate);
3895 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3896 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3897 << "\n");
3898 }
3899
3900 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3901 }
3902
3903 bool LoopVectorizationCostModel::isScalarWithPredication(
3904 Instruction *I, ElementCount VF) const {
3905 if (!isPredicatedInst(I))
3906 return false;
3907
3908 // Do we have a non-scalar lowering for this predicated
3909 // instruction? If not, it is scalar with predication.
3910 switch (I->getOpcode()) {
3911 default:
3912 return true;
3913 case Instruction::Call:
3914 if (VF.isScalar())
3915 return true;
3916 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3917 .Kind == CM_Scalarize;
3918 case Instruction::Load:
3919 case Instruction::Store: {
3920 auto *Ptr = getLoadStorePointerOperand(I);
3921 auto *Ty = getLoadStoreType(I);
3922 Type *VTy = Ty;
3923 if (VF.isVector())
3924 VTy = VectorType::get(Ty, VF);
3925 const Align Alignment = getLoadStoreAlignment(I);
3926 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3927 TTI.isLegalMaskedGather(VTy, Alignment))
3928 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3929 TTI.isLegalMaskedScatter(VTy, Alignment));
3930 }
3931 case Instruction::UDiv:
3932 case Instruction::SDiv:
3933 case Instruction::SRem:
3934 case Instruction::URem: {
3935 // We have the option to use the safe-divisor idiom to avoid predication.
3936 // The cost-based decision here will always select safe-divisor for
3937 // scalable vectors as scalarization isn't legal.
3938 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3939 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3940 }
3941 }
3942 }
3943
3944 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3945 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3946 return false;
3947
3948 // Can we prove this instruction is safe to unconditionally execute?
3949 // If not, we must use some form of predication.
3950 switch (I->getOpcode()) {
3951 default:
3952 return false;
3953 case Instruction::Load:
3954 case Instruction::Store: {
3955 if (!Legal->isMaskRequired(I))
3956 return false;
3957 // When we know the load's address is loop invariant and the instruction
3958 // in the original scalar loop was unconditionally executed then we
3959 // don't need to mark it as a predicated instruction. Tail folding may
3960 // introduce additional predication, but we're guaranteed to always have
3961 // at least one active lane. We call Legal->blockNeedsPredication here
3962 // because it doesn't query tail-folding. For stores, we need to prove
3963 // both speculation safety (which follows from the same argument as for
3964 // loads) and that the value being stored is correct. The easiest form of
3965 // the latter is to require that all values stored are the same.
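// For example (illustrative): a store "*p = 42" to a loop-invariant address
// that executed unconditionally in the original loop may become masked purely
// due to tail folding; since every active lane stores the same value to the
// same address, it does not need to be treated as predicated here.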
3966 if (Legal->isInvariant(getLoadStorePointerOperand(I)) && 3967 (isa<LoadInst>(I) || 3968 (isa<StoreInst>(I) && 3969 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && 3970 !Legal->blockNeedsPredication(I->getParent())) 3971 return false; 3972 return true; 3973 } 3974 case Instruction::UDiv: 3975 case Instruction::SDiv: 3976 case Instruction::SRem: 3977 case Instruction::URem: 3978 // TODO: We can use the loop-preheader as context point here and get 3979 // context sensitive reasoning 3980 return !isSafeToSpeculativelyExecute(I); 3981 case Instruction::Call: 3982 return Legal->isMaskRequired(I); 3983 } 3984 } 3985 3986 std::pair<InstructionCost, InstructionCost> 3987 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 3988 ElementCount VF) const { 3989 assert(I->getOpcode() == Instruction::UDiv || 3990 I->getOpcode() == Instruction::SDiv || 3991 I->getOpcode() == Instruction::SRem || 3992 I->getOpcode() == Instruction::URem); 3993 assert(!isSafeToSpeculativelyExecute(I)); 3994 3995 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3996 3997 // Scalarization isn't legal for scalable vector types 3998 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 3999 if (!VF.isScalable()) { 4000 // Get the scalarization cost and scale this amount by the probability of 4001 // executing the predicated block. If the instruction is not predicated, 4002 // we fall through to the next case. 4003 ScalarizationCost = 0; 4004 4005 // These instructions have a non-void type, so account for the phi nodes 4006 // that we will create. This cost is likely to be zero. The phi node 4007 // cost, if any, should be scaled by the block probability because it 4008 // models a copy at the end of each predicated block. 4009 ScalarizationCost += VF.getKnownMinValue() * 4010 TTI.getCFInstrCost(Instruction::PHI, CostKind); 4011 4012 // The cost of the non-predicated instruction. 4013 ScalarizationCost += VF.getKnownMinValue() * 4014 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 4015 4016 // The cost of insertelement and extractelement instructions needed for 4017 // scalarization. 4018 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); 4019 4020 // Scale the cost by the probability of executing the predicated blocks. 4021 // This assumes the predicated block for each vector lane is equally 4022 // likely. 4023 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 4024 } 4025 InstructionCost SafeDivisorCost = 0; 4026 4027 auto *VecTy = ToVectorTy(I->getType(), VF); 4028 4029 // The cost of the select guard to ensure all lanes are well defined 4030 // after we speculate above any internal control flow. 4031 SafeDivisorCost += TTI.getCmpSelInstrCost( 4032 Instruction::Select, VecTy, 4033 ToVectorTy(Type::getInt1Ty(I->getContext()), VF), 4034 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4035 4036 // Certain instructions can be cheaper to vectorize if they have a constant 4037 // second vector operand. One example of this are shifts on x86. 
4038 Value *Op2 = I->getOperand(1); 4039 auto Op2Info = TTI.getOperandInfo(Op2); 4040 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 4041 Legal->isInvariant(Op2)) 4042 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 4043 4044 SmallVector<const Value *, 4> Operands(I->operand_values()); 4045 SafeDivisorCost += TTI.getArithmeticInstrCost( 4046 I->getOpcode(), VecTy, CostKind, 4047 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 4048 Op2Info, Operands, I); 4049 return {ScalarizationCost, SafeDivisorCost}; 4050 } 4051 4052 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4053 Instruction *I, ElementCount VF) { 4054 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4055 assert(getWideningDecision(I, VF) == CM_Unknown && 4056 "Decision should not be set yet."); 4057 auto *Group = getInterleavedAccessGroup(I); 4058 assert(Group && "Must have a group."); 4059 4060 // If the instruction's allocated size doesn't equal it's type size, it 4061 // requires padding and will be scalarized. 4062 auto &DL = I->getModule()->getDataLayout(); 4063 auto *ScalarTy = getLoadStoreType(I); 4064 if (hasIrregularType(ScalarTy, DL)) 4065 return false; 4066 4067 // If the group involves a non-integral pointer, we may not be able to 4068 // losslessly cast all values to a common type. 4069 unsigned InterleaveFactor = Group->getFactor(); 4070 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4071 for (unsigned i = 0; i < InterleaveFactor; i++) { 4072 Instruction *Member = Group->getMember(i); 4073 if (!Member) 4074 continue; 4075 auto *MemberTy = getLoadStoreType(Member); 4076 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4077 // Don't coerce non-integral pointers to integers or vice versa. 4078 if (MemberNI != ScalarNI) { 4079 // TODO: Consider adding special nullptr value case here 4080 return false; 4081 } else if (MemberNI && ScalarNI && 4082 ScalarTy->getPointerAddressSpace() != 4083 MemberTy->getPointerAddressSpace()) { 4084 return false; 4085 } 4086 } 4087 4088 // Check if masking is required. 4089 // A Group may need masking for one of two reasons: it resides in a block that 4090 // needs predication, or it was decided to use masking to deal with gaps 4091 // (either a gap at the end of a load-access that may result in a speculative 4092 // load, or any gaps in a store-access). 4093 bool PredicatedAccessRequiresMasking = 4094 blockNeedsPredicationForAnyReason(I->getParent()) && 4095 Legal->isMaskRequired(I); 4096 bool LoadAccessWithGapsRequiresEpilogMasking = 4097 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4098 !isScalarEpilogueAllowed(); 4099 bool StoreAccessWithGapsRequiresMasking = 4100 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4101 if (!PredicatedAccessRequiresMasking && 4102 !LoadAccessWithGapsRequiresEpilogMasking && 4103 !StoreAccessWithGapsRequiresMasking) 4104 return true; 4105 4106 // If masked interleaving is required, we expect that the user/target had 4107 // enabled it, because otherwise it either wouldn't have been created or 4108 // it should have been invalidated by the CostModel. 4109 assert(useMaskedInterleavedAccesses(TTI) && 4110 "Masked interleave-groups for predicated accesses are not enabled."); 4111 4112 if (Group->isReverse()) 4113 return false; 4114 4115 auto *Ty = getLoadStoreType(I); 4116 const Align Alignment = getLoadStoreAlignment(I); 4117 return isa<LoadInst>(I) ? 
TTI.isLegalMaskedLoad(Ty, Alignment) 4118 : TTI.isLegalMaskedStore(Ty, Alignment); 4119 } 4120 4121 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4122 Instruction *I, ElementCount VF) { 4123 // Get and ensure we have a valid memory instruction. 4124 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4125 4126 auto *Ptr = getLoadStorePointerOperand(I); 4127 auto *ScalarTy = getLoadStoreType(I); 4128 4129 // In order to be widened, the pointer should be consecutive, first of all. 4130 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4131 return false; 4132 4133 // If the instruction is a store located in a predicated block, it will be 4134 // scalarized. 4135 if (isScalarWithPredication(I, VF)) 4136 return false; 4137 4138 // If the instruction's allocated size doesn't equal it's type size, it 4139 // requires padding and will be scalarized. 4140 auto &DL = I->getModule()->getDataLayout(); 4141 if (hasIrregularType(ScalarTy, DL)) 4142 return false; 4143 4144 return true; 4145 } 4146 4147 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4148 // We should not collect Uniforms more than once per VF. Right now, 4149 // this function is called from collectUniformsAndScalars(), which 4150 // already does this check. Collecting Uniforms for VF=1 does not make any 4151 // sense. 4152 4153 assert(VF.isVector() && !Uniforms.contains(VF) && 4154 "This function should not be visited twice for the same VF"); 4155 4156 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4157 // not analyze again. Uniforms.count(VF) will return 1. 4158 Uniforms[VF].clear(); 4159 4160 // We now know that the loop is vectorizable! 4161 // Collect instructions inside the loop that will remain uniform after 4162 // vectorization. 4163 4164 // Global values, params and instructions outside of current loop are out of 4165 // scope. 4166 auto isOutOfScope = [&](Value *V) -> bool { 4167 Instruction *I = dyn_cast<Instruction>(V); 4168 return (!I || !TheLoop->contains(I)); 4169 }; 4170 4171 // Worklist containing uniform instructions demanding lane 0. 4172 SetVector<Instruction *> Worklist; 4173 BasicBlock *Latch = TheLoop->getLoopLatch(); 4174 4175 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4176 // that are scalar with predication must not be considered uniform after 4177 // vectorization, because that would create an erroneous replicating region 4178 // where only a single instance out of VF should be formed. 4179 // TODO: optimize such seldom cases if found important, see PR40816. 4180 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4181 if (isOutOfScope(I)) { 4182 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4183 << *I << "\n"); 4184 return; 4185 } 4186 if (isScalarWithPredication(I, VF)) { 4187 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4188 << *I << "\n"); 4189 return; 4190 } 4191 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4192 Worklist.insert(I); 4193 }; 4194 4195 // Start with the conditional branch. If the branch condition is an 4196 // instruction contained in the loop that is only used by the branch, it is 4197 // uniform. 
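  // Illustrative example (assumed IR, not from the source): for a latch such as
  //   %exit = icmp eq i64 %iv.next, %n
  //   br i1 %exit, label %end, label %body
  // %exit is an in-loop instruction whose single user is the branch, so it is
  // seeded into the worklist as demanding only lane 0.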
4198 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4199 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4200 addToWorklistIfAllowed(Cmp); 4201 4202 auto PrevVF = VF.divideCoefficientBy(2); 4203 // Return true if all lanes perform the same memory operation, and we can 4204 // thus chose to execute only one. 4205 auto isUniformMemOpUse = [&](Instruction *I) { 4206 // If the value was already known to not be uniform for the previous 4207 // (smaller VF), it cannot be uniform for the larger VF. 4208 if (PrevVF.isVector()) { 4209 auto Iter = Uniforms.find(PrevVF); 4210 if (Iter != Uniforms.end() && !Iter->second.contains(I)) 4211 return false; 4212 } 4213 if (!Legal->isUniformMemOp(*I, VF)) 4214 return false; 4215 if (isa<LoadInst>(I)) 4216 // Loading the same address always produces the same result - at least 4217 // assuming aliasing and ordering which have already been checked. 4218 return true; 4219 // Storing the same value on every iteration. 4220 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 4221 }; 4222 4223 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4224 InstWidening WideningDecision = getWideningDecision(I, VF); 4225 assert(WideningDecision != CM_Unknown && 4226 "Widening decision should be ready at this moment"); 4227 4228 if (isUniformMemOpUse(I)) 4229 return true; 4230 4231 return (WideningDecision == CM_Widen || 4232 WideningDecision == CM_Widen_Reverse || 4233 WideningDecision == CM_Interleave); 4234 }; 4235 4236 // Returns true if Ptr is the pointer operand of a memory access instruction 4237 // I, I is known to not require scalarization, and the pointer is not also 4238 // stored. 4239 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4240 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) 4241 return false; 4242 return getLoadStorePointerOperand(I) == Ptr && 4243 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr)); 4244 }; 4245 4246 // Holds a list of values which are known to have at least one uniform use. 4247 // Note that there may be other uses which aren't uniform. A "uniform use" 4248 // here is something which only demands lane 0 of the unrolled iterations; 4249 // it does not imply that all lanes produce the same value (e.g. this is not 4250 // the usual meaning of uniform) 4251 SetVector<Value *> HasUniformUse; 4252 4253 // Scan the loop for instructions which are either a) known to have only 4254 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4255 for (auto *BB : TheLoop->blocks()) 4256 for (auto &I : *BB) { 4257 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4258 switch (II->getIntrinsicID()) { 4259 case Intrinsic::sideeffect: 4260 case Intrinsic::experimental_noalias_scope_decl: 4261 case Intrinsic::assume: 4262 case Intrinsic::lifetime_start: 4263 case Intrinsic::lifetime_end: 4264 if (TheLoop->hasLoopInvariantOperands(&I)) 4265 addToWorklistIfAllowed(&I); 4266 break; 4267 default: 4268 break; 4269 } 4270 } 4271 4272 // ExtractValue instructions must be uniform, because the operands are 4273 // known to be loop-invariant. 4274 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4275 assert(isOutOfScope(EVI->getAggregateOperand()) && 4276 "Expected aggregate value to be loop invariant"); 4277 addToWorklistIfAllowed(EVI); 4278 continue; 4279 } 4280 4281 // If there's no pointer operand, there's nothing to do. 
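      // Illustrative case (assumed IR): a load from a loop-invariant address,
      //   %g = load i32, ptr %base      ; %base does not vary with the loop
      // is a uniform memory op; only lane 0 of it is demanded, so it is added
      // to the worklist below and its pointer is noted as having a uniform use.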
4282 auto *Ptr = getLoadStorePointerOperand(&I); 4283 if (!Ptr) 4284 continue; 4285 4286 if (isUniformMemOpUse(&I)) 4287 addToWorklistIfAllowed(&I); 4288 4289 if (isVectorizedMemAccessUse(&I, Ptr)) 4290 HasUniformUse.insert(Ptr); 4291 } 4292 4293 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4294 // demanding) users. Since loops are assumed to be in LCSSA form, this 4295 // disallows uses outside the loop as well. 4296 for (auto *V : HasUniformUse) { 4297 if (isOutOfScope(V)) 4298 continue; 4299 auto *I = cast<Instruction>(V); 4300 auto UsersAreMemAccesses = 4301 llvm::all_of(I->users(), [&](User *U) -> bool { 4302 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4303 }); 4304 if (UsersAreMemAccesses) 4305 addToWorklistIfAllowed(I); 4306 } 4307 4308 // Expand Worklist in topological order: whenever a new instruction 4309 // is added , its users should be already inside Worklist. It ensures 4310 // a uniform instruction will only be used by uniform instructions. 4311 unsigned idx = 0; 4312 while (idx != Worklist.size()) { 4313 Instruction *I = Worklist[idx++]; 4314 4315 for (auto *OV : I->operand_values()) { 4316 // isOutOfScope operands cannot be uniform instructions. 4317 if (isOutOfScope(OV)) 4318 continue; 4319 // First order recurrence Phi's should typically be considered 4320 // non-uniform. 4321 auto *OP = dyn_cast<PHINode>(OV); 4322 if (OP && Legal->isFixedOrderRecurrence(OP)) 4323 continue; 4324 // If all the users of the operand are uniform, then add the 4325 // operand into the uniform worklist. 4326 auto *OI = cast<Instruction>(OV); 4327 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4328 auto *J = cast<Instruction>(U); 4329 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4330 })) 4331 addToWorklistIfAllowed(OI); 4332 } 4333 } 4334 4335 // For an instruction to be added into Worklist above, all its users inside 4336 // the loop should also be in Worklist. However, this condition cannot be 4337 // true for phi nodes that form a cyclic dependence. We must process phi 4338 // nodes separately. An induction variable will remain uniform if all users 4339 // of the induction variable and induction variable update remain uniform. 4340 // The code below handles both pointer and non-pointer induction variables. 4341 for (const auto &Induction : Legal->getInductionVars()) { 4342 auto *Ind = Induction.first; 4343 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4344 4345 // Determine if all users of the induction variable are uniform after 4346 // vectorization. 4347 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4348 auto *I = cast<Instruction>(U); 4349 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4350 isVectorizedMemAccessUse(I, Ind); 4351 }); 4352 if (!UniformInd) 4353 continue; 4354 4355 // Determine if all users of the induction variable update instruction are 4356 // uniform after vectorization. 4357 auto UniformIndUpdate = 4358 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4359 auto *I = cast<Instruction>(U); 4360 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4361 isVectorizedMemAccessUse(I, IndUpdate); 4362 }); 4363 if (!UniformIndUpdate) 4364 continue; 4365 4366 // The induction variable and its update instruction will remain uniform. 
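    // Illustrative case (assumed IR): for a canonical induction
    //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
    //   %iv.next = add nuw i64 %iv, 1
    // whose only other in-loop users are instructions already in the worklist
    // (say, the latch compare and an address computation feeding a widened
    // consecutive access), both %iv and %iv.next are kept uniform here.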
4367 addToWorklistIfAllowed(Ind); 4368 addToWorklistIfAllowed(IndUpdate); 4369 } 4370 4371 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4372 } 4373 4374 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4375 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4376 4377 if (Legal->getRuntimePointerChecking()->Need) { 4378 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4379 "runtime pointer checks needed. Enable vectorization of this " 4380 "loop with '#pragma clang loop vectorize(enable)' when " 4381 "compiling with -Os/-Oz", 4382 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4383 return true; 4384 } 4385 4386 if (!PSE.getPredicate().isAlwaysTrue()) { 4387 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4388 "runtime SCEV checks needed. Enable vectorization of this " 4389 "loop with '#pragma clang loop vectorize(enable)' when " 4390 "compiling with -Os/-Oz", 4391 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4392 return true; 4393 } 4394 4395 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4396 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4397 reportVectorizationFailure("Runtime stride check for small trip count", 4398 "runtime stride == 1 checks needed. Enable vectorization of " 4399 "this loop without such check by compiling with -Os/-Oz", 4400 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4401 return true; 4402 } 4403 4404 return false; 4405 } 4406 4407 ElementCount 4408 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4409 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4410 return ElementCount::getScalable(0); 4411 4412 if (Hints->isScalableVectorizationDisabled()) { 4413 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4414 "ScalableVectorizationDisabled", ORE, TheLoop); 4415 return ElementCount::getScalable(0); 4416 } 4417 4418 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4419 4420 auto MaxScalableVF = ElementCount::getScalable( 4421 std::numeric_limits<ElementCount::ScalarTy>::max()); 4422 4423 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4424 // FIXME: While for scalable vectors this is currently sufficient, this should 4425 // be replaced by a more detailed mechanism that filters out specific VFs, 4426 // instead of invalidating vectorization for a whole set of VFs based on the 4427 // MaxVF. 4428 4429 // Disable scalable vectorization if the loop contains unsupported reductions. 4430 if (!canVectorizeReductions(MaxScalableVF)) { 4431 reportVectorizationInfo( 4432 "Scalable vectorization not supported for the reduction " 4433 "operations found in this loop.", 4434 "ScalableVFUnfeasible", ORE, TheLoop); 4435 return ElementCount::getScalable(0); 4436 } 4437 4438 // Disable scalable vectorization if the loop contains any instructions 4439 // with element types not supported for scalable vectors. 4440 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4441 return !Ty->isVoidTy() && 4442 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4443 })) { 4444 reportVectorizationInfo("Scalable vectorization is not supported " 4445 "for all element types found in this loop.", 4446 "ScalableVFUnfeasible", ORE, TheLoop); 4447 return ElementCount::getScalable(0); 4448 } 4449 4450 if (Legal->isSafeForAnyVectorWidth()) 4451 return MaxScalableVF; 4452 4453 // Limit MaxScalableVF by the maximum safe dependence distance. 
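  // Illustrative numbers (assumption, not from the source): with
  // MaxSafeElements = 32 and a maximum vscale of 16, the clamp below yields
  // ElementCount::getScalable(32 / 16), i.e. vscale x 2, the widest scalable
  // VF that still respects the dependence distance.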
4454 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI)) 4455 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); 4456 else 4457 MaxScalableVF = ElementCount::getScalable(0); 4458 4459 if (!MaxScalableVF) 4460 reportVectorizationInfo( 4461 "Max legal vector width too small, scalable vectorization " 4462 "unfeasible.", 4463 "ScalableVFUnfeasible", ORE, TheLoop); 4464 4465 return MaxScalableVF; 4466 } 4467 4468 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4469 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4470 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4471 unsigned SmallestType, WidestType; 4472 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4473 4474 // Get the maximum safe dependence distance in bits computed by LAA. 4475 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4476 // the memory accesses that is most restrictive (involved in the smallest 4477 // dependence distance). 4478 unsigned MaxSafeElements = 4479 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4480 4481 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4482 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4483 4484 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4485 << ".\n"); 4486 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4487 << ".\n"); 4488 4489 // First analyze the UserVF, fall back if the UserVF should be ignored. 4490 if (UserVF) { 4491 auto MaxSafeUserVF = 4492 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4493 4494 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4495 // If `VF=vscale x N` is safe, then so is `VF=N` 4496 if (UserVF.isScalable()) 4497 return FixedScalableVFPair( 4498 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4499 else 4500 return UserVF; 4501 } 4502 4503 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4504 4505 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4506 // is better to ignore the hint and let the compiler choose a suitable VF. 4507 if (!UserVF.isScalable()) { 4508 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4509 << " is unsafe, clamping to max safe VF=" 4510 << MaxSafeFixedVF << ".\n"); 4511 ORE->emit([&]() { 4512 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4513 TheLoop->getStartLoc(), 4514 TheLoop->getHeader()) 4515 << "User-specified vectorization factor " 4516 << ore::NV("UserVectorizationFactor", UserVF) 4517 << " is unsafe, clamping to maximum safe vectorization factor " 4518 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4519 }); 4520 return MaxSafeFixedVF; 4521 } 4522 4523 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4524 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4525 << " is ignored because scalable vectors are not " 4526 "available.\n"); 4527 ORE->emit([&]() { 4528 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4529 TheLoop->getStartLoc(), 4530 TheLoop->getHeader()) 4531 << "User-specified vectorization factor " 4532 << ore::NV("UserVectorizationFactor", UserVF) 4533 << " is ignored because the target does not support scalable " 4534 "vectors. The compiler will pick a more suitable value."; 4535 }); 4536 } else { 4537 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4538 << " is unsafe. 
Ignoring scalable UserVF.\n"); 4539 ORE->emit([&]() { 4540 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4541 TheLoop->getStartLoc(), 4542 TheLoop->getHeader()) 4543 << "User-specified vectorization factor " 4544 << ore::NV("UserVectorizationFactor", UserVF) 4545 << " is unsafe. Ignoring the hint to let the compiler pick a " 4546 "more suitable value."; 4547 }); 4548 } 4549 } 4550 4551 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4552 << " / " << WidestType << " bits.\n"); 4553 4554 FixedScalableVFPair Result(ElementCount::getFixed(1), 4555 ElementCount::getScalable(0)); 4556 if (auto MaxVF = 4557 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 4558 MaxSafeFixedVF, FoldTailByMasking)) 4559 Result.FixedVF = MaxVF; 4560 4561 if (auto MaxVF = 4562 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 4563 MaxSafeScalableVF, FoldTailByMasking)) 4564 if (MaxVF.isScalable()) { 4565 Result.ScalableVF = MaxVF; 4566 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 4567 << "\n"); 4568 } 4569 4570 return Result; 4571 } 4572 4573 FixedScalableVFPair 4574 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4575 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4576 // TODO: It may by useful to do since it's still likely to be dynamically 4577 // uniform if the target can skip. 4578 reportVectorizationFailure( 4579 "Not inserting runtime ptr check for divergent target", 4580 "runtime pointer checks needed. Not enabled for divergent target", 4581 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4582 return FixedScalableVFPair::getNone(); 4583 } 4584 4585 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4586 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 4587 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4588 if (TC == 1) { 4589 reportVectorizationFailure("Single iteration (non) loop", 4590 "loop trip count is one, irrelevant for vectorization", 4591 "SingleIterationLoop", ORE, TheLoop); 4592 return FixedScalableVFPair::getNone(); 4593 } 4594 4595 switch (ScalarEpilogueStatus) { 4596 case CM_ScalarEpilogueAllowed: 4597 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4598 case CM_ScalarEpilogueNotAllowedUsePredicate: 4599 [[fallthrough]]; 4600 case CM_ScalarEpilogueNotNeededUsePredicate: 4601 LLVM_DEBUG( 4602 dbgs() << "LV: vector predicate hint/switch found.\n" 4603 << "LV: Not allowing scalar epilogue, creating predicated " 4604 << "vector loop.\n"); 4605 break; 4606 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4607 // fallthrough as a special case of OptForSize 4608 case CM_ScalarEpilogueNotAllowedOptSize: 4609 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4610 LLVM_DEBUG( 4611 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4612 else 4613 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4614 << "count.\n"); 4615 4616 // Bail if runtime checks are required, which are not good when optimising 4617 // for size. 4618 if (runtimeChecksRequired()) 4619 return FixedScalableVFPair::getNone(); 4620 4621 break; 4622 } 4623 4624 // The only loops we can vectorize without a scalar epilogue, are loops with 4625 // a bottom-test and a single exiting block. We'd have to handle the fact 4626 // that not every instruction executes on the last iteration. This will 4627 // require a lane mask which varies through the vector loop body. 
  // (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(MaxTC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);

  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we choose.
  std::optional<unsigned> MaxPowerOf2RuntimeVF =
      MaxFactors.FixedVF.getFixedValue();
  if (MaxFactors.ScalableVF) {
    std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
    if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
      MaxPowerOf2RuntimeVF = std::max<unsigned>(
          *MaxPowerOf2RuntimeVF,
          *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
    } else
      MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
  }

  if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC =
        UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    CanFoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fall back to vectorization with a scalar epilogue.
4698 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4699 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4700 "scalar epilogue instead.\n"); 4701 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4702 return MaxFactors; 4703 } 4704 4705 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 4706 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 4707 return FixedScalableVFPair::getNone(); 4708 } 4709 4710 if (TC == 0) { 4711 reportVectorizationFailure( 4712 "Unable to calculate the loop count due to complex control flow", 4713 "unable to calculate the loop count due to complex control flow", 4714 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4715 return FixedScalableVFPair::getNone(); 4716 } 4717 4718 reportVectorizationFailure( 4719 "Cannot optimize for size and vectorize at the same time.", 4720 "cannot optimize for size and vectorize at the same time. " 4721 "Enable vectorization of this loop with '#pragma clang loop " 4722 "vectorize(enable)' when compiling with -Os/-Oz", 4723 "NoTailLoopWithOptForSize", ORE, TheLoop); 4724 return FixedScalableVFPair::getNone(); 4725 } 4726 4727 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 4728 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, 4729 ElementCount MaxSafeVF, bool FoldTailByMasking) { 4730 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 4731 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 4732 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4733 : TargetTransformInfo::RGK_FixedWidthVector); 4734 4735 // Convenience function to return the minimum of two ElementCounts. 4736 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 4737 assert((LHS.isScalable() == RHS.isScalable()) && 4738 "Scalable flags must match"); 4739 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 4740 }; 4741 4742 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 4743 // Note that both WidestRegister and WidestType may not be a powers of 2. 4744 auto MaxVectorElementCount = ElementCount::get( 4745 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), 4746 ComputeScalableMaxVF); 4747 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 4748 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4749 << (MaxVectorElementCount * WidestType) << " bits.\n"); 4750 4751 if (!MaxVectorElementCount) { 4752 LLVM_DEBUG(dbgs() << "LV: The target has no " 4753 << (ComputeScalableMaxVF ? "scalable" : "fixed") 4754 << " vector registers.\n"); 4755 return ElementCount::getFixed(1); 4756 } 4757 4758 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 4759 if (MaxVectorElementCount.isScalable() && 4760 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 4761 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 4762 auto Min = Attr.getVScaleRangeMin(); 4763 WidestRegisterMinEC *= Min; 4764 } 4765 4766 // When a scalar epilogue is required, at least one iteration of the scalar 4767 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a 4768 // max VF that results in a dead vector loop. 
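  // Worked example (illustrative): with a known maximum trip count of 8, a
  // required scalar epilogue and a register width that would otherwise allow
  // VF = 8, the effective count becomes 7, so the clamp below selects
  // bit_floor(7) = 4 instead of a VF of 8 that would leave the vector loop
  // with zero iterations.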
4769 if (MaxTripCount > 0 && requiresScalarEpilogue(true)) 4770 MaxTripCount -= 1; 4771 4772 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && 4773 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { 4774 // If upper bound loop trip count (TC) is known at compile time there is no 4775 // point in choosing VF greater than TC (as done in the loop below). Select 4776 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is 4777 // scalable, we only fall back on a fixed VF when the TC is less than or 4778 // equal to the known number of lanes. 4779 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); 4780 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 4781 "exceeding the constant trip count: " 4782 << ClampedUpperTripCount << "\n"); 4783 return ElementCount::get( 4784 ClampedUpperTripCount, 4785 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); 4786 } 4787 4788 TargetTransformInfo::RegisterKind RegKind = 4789 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4790 : TargetTransformInfo::RGK_FixedWidthVector; 4791 ElementCount MaxVF = MaxVectorElementCount; 4792 if (MaximizeBandwidth || 4793 (MaximizeBandwidth.getNumOccurrences() == 0 && 4794 (TTI.shouldMaximizeVectorBandwidth(RegKind) || 4795 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { 4796 auto MaxVectorElementCountMaxBW = ElementCount::get( 4797 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), 4798 ComputeScalableMaxVF); 4799 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 4800 4801 // Collect all viable vectorization factors larger than the default MaxVF 4802 // (i.e. MaxVectorElementCount). 4803 SmallVector<ElementCount, 8> VFs; 4804 for (ElementCount VS = MaxVectorElementCount * 2; 4805 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 4806 VFs.push_back(VS); 4807 4808 // For each VF calculate its register usage. 4809 auto RUs = calculateRegisterUsage(VFs); 4810 4811 // Select the largest VF which doesn't require more registers than existing 4812 // ones. 4813 for (int i = RUs.size() - 1; i >= 0; --i) { 4814 bool Selected = true; 4815 for (auto &pair : RUs[i].MaxLocalUsers) { 4816 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 4817 if (pair.second > TargetNumRegisters) 4818 Selected = false; 4819 } 4820 if (Selected) { 4821 MaxVF = VFs[i]; 4822 break; 4823 } 4824 } 4825 if (ElementCount MinVF = 4826 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 4827 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 4828 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 4829 << ") with target's minimum: " << MinVF << '\n'); 4830 MaxVF = MinVF; 4831 } 4832 } 4833 4834 // Invalidate any widening decisions we might have made, in case the loop 4835 // requires prediction (decided later), but we have already made some 4836 // load/store widening decisions. 4837 invalidateCostModelingDecisions(); 4838 } 4839 return MaxVF; 4840 } 4841 4842 /// Convenience function that returns the value of vscale_range iff 4843 /// vscale_range.min == vscale_range.max or otherwise returns the value 4844 /// returned by the corresponding TTI method. 
4845 static std::optional<unsigned> 4846 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 4847 const Function *Fn = L->getHeader()->getParent(); 4848 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 4849 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 4850 auto Min = Attr.getVScaleRangeMin(); 4851 auto Max = Attr.getVScaleRangeMax(); 4852 if (Max && Min == Max) 4853 return Max; 4854 } 4855 4856 return TTI.getVScaleForTuning(); 4857 } 4858 4859 bool LoopVectorizationPlanner::isMoreProfitable( 4860 const VectorizationFactor &A, const VectorizationFactor &B) const { 4861 InstructionCost CostA = A.Cost; 4862 InstructionCost CostB = B.Cost; 4863 4864 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); 4865 4866 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { 4867 // If the trip count is a known (possibly small) constant, the trip count 4868 // will be rounded up to an integer number of iterations under 4869 // FoldTailByMasking. The total cost in that case will be 4870 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 4871 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 4872 // some extra overheads, but for the purpose of comparing the costs of 4873 // different VFs we can use this to compare the total loop-body cost 4874 // expected after vectorization. 4875 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 4876 InstructionCost VectorCost, 4877 InstructionCost ScalarCost) { 4878 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) 4879 : VectorCost * (MaxTripCount / VF) + 4880 ScalarCost * (MaxTripCount % VF); 4881 }; 4882 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); 4883 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); 4884 4885 return RTCostA < RTCostB; 4886 } 4887 4888 // Improve estimate for the vector width if it is scalable. 4889 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 4890 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 4891 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 4892 if (A.Width.isScalable()) 4893 EstimatedWidthA *= *VScale; 4894 if (B.Width.isScalable()) 4895 EstimatedWidthB *= *VScale; 4896 } 4897 4898 // Assume vscale may be larger than 1 (or the value being tuned for), 4899 // so that scalable vectorization is slightly favorable over fixed-width 4900 // vectorization. 4901 if (A.Width.isScalable() && !B.Width.isScalable()) 4902 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 4903 4904 // To avoid the need for FP division: 4905 // (CostA / A.Width) < (CostB / B.Width) 4906 // <=> (CostA * B.Width) < (CostB * A.Width) 4907 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 4908 } 4909 4910 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, 4911 OptimizationRemarkEmitter *ORE, 4912 Loop *TheLoop) { 4913 if (InvalidCosts.empty()) 4914 return; 4915 4916 // Emit a report of VFs with invalid costs in the loop. 4917 4918 // Group the remarks per instruction, keeping the instruction order from 4919 // InvalidCosts. 4920 std::map<Instruction *, unsigned> Numbering; 4921 unsigned I = 0; 4922 for (auto &Pair : InvalidCosts) 4923 if (!Numbering.count(Pair.first)) 4924 Numbering[Pair.first] = I++; 4925 4926 // Sort the list, first on instruction(number) then on VF. 
4927 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 4928 if (Numbering[A.first] != Numbering[B.first]) 4929 return Numbering[A.first] < Numbering[B.first]; 4930 ElementCountComparator ECC; 4931 return ECC(A.second, B.second); 4932 }); 4933 4934 // For a list of ordered instruction-vf pairs: 4935 // [(load, vf1), (load, vf2), (store, vf1)] 4936 // Group the instructions together to emit separate remarks for: 4937 // load (vf1, vf2) 4938 // store (vf1) 4939 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 4940 auto Subset = ArrayRef<InstructionVFPair>(); 4941 do { 4942 if (Subset.empty()) 4943 Subset = Tail.take_front(1); 4944 4945 Instruction *I = Subset.front().first; 4946 4947 // If the next instruction is different, or if there are no other pairs, 4948 // emit a remark for the collated subset. e.g. 4949 // [(load, vf1), (load, vf2))] 4950 // to emit: 4951 // remark: invalid costs for 'load' at VF=(vf, vf2) 4952 if (Subset == Tail || Tail[Subset.size()].first != I) { 4953 std::string OutString; 4954 raw_string_ostream OS(OutString); 4955 assert(!Subset.empty() && "Unexpected empty range"); 4956 OS << "Instruction with invalid costs prevented vectorization at VF=("; 4957 for (const auto &Pair : Subset) 4958 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; 4959 OS << "):"; 4960 if (auto *CI = dyn_cast<CallInst>(I)) 4961 OS << " call to " << CI->getCalledFunction()->getName(); 4962 else 4963 OS << " " << I->getOpcodeName(); 4964 OS.flush(); 4965 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 4966 Tail = Tail.drop_front(Subset.size()); 4967 Subset = {}; 4968 } else 4969 // Grow the subset by one element 4970 Subset = Tail.take_front(Subset.size() + 1); 4971 } while (!Tail.empty()); 4972 } 4973 4974 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( 4975 const ElementCountSet &VFCandidates) { 4976 InstructionCost ExpectedCost = 4977 CM.expectedCost(ElementCount::getFixed(1)).first; 4978 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 4979 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 4980 assert(VFCandidates.count(ElementCount::getFixed(1)) && 4981 "Expected Scalar VF to be a candidate"); 4982 4983 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 4984 ExpectedCost); 4985 VectorizationFactor ChosenFactor = ScalarCost; 4986 4987 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 4988 if (ForceVectorization && VFCandidates.size() > 1) { 4989 // Ignore scalar width, because the user explicitly wants vectorization. 4990 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 4991 // evaluation. 4992 ChosenFactor.Cost = InstructionCost::getMax(); 4993 } 4994 4995 SmallVector<InstructionVFPair> InvalidCosts; 4996 for (const auto &i : VFCandidates) { 4997 // The cost for scalar VF=1 is already calculated, so ignore it. 4998 if (i.isScalar()) 4999 continue; 5000 5001 LoopVectorizationCostModel::VectorizationCostTy C = 5002 CM.expectedCost(i, &InvalidCosts); 5003 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5004 5005 #ifndef NDEBUG 5006 unsigned AssumedMinimumVscale = 5007 getVScaleForTuning(OrigLoop, TTI).value_or(1); 5008 unsigned Width = 5009 Candidate.Width.isScalable() 5010 ? 
                Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
            : Candidate.Width.getFixedValue();
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (Candidate.Cost / Width));
    if (i.isScalable())
      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                        << AssumedMinimumVscale << ")");
    LLVM_DEBUG(dbgs() << ".\n");
#endif

    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable, add it to the ProfitableVFs list.
    if (isMoreProfitable(Candidate, ScalarCost))
      ProfitableVFs.push_back(Candidate);

    if (isMoreProfitable(Candidate, ChosenFactor))
      ChosenFactor = Candidate;
  }

  emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);

  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
    reportVectorizationFailure(
        "There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}

bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
    ElementCount VF) const {
  // Cross-iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(OrigLoop->getHeader()->phis(),
             [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (const auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc =
        Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
    for (User *U : PostInc->users())
      if (!OrigLoop->contains(cast<Instruction>(U)))
        return false;
    // Look for uses of the penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!OrigLoop->contains(cast<Instruction>(U)))
        return false;
  }

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.

  // Allow the target to opt out entirely.
  if (!TTI.preferEpilogueVectorization())
    return false;

  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
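  // Illustrative numbers (assumption): for a main-loop VF of vscale x 4 with
  // getVScaleForTuning() == 2, the estimated width below is 2 * 4 = 8, and
  // epilogue vectorization is only considered when 8 >= the
  // EpilogueVectorizationMinVF threshold.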
5098 if (TTI.getMaxInterleaveFactor(VF) <= 1) 5099 return false; 5100 5101 unsigned Multiplier = 1; 5102 if (VF.isScalable()) 5103 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); 5104 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) 5105 return true; 5106 return false; 5107 } 5108 5109 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( 5110 const ElementCount MainLoopVF, unsigned IC) { 5111 VectorizationFactor Result = VectorizationFactor::Disabled(); 5112 if (!EnableEpilogueVectorization) { 5113 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); 5114 return Result; 5115 } 5116 5117 if (!CM.isScalarEpilogueAllowed()) { 5118 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " 5119 "epilogue is allowed.\n"); 5120 return Result; 5121 } 5122 5123 // Not really a cost consideration, but check for unsupported cases here to 5124 // simplify the logic. 5125 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 5126 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 5127 "is not a supported candidate.\n"); 5128 return Result; 5129 } 5130 5131 if (EpilogueVectorizationForceVF > 1) { 5132 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 5133 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5134 if (hasPlanWithVF(ForcedEC)) 5135 return {ForcedEC, 0, 0}; 5136 else { 5137 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 5138 "viable.\n"); 5139 return Result; 5140 } 5141 } 5142 5143 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 5144 OrigLoop->getHeader()->getParent()->hasMinSize()) { 5145 LLVM_DEBUG( 5146 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 5147 return Result; 5148 } 5149 5150 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { 5151 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5152 "this loop\n"); 5153 return Result; 5154 } 5155 5156 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5157 // the main loop handles 8 lanes per iteration. We could still benefit from 5158 // vectorizing the epilogue loop with VF=4. 5159 ElementCount EstimatedRuntimeVF = MainLoopVF; 5160 if (MainLoopVF.isScalable()) { 5161 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5162 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 5163 EstimatedRuntimeVF *= *VScale; 5164 } 5165 5166 ScalarEvolution &SE = *PSE.getSE(); 5167 Type *TCType = Legal->getWidestInductionType(); 5168 const SCEV *RemainingIterations = nullptr; 5169 for (auto &NextVF : ProfitableVFs) { 5170 // Skip candidate VFs without a corresponding VPlan. 5171 if (!hasPlanWithVF(NextVF.Width)) 5172 continue; 5173 5174 // Skip candidate VFs with widths >= the estimate runtime VF (scalable 5175 // vectors) or the VF of the main loop (fixed vectors). 5176 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5177 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 5178 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) 5179 continue; 5180 5181 // If NextVF is greater than the number of remaining iterations, the 5182 // epilogue loop would be dead. Skip such factors. 5183 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 5184 // TODO: extend to support scalable VFs. 
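      // Worked example (illustrative): with a trip count of 20, a fixed
      // MainLoopVF of 16 and IC = 1, RemainingIterations = 20 urem 16 = 4, so
      // an epilogue VF of 8 is known to exceed the remainder and is skipped,
      // while VF = 4 remains a candidate.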
5185 if (!RemainingIterations) { 5186 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); 5187 RemainingIterations = SE.getURemExpr( 5188 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 5189 } 5190 if (SE.isKnownPredicate( 5191 CmpInst::ICMP_UGT, 5192 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 5193 RemainingIterations)) 5194 continue; 5195 } 5196 5197 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) 5198 Result = NextVF; 5199 } 5200 5201 if (Result != VectorizationFactor::Disabled()) 5202 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5203 << Result.Width << "\n"); 5204 return Result; 5205 } 5206 5207 std::pair<unsigned, unsigned> 5208 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5209 unsigned MinWidth = -1U; 5210 unsigned MaxWidth = 8; 5211 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5212 // For in-loop reductions, no element types are added to ElementTypesInLoop 5213 // if there are no loads/stores in the loop. In this case, check through the 5214 // reduction variables to determine the maximum width. 5215 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5216 // Reset MaxWidth so that we can find the smallest type used by recurrences 5217 // in the loop. 5218 MaxWidth = -1U; 5219 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 5220 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5221 // When finding the min width used by the recurrence we need to account 5222 // for casts on the input operands of the recurrence. 5223 MaxWidth = std::min<unsigned>( 5224 MaxWidth, std::min<unsigned>( 5225 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5226 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5227 } 5228 } else { 5229 for (Type *T : ElementTypesInLoop) { 5230 MinWidth = std::min<unsigned>( 5231 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5232 MaxWidth = std::max<unsigned>( 5233 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5234 } 5235 } 5236 return {MinWidth, MaxWidth}; 5237 } 5238 5239 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5240 ElementTypesInLoop.clear(); 5241 // For each block. 5242 for (BasicBlock *BB : TheLoop->blocks()) { 5243 // For each instruction in the loop. 5244 for (Instruction &I : BB->instructionsWithoutDebug()) { 5245 Type *T = I.getType(); 5246 5247 // Skip ignored values. 5248 if (ValuesToIgnore.count(&I)) 5249 continue; 5250 5251 // Only examine Loads, Stores and PHINodes. 5252 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5253 continue; 5254 5255 // Examine PHI nodes that are reduction variables. Update the type to 5256 // account for the recurrence type. 5257 if (auto *PN = dyn_cast<PHINode>(&I)) { 5258 if (!Legal->isReductionVariable(PN)) 5259 continue; 5260 const RecurrenceDescriptor &RdxDesc = 5261 Legal->getReductionVars().find(PN)->second; 5262 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5263 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5264 RdxDesc.getRecurrenceType(), 5265 TargetTransformInfo::ReductionFlags())) 5266 continue; 5267 T = RdxDesc.getRecurrenceType(); 5268 } 5269 5270 // Examine the stored values. 
5271 if (auto *ST = dyn_cast<StoreInst>(&I)) 5272 T = ST->getValueOperand()->getType(); 5273 5274 assert(T->isSized() && 5275 "Expected the load/store/recurrence type to be sized"); 5276 5277 ElementTypesInLoop.insert(T); 5278 } 5279 } 5280 } 5281 5282 unsigned 5283 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5284 InstructionCost LoopCost) { 5285 // -- The interleave heuristics -- 5286 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5287 // There are many micro-architectural considerations that we can't predict 5288 // at this level. For example, frontend pressure (on decode or fetch) due to 5289 // code size, or the number and capabilities of the execution ports. 5290 // 5291 // We use the following heuristics to select the interleave count: 5292 // 1. If the code has reductions, then we interleave to break the cross 5293 // iteration dependency. 5294 // 2. If the loop is really small, then we interleave to reduce the loop 5295 // overhead. 5296 // 3. We don't interleave if we think that we will spill registers to memory 5297 // due to the increased register pressure. 5298 5299 if (!isScalarEpilogueAllowed()) 5300 return 1; 5301 5302 // We used the distance for the interleave count. 5303 if (!Legal->isSafeForAnyVectorWidth()) 5304 return 1; 5305 5306 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5307 const bool HasReductions = !Legal->getReductionVars().empty(); 5308 // Do not interleave loops with a relatively small known or estimated trip 5309 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5310 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5311 // because with the above conditions interleaving can expose ILP and break 5312 // cross iteration dependences for reductions. 5313 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5314 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5315 return 1; 5316 5317 // If we did not calculate the cost for VF (because the user selected the VF) 5318 // then we calculate the cost of VF here. 5319 if (LoopCost == 0) { 5320 LoopCost = expectedCost(VF).first; 5321 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 5322 5323 // Loop body is free and there is no need for interleaving. 5324 if (LoopCost == 0) 5325 return 1; 5326 } 5327 5328 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5329 // We divide by these constants so assume that we have at least one 5330 // instruction that uses at least one register. 5331 for (auto& pair : R.MaxLocalUsers) { 5332 pair.second = std::max(pair.second, 1U); 5333 } 5334 5335 // We calculate the interleave count using the following formula. 5336 // Subtract the number of loop invariants from the number of available 5337 // registers. These registers are used by all of the interleaved instances. 5338 // Next, divide the remaining registers by the number of registers that is 5339 // required by the loop, in order to estimate how many parallel instances 5340 // fit without causing spills. All of this is rounded down if necessary to be 5341 // a power of two. We want power of two interleave count to simplify any 5342 // addressing operations or alignment considerations. 5343 // We also want power of two interleave counts to ensure that the induction 5344 // variable of the vector loop wraps to zero, when tail is folded by masking; 5345 // this currently happens when OptForSize, in which case IC is set to 1 above. 
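  // Worked example (illustrative): with 32 registers in a class, 2 of them
  // held by loop-invariant values and at most 5 values live at once, the
  // budget is (32 - 2) / 5 = 6, which rounds down to an interleave count of 4;
  // the induction-variable-aware variant below, (32 - 2 - 1) / (5 - 1) = 7,
  // rounds down to the same value.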
5346 unsigned IC = UINT_MAX; 5347 5348 for (auto& pair : R.MaxLocalUsers) { 5349 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5350 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5351 << " registers of " 5352 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5353 if (VF.isScalar()) { 5354 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5355 TargetNumRegisters = ForceTargetNumScalarRegs; 5356 } else { 5357 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5358 TargetNumRegisters = ForceTargetNumVectorRegs; 5359 } 5360 unsigned MaxLocalUsers = pair.second; 5361 unsigned LoopInvariantRegs = 0; 5362 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5363 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5364 5365 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 5366 MaxLocalUsers); 5367 // Don't count the induction variable as interleaved. 5368 if (EnableIndVarRegisterHeur) { 5369 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5370 std::max(1U, (MaxLocalUsers - 1))); 5371 } 5372 5373 IC = std::min(IC, TmpIC); 5374 } 5375 5376 // Clamp the interleave ranges to reasonable counts. 5377 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5378 5379 // Check if the user has overridden the max. 5380 if (VF.isScalar()) { 5381 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5382 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5383 } else { 5384 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5385 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5386 } 5387 5388 unsigned EstimatedVF = VF.getKnownMinValue(); 5389 if (VF.isScalable()) { 5390 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI)) 5391 EstimatedVF *= *VScale; 5392 } 5393 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); 5394 5395 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5396 if (KnownTC) { 5397 // If trip count is known we select between two prospective ICs, where 5398 // 1) the aggressive IC is capped by the trip count divided by VF 5399 // 2) the conservative IC is capped by the trip count divided by (VF * 2) 5400 // The final IC is selected in a way that the epilogue loop trip count is 5401 // minimized while maximizing the IC itself, so that we either run the 5402 // vector loop at least once if it generates a small epilogue loop, or else 5403 // we run the vector loop at least twice. 
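    // Worked example (illustrative): KnownTC = 32, EstimatedVF = 8 and a
    // target maximum of 4 give InterleaveCountUB = bit_floor(min(4, 4)) = 4
    // and InterleaveCountLB = bit_floor(min(2, 4)) = 2; both leave no scalar
    // tail (32 % 32 == 32 % 16 == 0), so the larger count is kept.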
5404 5405 unsigned InterleaveCountUB = bit_floor( 5406 std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount))); 5407 unsigned InterleaveCountLB = bit_floor(std::max( 5408 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount))); 5409 MaxInterleaveCount = InterleaveCountLB; 5410 5411 if (InterleaveCountUB != InterleaveCountLB) { 5412 unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB)); 5413 unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB)); 5414 // If both produce same scalar tail, maximize the IC to do the same work 5415 // in fewer vector loop iterations 5416 if (TailTripCountUB == TailTripCountLB) 5417 MaxInterleaveCount = InterleaveCountUB; 5418 } 5419 } else if (BestKnownTC) { 5420 // If trip count is an estimated compile time constant, limit the 5421 // IC to be capped by the trip count divided by VF * 2, such that the vector 5422 // loop runs at least twice to make interleaving seem profitable when there 5423 // is an epilogue loop present. Since exact Trip count is not known we 5424 // choose to be conservative in our IC estimate. 5425 MaxInterleaveCount = bit_floor(std::max( 5426 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount))); 5427 } 5428 5429 assert(MaxInterleaveCount > 0 && 5430 "Maximum interleave count must be greater than 0"); 5431 5432 // Clamp the calculated IC to be between the 1 and the max interleave count 5433 // that the target and trip count allows. 5434 if (IC > MaxInterleaveCount) 5435 IC = MaxInterleaveCount; 5436 else 5437 // Make sure IC is greater than 0. 5438 IC = std::max(1u, IC); 5439 5440 assert(IC > 0 && "Interleave count must be greater than 0."); 5441 5442 // Interleave if we vectorized this loop and there is a reduction that could 5443 // benefit from interleaving. 5444 if (VF.isVector() && HasReductions) { 5445 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5446 return IC; 5447 } 5448 5449 // For any scalar loop that either requires runtime checks or predication we 5450 // are better off leaving this to the unroller. Note that if we've already 5451 // vectorized the loop we will have done the runtime check and so interleaving 5452 // won't require further checks. 5453 bool ScalarInterleavingRequiresPredication = 5454 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5455 return Legal->blockNeedsPredication(BB); 5456 })); 5457 bool ScalarInterleavingRequiresRuntimePointerCheck = 5458 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5459 5460 // We want to interleave small loops in order to reduce the loop overhead and 5461 // potentially expose ILP opportunities. 5462 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5463 << "LV: IC is " << IC << '\n' 5464 << "LV: VF is " << VF << '\n'); 5465 const bool AggressivelyInterleaveReductions = 5466 TTI.enableAggressiveInterleaving(HasReductions); 5467 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5468 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5469 // We assume that the cost overhead is 1 and we use the cost model 5470 // to estimate the cost of the loop and interleave until the cost of the 5471 // loop overhead is about 5% of the cost of the loop. 5472 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5473 SmallLoopCost / *LoopCost.getValue())); 5474 5475 // Interleave until store/load ports (estimated by max interleave count) are 5476 // saturated. 
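    // Illustrative numbers (assumption): with IC = 8, two stores and four
    // loads in the loop, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 4 = 2; if 4
    // exceeds SmallIC, the port-saturation heuristic below returns 4.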
5477 unsigned NumStores = Legal->getNumStores(); 5478 unsigned NumLoads = Legal->getNumLoads(); 5479 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5480 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5481 5482 // There is little point in interleaving for reductions containing selects 5483 // and compares when VF=1 since it may just create more overhead than it's 5484 // worth for loops with small trip counts. This is because we still have to 5485 // do the final reduction after the loop. 5486 bool HasSelectCmpReductions = 5487 HasReductions && 5488 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5489 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5490 return RecurrenceDescriptor::isAnyOfRecurrenceKind( 5491 RdxDesc.getRecurrenceKind()); 5492 }); 5493 if (HasSelectCmpReductions) { 5494 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5495 return 1; 5496 } 5497 5498 // If we have a scalar reduction (vector reductions are already dealt with 5499 // by this point), we can increase the critical path length if the loop 5500 // we're interleaving is inside another loop. For tree-wise reductions 5501 // set the limit to 2, and for ordered reductions it's best to disable 5502 // interleaving entirely. 5503 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5504 bool HasOrderedReductions = 5505 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5506 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5507 return RdxDesc.isOrdered(); 5508 }); 5509 if (HasOrderedReductions) { 5510 LLVM_DEBUG( 5511 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5512 return 1; 5513 } 5514 5515 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5516 SmallIC = std::min(SmallIC, F); 5517 StoresIC = std::min(StoresIC, F); 5518 LoadsIC = std::min(LoadsIC, F); 5519 } 5520 5521 if (EnableLoadStoreRuntimeInterleave && 5522 std::max(StoresIC, LoadsIC) > SmallIC) { 5523 LLVM_DEBUG( 5524 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5525 return std::max(StoresIC, LoadsIC); 5526 } 5527 5528 // If there are scalar reductions and TTI has enabled aggressive 5529 // interleaving for reductions, we will interleave to expose ILP. 5530 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5531 AggressivelyInterleaveReductions) { 5532 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5533 // Interleave no less than SmallIC but not as aggressive as the normal IC 5534 // to satisfy the rare situation when resources are too limited. 5535 return std::max(IC / 2, SmallIC); 5536 } else { 5537 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5538 return SmallIC; 5539 } 5540 } 5541 5542 // Interleave if this is a large loop (small loops are already dealt with by 5543 // this point) that could benefit from interleaving. 5544 if (AggressivelyInterleaveReductions) { 5545 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5546 return IC; 5547 } 5548 5549 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5550 return 1; 5551 } 5552 5553 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5554 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5555 // This function calculates the register usage by measuring the highest number 5556 // of values that are alive at a single location. Obviously, this is a very 5557 // rough estimation. We scan the loop in a topological order in order and 5558 // assign a number to each instruction. 
We use RPO to ensure that defs are 5559 // met before their users. We assume that each instruction that has in-loop 5560 // users starts an interval. We record every time that an in-loop value is 5561 // used, so we have a list of the first and last occurrences of each 5562 // instruction. Next, we transpose this data structure into a multi map that 5563 // holds the list of intervals that *end* at a specific location. This multi 5564 // map allows us to perform a linear search. We scan the instructions linearly 5565 // and record each time that a new interval starts, by placing it in a set. 5566 // If we find this value in the multi-map then we remove it from the set. 5567 // The max register usage is the maximum size of the set. 5568 // We also search for instructions that are defined outside the loop, but are 5569 // used inside the loop. We need this number separately from the max-interval 5570 // usage number because when we unroll, loop-invariant values do not take 5571 // more register. 5572 LoopBlocksDFS DFS(TheLoop); 5573 DFS.perform(LI); 5574 5575 RegisterUsage RU; 5576 5577 // Each 'key' in the map opens a new interval. The values 5578 // of the map are the index of the 'last seen' usage of the 5579 // instruction that is the key. 5580 using IntervalMap = DenseMap<Instruction *, unsigned>; 5581 5582 // Maps instruction to its index. 5583 SmallVector<Instruction *, 64> IdxToInstr; 5584 // Marks the end of each interval. 5585 IntervalMap EndPoint; 5586 // Saves the list of instruction indices that are used in the loop. 5587 SmallPtrSet<Instruction *, 8> Ends; 5588 // Saves the list of values that are used in the loop but are defined outside 5589 // the loop (not including non-instruction values such as arguments and 5590 // constants). 5591 SmallSetVector<Instruction *, 8> LoopInvariants; 5592 5593 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5594 for (Instruction &I : BB->instructionsWithoutDebug()) { 5595 IdxToInstr.push_back(&I); 5596 5597 // Save the end location of each USE. 5598 for (Value *U : I.operands()) { 5599 auto *Instr = dyn_cast<Instruction>(U); 5600 5601 // Ignore non-instruction values such as arguments, constants, etc. 5602 // FIXME: Might need some motivation why these values are ignored. If 5603 // for example an argument is used inside the loop it will increase the 5604 // register pressure (so shouldn't we add it to LoopInvariants). 5605 if (!Instr) 5606 continue; 5607 5608 // If this instruction is outside the loop then record it and continue. 5609 if (!TheLoop->contains(Instr)) { 5610 LoopInvariants.insert(Instr); 5611 continue; 5612 } 5613 5614 // Overwrite previous end points. 5615 EndPoint[Instr] = IdxToInstr.size(); 5616 Ends.insert(Instr); 5617 } 5618 } 5619 } 5620 5621 // Saves the list of intervals that end with the index in 'key'. 5622 using InstrList = SmallVector<Instruction *, 2>; 5623 DenseMap<unsigned, InstrList> TransposeEnds; 5624 5625 // Transpose the EndPoints to a list of values that end at each index. 
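  // For instance (illustrative only): if EndPoint maps {%a -> 5, %b -> 5,
  // %c -> 9}, the transposed map is {5 -> [%a, %b], 9 -> [%c]}, so when the
  // linear scan below reaches index 5 it can close the intervals of %a and
  // %b with a single lookup.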
5626 for (auto &Interval : EndPoint) 5627 TransposeEnds[Interval.second].push_back(Interval.first); 5628 5629 SmallPtrSet<Instruction *, 8> OpenIntervals; 5630 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5631 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5632 5633 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5634 5635 const auto &TTICapture = TTI; 5636 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5637 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5638 return 0; 5639 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5640 }; 5641 5642 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5643 Instruction *I = IdxToInstr[i]; 5644 5645 // Remove all of the instructions that end at this location. 5646 InstrList &List = TransposeEnds[i]; 5647 for (Instruction *ToRemove : List) 5648 OpenIntervals.erase(ToRemove); 5649 5650 // Ignore instructions that are never used within the loop. 5651 if (!Ends.count(I)) 5652 continue; 5653 5654 // Skip ignored values. 5655 if (ValuesToIgnore.count(I)) 5656 continue; 5657 5658 collectInLoopReductions(); 5659 5660 // For each VF find the maximum usage of registers. 5661 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5662 // Count the number of registers used, per register class, given all open 5663 // intervals. 5664 // Note that elements in this SmallMapVector will be default constructed 5665 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 5666 // there is no previous entry for ClassID. 5667 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5668 5669 if (VFs[j].isScalar()) { 5670 for (auto *Inst : OpenIntervals) { 5671 unsigned ClassID = 5672 TTI.getRegisterClassForType(false, Inst->getType()); 5673 // FIXME: The target might use more than one register for the type 5674 // even in the scalar case. 5675 RegUsage[ClassID] += 1; 5676 } 5677 } else { 5678 collectUniformsAndScalars(VFs[j]); 5679 for (auto *Inst : OpenIntervals) { 5680 // Skip ignored values for VF > 1. 5681 if (VecValuesToIgnore.count(Inst)) 5682 continue; 5683 if (isScalarAfterVectorization(Inst, VFs[j])) { 5684 unsigned ClassID = 5685 TTI.getRegisterClassForType(false, Inst->getType()); 5686 // FIXME: The target might use more than one register for the type 5687 // even in the scalar case. 5688 RegUsage[ClassID] += 1; 5689 } else { 5690 unsigned ClassID = 5691 TTI.getRegisterClassForType(true, Inst->getType()); 5692 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5693 } 5694 } 5695 } 5696 5697 for (auto& pair : RegUsage) { 5698 auto &Entry = MaxUsages[j][pair.first]; 5699 Entry = std::max(Entry, pair.second); 5700 } 5701 } 5702 5703 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5704 << OpenIntervals.size() << '\n'); 5705 5706 // Add the current instruction to the list of open intervals. 5707 OpenIntervals.insert(I); 5708 } 5709 5710 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5711 // Note that elements in this SmallMapVector will be default constructed 5712 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 5713 // there is no previous entry for ClassID. 5714 SmallMapVector<unsigned, unsigned, 4> Invariant; 5715 5716 for (auto *Inst : LoopInvariants) { 5717 // FIXME: The target might use more than one register for the type 5718 // even in the scalar case. 
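    // Informal note (example, not exhaustive): an invariant is costed as
    // scalar below when every in-loop user is itself scalar after
    // vectorization (or sits outside the loop), e.g. a loop-invariant
    // pointer that only feeds scalarized address computations; otherwise it
    // is costed against the vector register class at the current VF.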
5719 bool IsScalar = all_of(Inst->users(), [&](User *U) { 5720 auto *I = cast<Instruction>(U); 5721 return TheLoop != LI->getLoopFor(I->getParent()) || 5722 isScalarAfterVectorization(I, VFs[i]); 5723 }); 5724 5725 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; 5726 unsigned ClassID = 5727 TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); 5728 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); 5729 } 5730 5731 LLVM_DEBUG({ 5732 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5733 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5734 << " item\n"; 5735 for (const auto &pair : MaxUsages[i]) { 5736 dbgs() << "LV(REG): RegisterClass: " 5737 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5738 << " registers\n"; 5739 } 5740 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5741 << " item\n"; 5742 for (const auto &pair : Invariant) { 5743 dbgs() << "LV(REG): RegisterClass: " 5744 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5745 << " registers\n"; 5746 } 5747 }); 5748 5749 RU.LoopInvariantRegs = Invariant; 5750 RU.MaxLocalUsers = MaxUsages[i]; 5751 RUs[i] = RU; 5752 } 5753 5754 return RUs; 5755 } 5756 5757 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 5758 ElementCount VF) { 5759 // TODO: Cost model for emulated masked load/store is completely 5760 // broken. This hack guides the cost model to use an artificially 5761 // high enough value to practically disable vectorization with such 5762 // operations, except where previously deployed legality hack allowed 5763 // using very low cost values. This is to avoid regressions coming simply 5764 // from moving "masked load/store" check from legality to cost model. 5765 // Masked Load/Gather emulation was previously never allowed. 5766 // Limited number of Masked Store/Scatter emulation was allowed. 5767 assert((isPredicatedInst(I)) && 5768 "Expecting a scalar emulated instruction"); 5769 return isa<LoadInst>(I) || 5770 (isa<StoreInst>(I) && 5771 NumPredStores > NumberOfStoresToPredicate); 5772 } 5773 5774 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5775 // If we aren't vectorizing the loop, or if we've already collected the 5776 // instructions to scalarize, there's nothing to do. Collection may already 5777 // have occurred if we have a user-selected VF and are now computing the 5778 // expected cost for interleaving. 5779 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) 5780 return; 5781 5782 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5783 // not profitable to scalarize any instructions, the presence of VF in the 5784 // map will indicate that we've analyzed it already. 5785 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5786 5787 PredicatedBBsAfterVectorization[VF].clear(); 5788 5789 // Find all the instructions that are scalar with predication in the loop and 5790 // determine if it would be better to not if-convert the blocks they are in. 5791 // If so, we also record the instructions to scalarize. 5792 for (BasicBlock *BB : TheLoop->blocks()) { 5793 if (!blockNeedsPredicationForAnyReason(BB)) 5794 continue; 5795 for (Instruction &I : *BB) 5796 if (isScalarWithPredication(&I, VF)) { 5797 ScalarCostsTy ScalarCosts; 5798 // Do not apply discount if scalable, because that would lead to 5799 // invalid scalarization costs. 5800 // Do not apply discount logic if hacked cost is needed 5801 // for emulated masked memrefs. 
5802 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 5803 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5804 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5805 // Remember that BB will remain after vectorization. 5806 PredicatedBBsAfterVectorization[VF].insert(BB); 5807 } 5808 } 5809 } 5810 5811 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 5812 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 5813 assert(!isUniformAfterVectorization(PredInst, VF) && 5814 "Instruction marked uniform-after-vectorization will be predicated"); 5815 5816 // Initialize the discount to zero, meaning that the scalar version and the 5817 // vector version cost the same. 5818 InstructionCost Discount = 0; 5819 5820 // Holds instructions to analyze. The instructions we visit are mapped in 5821 // ScalarCosts. Those instructions are the ones that would be scalarized if 5822 // we find that the scalar version costs less. 5823 SmallVector<Instruction *, 8> Worklist; 5824 5825 // Returns true if the given instruction can be scalarized. 5826 auto canBeScalarized = [&](Instruction *I) -> bool { 5827 // We only attempt to scalarize instructions forming a single-use chain 5828 // from the original predicated block that would otherwise be vectorized. 5829 // Although not strictly necessary, we give up on instructions we know will 5830 // already be scalar to avoid traversing chains that are unlikely to be 5831 // beneficial. 5832 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5833 isScalarAfterVectorization(I, VF)) 5834 return false; 5835 5836 // If the instruction is scalar with predication, it will be analyzed 5837 // separately. We ignore it within the context of PredInst. 5838 if (isScalarWithPredication(I, VF)) 5839 return false; 5840 5841 // If any of the instruction's operands are uniform after vectorization, 5842 // the instruction cannot be scalarized. This prevents, for example, a 5843 // masked load from being scalarized. 5844 // 5845 // We assume we will only emit a value for lane zero of an instruction 5846 // marked uniform after vectorization, rather than VF identical values. 5847 // Thus, if we scalarize an instruction that uses a uniform, we would 5848 // create uses of values corresponding to the lanes we aren't emitting code 5849 // for. This behavior can be changed by allowing getScalarValue to clone 5850 // the lane zero values for uniforms rather than asserting. 5851 for (Use &U : I->operands()) 5852 if (auto *J = dyn_cast<Instruction>(U.get())) 5853 if (isUniformAfterVectorization(J, VF)) 5854 return false; 5855 5856 // Otherwise, we can scalarize the instruction. 5857 return true; 5858 }; 5859 5860 // Compute the expected cost discount from scalarizing the entire expression 5861 // feeding the predicated instruction. We currently only consider expressions 5862 // that are single-use instruction chains. 5863 Worklist.push_back(PredInst); 5864 while (!Worklist.empty()) { 5865 Instruction *I = Worklist.pop_back_val(); 5866 5867 // If we've already analyzed the instruction, there's nothing to do. 5868 if (ScalarCosts.contains(I)) 5869 continue; 5870 5871 // Compute the cost of the vector instruction. Note that this cost already 5872 // includes the scalarization overhead of the predicated instruction. 5873 InstructionCost VectorCost = getInstructionCost(I, VF).first; 5874 5875 // Compute the cost of the scalarized instruction. 
This cost is the cost of 5876 // the instruction as if it wasn't if-converted and instead remained in the 5877 // predicated block. We will scale this cost by block probability after 5878 // computing the scalarization overhead. 5879 InstructionCost ScalarCost = 5880 VF.getFixedValue() * 5881 getInstructionCost(I, ElementCount::getFixed(1)).first; 5882 5883 // Compute the scalarization overhead of needed insertelement instructions 5884 // and phi nodes. 5885 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5886 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 5887 ScalarCost += TTI.getScalarizationOverhead( 5888 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5889 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 5890 /*Extract*/ false, CostKind); 5891 ScalarCost += 5892 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 5893 } 5894 5895 // Compute the scalarization overhead of needed extractelement 5896 // instructions. For each of the instruction's operands, if the operand can 5897 // be scalarized, add it to the worklist; otherwise, account for the 5898 // overhead. 5899 for (Use &U : I->operands()) 5900 if (auto *J = dyn_cast<Instruction>(U.get())) { 5901 assert(VectorType::isValidElementType(J->getType()) && 5902 "Instruction has non-scalar type"); 5903 if (canBeScalarized(J)) 5904 Worklist.push_back(J); 5905 else if (needsExtract(J, VF)) { 5906 ScalarCost += TTI.getScalarizationOverhead( 5907 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5908 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 5909 /*Extract*/ true, CostKind); 5910 } 5911 } 5912 5913 // Scale the total scalar cost by block probability. 5914 ScalarCost /= getReciprocalPredBlockProb(); 5915 5916 // Compute the discount. A non-negative discount means the vector version 5917 // of the instruction costs more, and scalarizing would be beneficial. 5918 Discount += VectorCost - ScalarCost; 5919 ScalarCosts[I] = ScalarCost; 5920 } 5921 5922 return Discount; 5923 } 5924 5925 LoopVectorizationCostModel::VectorizationCostTy 5926 LoopVectorizationCostModel::expectedCost( 5927 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 5928 VectorizationCostTy Cost; 5929 5930 // For each block. 5931 for (BasicBlock *BB : TheLoop->blocks()) { 5932 VectorizationCostTy BlockCost; 5933 5934 // For each instruction in the old loop. 5935 for (Instruction &I : BB->instructionsWithoutDebug()) { 5936 // Skip ignored values. 5937 if (ValuesToIgnore.count(&I) || 5938 (VF.isVector() && VecValuesToIgnore.count(&I))) 5939 continue; 5940 5941 VectorizationCostTy C = getInstructionCost(&I, VF); 5942 5943 // Check if we should override the cost. 5944 if (C.first.isValid() && 5945 ForceTargetInstructionCost.getNumOccurrences() > 0) 5946 C.first = InstructionCost(ForceTargetInstructionCost); 5947 5948 // Keep a list of instructions with invalid costs. 5949 if (Invalid && !C.first.isValid()) 5950 Invalid->emplace_back(&I, VF); 5951 5952 BlockCost.first += C.first; 5953 BlockCost.second |= C.second; 5954 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5955 << " for VF " << VF << " For instruction: " << I 5956 << '\n'); 5957 } 5958 5959 // If we are vectorizing a predicated block, it will have been 5960 // if-converted. This means that the block's instructions (aside from 5961 // stores and instructions that may divide by zero) will now be 5962 // unconditionally executed. For the scalar case, we may not always execute 5963 // the predicated block, if it is an if-else block. 
Thus, scale the block's 5964 // cost by the probability of executing it. blockNeedsPredication from 5965 // Legal is used so as to not include all blocks in tail folded loops. 5966 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 5967 BlockCost.first /= getReciprocalPredBlockProb(); 5968 5969 Cost.first += BlockCost.first; 5970 Cost.second |= BlockCost.second; 5971 } 5972 5973 return Cost; 5974 } 5975 5976 /// Gets Address Access SCEV after verifying that the access pattern 5977 /// is loop invariant except the induction variable dependence. 5978 /// 5979 /// This SCEV can be sent to the Target in order to estimate the address 5980 /// calculation cost. 5981 static const SCEV *getAddressAccessSCEV( 5982 Value *Ptr, 5983 LoopVectorizationLegality *Legal, 5984 PredicatedScalarEvolution &PSE, 5985 const Loop *TheLoop) { 5986 5987 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5988 if (!Gep) 5989 return nullptr; 5990 5991 // We are looking for a gep with all loop invariant indices except for one 5992 // which should be an induction variable. 5993 auto SE = PSE.getSE(); 5994 unsigned NumOperands = Gep->getNumOperands(); 5995 for (unsigned i = 1; i < NumOperands; ++i) { 5996 Value *Opd = Gep->getOperand(i); 5997 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5998 !Legal->isInductionVariable(Opd)) 5999 return nullptr; 6000 } 6001 6002 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6003 return PSE.getSCEV(Ptr); 6004 } 6005 6006 InstructionCost 6007 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6008 ElementCount VF) { 6009 assert(VF.isVector() && 6010 "Scalarization cost of instruction implies vectorization."); 6011 if (VF.isScalable()) 6012 return InstructionCost::getInvalid(); 6013 6014 Type *ValTy = getLoadStoreType(I); 6015 auto SE = PSE.getSE(); 6016 6017 unsigned AS = getLoadStoreAddressSpace(I); 6018 Value *Ptr = getLoadStorePointerOperand(I); 6019 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6020 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6021 // that it is being called from this specific place. 6022 6023 // Figure out whether the access is strided and get the stride value 6024 // if it's known in compile time 6025 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6026 6027 // Get the cost of the scalar memory instruction and address computation. 6028 InstructionCost Cost = 6029 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6030 6031 // Don't pass *I here, since it is scalar but will actually be part of a 6032 // vectorized loop where the user of it is a vectorized instruction. 6033 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6034 const Align Alignment = getLoadStoreAlignment(I); 6035 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 6036 ValTy->getScalarType(), 6037 Alignment, AS, CostKind); 6038 6039 // Get the overhead of the extractelement and insertelement instructions 6040 // we might create due to scalarization. 6041 Cost += getScalarizationOverhead(I, VF, CostKind); 6042 6043 // If we have a predicated load/store, it will need extra i1 extracts and 6044 // conditional branches, but may not be executed for each vector lane. Scale 6045 // the cost by the probability of executing the predicated block. 
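  // Rough illustration (assuming the usual reciprocal block probability of
  // 2, i.e. the predicated block is expected to run on about half of the
  // iterations): a scalarized cost C becomes C / 2 here, before the per-lane
  // i1 extract and branch costs are added back below.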
6046 if (isPredicatedInst(I)) { 6047 Cost /= getReciprocalPredBlockProb(); 6048 6049 // Add the cost of an i1 extract and a branch 6050 auto *Vec_i1Ty = 6051 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6052 Cost += TTI.getScalarizationOverhead( 6053 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6054 /*Insert=*/false, /*Extract=*/true, CostKind); 6055 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 6056 6057 if (useEmulatedMaskMemRefHack(I, VF)) 6058 // Artificially setting to a high enough value to practically disable 6059 // vectorization with such operations. 6060 Cost = 3000000; 6061 } 6062 6063 return Cost; 6064 } 6065 6066 InstructionCost 6067 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6068 ElementCount VF) { 6069 Type *ValTy = getLoadStoreType(I); 6070 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6071 Value *Ptr = getLoadStorePointerOperand(I); 6072 unsigned AS = getLoadStoreAddressSpace(I); 6073 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6074 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6075 6076 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6077 "Stride should be 1 or -1 for consecutive memory access"); 6078 const Align Alignment = getLoadStoreAlignment(I); 6079 InstructionCost Cost = 0; 6080 if (Legal->isMaskRequired(I)) { 6081 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6082 CostKind); 6083 } else { 6084 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6085 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6086 CostKind, OpInfo, I); 6087 } 6088 6089 bool Reverse = ConsecutiveStride < 0; 6090 if (Reverse) 6091 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6092 std::nullopt, CostKind, 0); 6093 return Cost; 6094 } 6095 6096 InstructionCost 6097 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6098 ElementCount VF) { 6099 assert(Legal->isUniformMemOp(*I, VF)); 6100 6101 Type *ValTy = getLoadStoreType(I); 6102 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6103 const Align Alignment = getLoadStoreAlignment(I); 6104 unsigned AS = getLoadStoreAddressSpace(I); 6105 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6106 if (isa<LoadInst>(I)) { 6107 return TTI.getAddressComputationCost(ValTy) + 6108 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6109 CostKind) + 6110 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6111 } 6112 StoreInst *SI = cast<StoreInst>(I); 6113 6114 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 6115 return TTI.getAddressComputationCost(ValTy) + 6116 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6117 CostKind) + 6118 (isLoopInvariantStoreValue 6119 ? 
0 6120 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6121 CostKind, VF.getKnownMinValue() - 1)); 6122 } 6123 6124 InstructionCost 6125 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6126 ElementCount VF) { 6127 Type *ValTy = getLoadStoreType(I); 6128 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6129 const Align Alignment = getLoadStoreAlignment(I); 6130 const Value *Ptr = getLoadStorePointerOperand(I); 6131 6132 return TTI.getAddressComputationCost(VectorTy) + 6133 TTI.getGatherScatterOpCost( 6134 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6135 TargetTransformInfo::TCK_RecipThroughput, I); 6136 } 6137 6138 InstructionCost 6139 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6140 ElementCount VF) { 6141 Type *ValTy = getLoadStoreType(I); 6142 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6143 unsigned AS = getLoadStoreAddressSpace(I); 6144 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6145 6146 auto Group = getInterleavedAccessGroup(I); 6147 assert(Group && "Fail to get an interleaved access group."); 6148 6149 unsigned InterleaveFactor = Group->getFactor(); 6150 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6151 6152 // Holds the indices of existing members in the interleaved group. 6153 SmallVector<unsigned, 4> Indices; 6154 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6155 if (Group->getMember(IF)) 6156 Indices.push_back(IF); 6157 6158 // Calculate the cost of the whole interleaved group. 6159 bool UseMaskForGaps = 6160 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6161 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6162 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6163 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6164 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); 6165 6166 if (Group->isReverse()) { 6167 // TODO: Add support for reversed masked interleaved access. 6168 assert(!Legal->isMaskRequired(I) && 6169 "Reverse masked interleaved access not supported."); 6170 Cost += Group->getNumMembers() * 6171 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6172 std::nullopt, CostKind, 0); 6173 } 6174 return Cost; 6175 } 6176 6177 std::optional<InstructionCost> 6178 LoopVectorizationCostModel::getReductionPatternCost( 6179 Instruction *I, ElementCount VF, Type *Ty, 6180 TTI::TargetCostKind CostKind) const { 6181 using namespace llvm::PatternMatch; 6182 // Early exit for no inloop reductions 6183 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6184 return std::nullopt; 6185 auto *VectorTy = cast<VectorType>(Ty); 6186 6187 // We are looking for a pattern of, and finding the minimal acceptable cost: 6188 // reduce(mul(ext(A), ext(B))) or 6189 // reduce(mul(A, B)) or 6190 // reduce(ext(A)) or 6191 // reduce(A). 6192 // The basic idea is that we walk down the tree to do that, finding the root 6193 // reduction instruction in InLoopReductionImmediateChains. From there we find 6194 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6195 // of the components. If the reduction cost is lower then we return it for the 6196 // reduction instruction and 0 for the other instructions in the pattern. If 6197 // it is not we return an invalid cost specifying the orignal cost method 6198 // should be used. 
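  // Illustrative IR shape for the first pattern above (value names invented
  // for the example):
  //   %a.ext = sext <16 x i8> %a to <16 x i32>
  //   %b.ext = sext <16 x i8> %b to <16 x i32>
  //   %mul   = mul <16 x i32> %a.ext, %b.ext
  //   %red   = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
  // If the target reports a cheaper combined mul-accumulate reduction cost,
  // the whole chain is costed as one unit and the inner instructions get 0.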
6199 Instruction *RetI = I; 6200 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6201 if (!RetI->hasOneUser()) 6202 return std::nullopt; 6203 RetI = RetI->user_back(); 6204 } 6205 6206 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 6207 RetI->user_back()->getOpcode() == Instruction::Add) { 6208 RetI = RetI->user_back(); 6209 } 6210 6211 // Test if the found instruction is a reduction, and if not return an invalid 6212 // cost specifying the parent to use the original cost modelling. 6213 if (!InLoopReductionImmediateChains.count(RetI)) 6214 return std::nullopt; 6215 6216 // Find the reduction this chain is a part of and calculate the basic cost of 6217 // the reduction on its own. 6218 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); 6219 Instruction *ReductionPhi = LastChain; 6220 while (!isa<PHINode>(ReductionPhi)) 6221 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); 6222 6223 const RecurrenceDescriptor &RdxDesc = 6224 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6225 6226 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6227 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6228 6229 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6230 // normal fmul instruction to the cost of the fadd reduction. 6231 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6232 BaseCost += 6233 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6234 6235 // If we're using ordered reductions then we can just return the base cost 6236 // here, since getArithmeticReductionCost calculates the full ordered 6237 // reduction cost when FP reassociation is not allowed. 6238 if (useOrderedReductions(RdxDesc)) 6239 return BaseCost; 6240 6241 // Get the operand that was not the reduction chain and match it to one of the 6242 // patterns, returning the better cost if it is found. 6243 Instruction *RedOp = RetI->getOperand(1) == LastChain 6244 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6245 : dyn_cast<Instruction>(RetI->getOperand(1)); 6246 6247 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6248 6249 Instruction *Op0, *Op1; 6250 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6251 match(RedOp, 6252 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6253 match(Op0, m_ZExtOrSExt(m_Value())) && 6254 Op0->getOpcode() == Op1->getOpcode() && 6255 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6256 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6257 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6258 6259 // Matched reduce.add(ext(mul(ext(A), ext(B))) 6260 // Note that the extend opcodes need to all match, or if A==B they will have 6261 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6262 // which is equally fine. 
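    // Spelled out, the comparison below is (informally):
    //   cost(mul-accumulate reduction)
    //     vs. 2 * cost(inner ext) + cost(mul) + cost(outer ext)
    //        + cost(plain add reduction)
    // which is just a restatement of the RedCost check that follows.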
6263 bool IsUnsigned = isa<ZExtInst>(Op0); 6264 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6265 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6266 6267 InstructionCost ExtCost = 6268 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6269 TTI::CastContextHint::None, CostKind, Op0); 6270 InstructionCost MulCost = 6271 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6272 InstructionCost Ext2Cost = 6273 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6274 TTI::CastContextHint::None, CostKind, RedOp); 6275 6276 InstructionCost RedCost = TTI.getMulAccReductionCost( 6277 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6278 6279 if (RedCost.isValid() && 6280 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6281 return I == RetI ? RedCost : 0; 6282 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6283 !TheLoop->isLoopInvariant(RedOp)) { 6284 // Matched reduce(ext(A)) 6285 bool IsUnsigned = isa<ZExtInst>(RedOp); 6286 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6287 InstructionCost RedCost = TTI.getExtendedReductionCost( 6288 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6289 RdxDesc.getFastMathFlags(), CostKind); 6290 6291 InstructionCost ExtCost = 6292 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6293 TTI::CastContextHint::None, CostKind, RedOp); 6294 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6295 return I == RetI ? RedCost : 0; 6296 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6297 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6298 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6299 Op0->getOpcode() == Op1->getOpcode() && 6300 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6301 bool IsUnsigned = isa<ZExtInst>(Op0); 6302 Type *Op0Ty = Op0->getOperand(0)->getType(); 6303 Type *Op1Ty = Op1->getOperand(0)->getType(); 6304 Type *LargestOpTy = 6305 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6306 : Op0Ty; 6307 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6308 6309 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 6310 // different sizes. We take the largest type as the ext to reduce, and add 6311 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6312 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6313 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6314 TTI::CastContextHint::None, CostKind, Op0); 6315 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6316 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6317 TTI::CastContextHint::None, CostKind, Op1); 6318 InstructionCost MulCost = 6319 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6320 6321 InstructionCost RedCost = TTI.getMulAccReductionCost( 6322 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6323 InstructionCost ExtraExtCost = 0; 6324 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6325 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6326 ExtraExtCost = TTI.getCastInstrCost( 6327 ExtraExtOp->getOpcode(), ExtType, 6328 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6329 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6330 } 6331 6332 if (RedCost.isValid() && 6333 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6334 return I == RetI ? 
RedCost : 0; 6335 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6336 // Matched reduce.add(mul()) 6337 InstructionCost MulCost = 6338 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6339 6340 InstructionCost RedCost = TTI.getMulAccReductionCost( 6341 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 6342 6343 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6344 return I == RetI ? RedCost : 0; 6345 } 6346 } 6347 6348 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 6349 } 6350 6351 InstructionCost 6352 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6353 ElementCount VF) { 6354 // Calculate scalar cost only. Vectorization cost should be ready at this 6355 // moment. 6356 if (VF.isScalar()) { 6357 Type *ValTy = getLoadStoreType(I); 6358 const Align Alignment = getLoadStoreAlignment(I); 6359 unsigned AS = getLoadStoreAddressSpace(I); 6360 6361 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6362 return TTI.getAddressComputationCost(ValTy) + 6363 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6364 TTI::TCK_RecipThroughput, OpInfo, I); 6365 } 6366 return getWideningCost(I, VF); 6367 } 6368 6369 LoopVectorizationCostModel::VectorizationCostTy 6370 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6371 ElementCount VF) { 6372 // If we know that this instruction will remain uniform, check the cost of 6373 // the scalar version. 6374 if (isUniformAfterVectorization(I, VF)) 6375 VF = ElementCount::getFixed(1); 6376 6377 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6378 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6379 6380 // Forced scalars do not have any scalarization overhead. 6381 auto ForcedScalar = ForcedScalars.find(VF); 6382 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6383 auto InstSet = ForcedScalar->second; 6384 if (InstSet.count(I)) 6385 return VectorizationCostTy( 6386 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6387 VF.getKnownMinValue()), 6388 false); 6389 } 6390 6391 Type *VectorTy; 6392 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6393 6394 bool TypeNotScalarized = false; 6395 if (VF.isVector() && VectorTy->isVectorTy()) { 6396 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6397 if (VF.isScalable()) 6398 // <vscale x 1 x iN> is assumed to be profitable over iN because 6399 // scalable registers are a distinct register class from scalar ones. 6400 // If we ever find a target which wants to lower scalable vectors 6401 // back to scalars, we'll need to update this code to explicitly 6402 // ask TTI about the register class uses for each part. 6403 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6404 else 6405 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6406 } else 6407 C = InstructionCost::getInvalid(); 6408 } 6409 return VectorizationCostTy(C, TypeNotScalarized); 6410 } 6411 6412 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 6413 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 6414 6415 // There is no mechanism yet to create a scalable scalarization loop, 6416 // so this is currently Invalid. 
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
        /*Insert*/ true,
        /*Extract*/ false, CostKind);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not
  // incur any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), Tys, CostKind);
}

void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto isLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(I))
            return true;

          // A uniform store isn't necessarily uniform-by-parts
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(I);
          return TheLoop->isLoopInvariant(SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
            isLegalGatherOrScatter(&I, VF) ?
            getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost = isLegalToScalarize() ?
            getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();

        // Choose the better solution for the current VF. Note that invalid
        // costs compare as maximally large.
If both are invalid, we get 6508 // scalable invalid which signals a failure and a vectorization abort. 6509 if (GatherScatterCost < ScalarizationCost) 6510 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 6511 else 6512 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 6513 continue; 6514 } 6515 6516 // We assume that widening is the best solution when possible. 6517 if (memoryInstructionCanBeWidened(&I, VF)) { 6518 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6519 int ConsecutiveStride = Legal->isConsecutivePtr( 6520 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6521 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6522 "Expected consecutive stride."); 6523 InstWidening Decision = 6524 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6525 setWideningDecision(&I, VF, Decision, Cost); 6526 continue; 6527 } 6528 6529 // Choose between Interleaving, Gather/Scatter or Scalarization. 6530 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6531 unsigned NumAccesses = 1; 6532 if (isAccessInterleaved(&I)) { 6533 auto Group = getInterleavedAccessGroup(&I); 6534 assert(Group && "Fail to get an interleaved access group."); 6535 6536 // Make one decision for the whole group. 6537 if (getWideningDecision(&I, VF) != CM_Unknown) 6538 continue; 6539 6540 NumAccesses = Group->getNumMembers(); 6541 if (interleavedAccessCanBeWidened(&I, VF)) 6542 InterleaveCost = getInterleaveGroupCost(&I, VF); 6543 } 6544 6545 InstructionCost GatherScatterCost = 6546 isLegalGatherOrScatter(&I, VF) 6547 ? getGatherScatterCost(&I, VF) * NumAccesses 6548 : InstructionCost::getInvalid(); 6549 6550 InstructionCost ScalarizationCost = 6551 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6552 6553 // Choose better solution for the current VF, 6554 // write down this decision and use it during vectorization. 6555 InstructionCost Cost; 6556 InstWidening Decision; 6557 if (InterleaveCost <= GatherScatterCost && 6558 InterleaveCost < ScalarizationCost) { 6559 Decision = CM_Interleave; 6560 Cost = InterleaveCost; 6561 } else if (GatherScatterCost < ScalarizationCost) { 6562 Decision = CM_GatherScatter; 6563 Cost = GatherScatterCost; 6564 } else { 6565 Decision = CM_Scalarize; 6566 Cost = ScalarizationCost; 6567 } 6568 // If the instructions belongs to an interleave group, the whole group 6569 // receives the same decision. The whole group receives the cost, but 6570 // the cost will actually be assigned to one instruction. 6571 if (auto Group = getInterleavedAccessGroup(&I)) 6572 setWideningDecision(Group, VF, Decision, Cost); 6573 else 6574 setWideningDecision(&I, VF, Decision, Cost); 6575 } 6576 } 6577 6578 // Make sure that any load of address and any other address computation 6579 // remains scalar unless there is gather/scatter support. This avoids 6580 // inevitable extracts into address registers, and also has the benefit of 6581 // activating LSR more, since that pass can't optimize vectorized 6582 // addresses. 6583 if (TTI.prefersVectorizedAddressing()) 6584 return; 6585 6586 // Start with all scalar pointer uses. 6587 SmallPtrSet<Instruction *, 8> AddrDefs; 6588 for (BasicBlock *BB : TheLoop->blocks()) 6589 for (Instruction &I : *BB) { 6590 Instruction *PtrDef = 6591 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6592 if (PtrDef && TheLoop->contains(PtrDef) && 6593 getWideningDecision(&I, VF) != CM_GatherScatter) 6594 AddrDefs.insert(PtrDef); 6595 } 6596 6597 // Add all instructions used to generate the addresses. 
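  // Example (illustrative): for a pointer "%gep = getelementptr ..., %base,
  // %idx" feeding a scalarized load, %gep is already in AddrDefs; the
  // worklist walk below also pulls in same-block, non-PHI producers of
  // %base and %idx so that the whole address computation stays scalar.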
6598 SmallVector<Instruction *, 4> Worklist; 6599 append_range(Worklist, AddrDefs); 6600 while (!Worklist.empty()) { 6601 Instruction *I = Worklist.pop_back_val(); 6602 for (auto &Op : I->operands()) 6603 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6604 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6605 AddrDefs.insert(InstOp).second) 6606 Worklist.push_back(InstOp); 6607 } 6608 6609 for (auto *I : AddrDefs) { 6610 if (isa<LoadInst>(I)) { 6611 // Setting the desired widening decision should ideally be handled in 6612 // by cost functions, but since this involves the task of finding out 6613 // if the loaded register is involved in an address computation, it is 6614 // instead changed here when we know this is the case. 6615 InstWidening Decision = getWideningDecision(I, VF); 6616 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6617 // Scalarize a widened load of address. 6618 setWideningDecision( 6619 I, VF, CM_Scalarize, 6620 (VF.getKnownMinValue() * 6621 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6622 else if (auto Group = getInterleavedAccessGroup(I)) { 6623 // Scalarize an interleave group of address loads. 6624 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6625 if (Instruction *Member = Group->getMember(I)) 6626 setWideningDecision( 6627 Member, VF, CM_Scalarize, 6628 (VF.getKnownMinValue() * 6629 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6630 } 6631 } 6632 } else 6633 // Make sure I gets scalarized and a cost estimate without 6634 // scalarization overhead. 6635 ForcedScalars[VF].insert(I); 6636 } 6637 } 6638 6639 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { 6640 assert(!VF.isScalar() && 6641 "Trying to set a vectorization decision for a scalar VF"); 6642 6643 for (BasicBlock *BB : TheLoop->blocks()) { 6644 // For each instruction in the old loop. 6645 for (Instruction &I : *BB) { 6646 CallInst *CI = dyn_cast<CallInst>(&I); 6647 6648 if (!CI) 6649 continue; 6650 6651 InstructionCost ScalarCost = InstructionCost::getInvalid(); 6652 InstructionCost VectorCost = InstructionCost::getInvalid(); 6653 InstructionCost IntrinsicCost = InstructionCost::getInvalid(); 6654 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6655 6656 Function *ScalarFunc = CI->getCalledFunction(); 6657 Type *ScalarRetTy = CI->getType(); 6658 SmallVector<Type *, 4> Tys, ScalarTys; 6659 bool MaskRequired = Legal->isMaskRequired(CI); 6660 for (auto &ArgOp : CI->args()) 6661 ScalarTys.push_back(ArgOp->getType()); 6662 6663 // Compute corresponding vector type for return value and arguments. 6664 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 6665 for (Type *ScalarTy : ScalarTys) 6666 Tys.push_back(ToVectorTy(ScalarTy, VF)); 6667 6668 // An in-loop reduction using an fmuladd intrinsic is a special case; 6669 // we don't want the normal cost for that intrinsic. 6670 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 6671 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) { 6672 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, 6673 getVectorIntrinsicIDForCall(CI, TLI), 6674 std::nullopt, *RedCost); 6675 continue; 6676 } 6677 6678 // Estimate cost of scalarized vector call. The source operands are 6679 // assumed to be vectors, so we need to extract individual elements from 6680 // there, execute VF scalar calls, and then gather the result into the 6681 // vector return value. 
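      // Illustrative arithmetic (hypothetical costs): with VF = 4, a scalar
      // call cost of 10 and a scalarization overhead of 6, the estimate
      // below is 4 * 10 + 6 = 46. Real numbers come from TTI; this only
      // shows how the two terms combine.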
6682 InstructionCost ScalarCallCost = 6683 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); 6684 6685 // Compute costs of unpacking argument values for the scalar calls and 6686 // packing the return values to a vector. 6687 InstructionCost ScalarizationCost = 6688 getScalarizationOverhead(CI, VF, CostKind); 6689 6690 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 6691 6692 // Find the cost of vectorizing the call, if we can find a suitable 6693 // vector variant of the function. 6694 bool UsesMask = false; 6695 VFInfo FuncInfo; 6696 Function *VecFunc = nullptr; 6697 // Search through any available variants for one we can use at this VF. 6698 for (VFInfo &Info : VFDatabase::getMappings(*CI)) { 6699 // Must match requested VF. 6700 if (Info.Shape.VF != VF) 6701 continue; 6702 6703 // Must take a mask argument if one is required 6704 if (MaskRequired && !Info.isMasked()) 6705 continue; 6706 6707 // Check that all parameter kinds are supported 6708 bool ParamsOk = true; 6709 for (VFParameter Param : Info.Shape.Parameters) { 6710 switch (Param.ParamKind) { 6711 case VFParamKind::Vector: 6712 break; 6713 case VFParamKind::OMP_Uniform: { 6714 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6715 // Make sure the scalar parameter in the loop is invariant. 6716 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), 6717 TheLoop)) 6718 ParamsOk = false; 6719 break; 6720 } 6721 case VFParamKind::OMP_Linear: { 6722 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6723 // Find the stride for the scalar parameter in this loop and see if 6724 // it matches the stride for the variant. 6725 // TODO: do we need to figure out the cost of an extract to get the 6726 // first lane? Or do we hope that it will be folded away? 6727 ScalarEvolution *SE = PSE.getSE(); 6728 const auto *SAR = 6729 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); 6730 6731 if (!SAR || SAR->getLoop() != TheLoop) { 6732 ParamsOk = false; 6733 break; 6734 } 6735 6736 const SCEVConstant *Step = 6737 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); 6738 6739 if (!Step || 6740 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) 6741 ParamsOk = false; 6742 6743 break; 6744 } 6745 case VFParamKind::GlobalPredicate: 6746 UsesMask = true; 6747 break; 6748 default: 6749 ParamsOk = false; 6750 break; 6751 } 6752 } 6753 6754 if (!ParamsOk) 6755 continue; 6756 6757 // Found a suitable candidate, stop here. 6758 VecFunc = CI->getModule()->getFunction(Info.VectorName); 6759 FuncInfo = Info; 6760 break; 6761 } 6762 6763 // Add in the cost of synthesizing a mask if one wasn't required. 6764 InstructionCost MaskCost = 0; 6765 if (VecFunc && UsesMask && !MaskRequired) 6766 MaskCost = TTI.getShuffleCost( 6767 TargetTransformInfo::SK_Broadcast, 6768 VectorType::get(IntegerType::getInt1Ty( 6769 VecFunc->getFunctionType()->getContext()), 6770 VF)); 6771 6772 if (TLI && VecFunc && !CI->isNoBuiltin()) 6773 VectorCost = 6774 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 6775 6776 // Find the cost of an intrinsic; some targets may have instructions that 6777 // perform the operation without needing an actual call. 
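      // For example, a call that maps to llvm.fabs.f32 can usually be
      // widened to a plain vector floating-point-abs operation instead of a
      // libm call, so the intrinsic cost computed below can undercut both
      // the scalarized and the vector-library estimates. (Example chosen
      // for illustration; the decision below is purely cost-driven.)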
6778 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); 6779 if (IID != Intrinsic::not_intrinsic) 6780 IntrinsicCost = getVectorIntrinsicCost(CI, VF); 6781 6782 InstructionCost Cost = ScalarCost; 6783 InstWidening Decision = CM_Scalarize; 6784 6785 if (VectorCost <= Cost) { 6786 Cost = VectorCost; 6787 Decision = CM_VectorCall; 6788 } 6789 6790 if (IntrinsicCost <= Cost) { 6791 Cost = IntrinsicCost; 6792 Decision = CM_IntrinsicCall; 6793 } 6794 6795 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, 6796 FuncInfo.getParamIndexForOptionalMask(), Cost); 6797 } 6798 } 6799 } 6800 6801 InstructionCost 6802 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 6803 Type *&VectorTy) { 6804 Type *RetTy = I->getType(); 6805 if (canTruncateToMinimalBitwidth(I, VF)) 6806 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6807 auto SE = PSE.getSE(); 6808 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6809 6810 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 6811 ElementCount VF) -> bool { 6812 if (VF.isScalar()) 6813 return true; 6814 6815 auto Scalarized = InstsToScalarize.find(VF); 6816 assert(Scalarized != InstsToScalarize.end() && 6817 "VF not yet analyzed for scalarization profitability"); 6818 return !Scalarized->second.count(I) && 6819 llvm::all_of(I->users(), [&](User *U) { 6820 auto *UI = cast<Instruction>(U); 6821 return !Scalarized->second.count(UI); 6822 }); 6823 }; 6824 (void) hasSingleCopyAfterVectorization; 6825 6826 if (isScalarAfterVectorization(I, VF)) { 6827 // With the exception of GEPs and PHIs, after scalarization there should 6828 // only be one copy of the instruction generated in the loop. This is 6829 // because the VF is either 1, or any instructions that need scalarizing 6830 // have already been dealt with by the time we get here. As a result, 6831 // it means we don't have to multiply the instruction cost by VF. 6832 assert(I->getOpcode() == Instruction::GetElementPtr || 6833 I->getOpcode() == Instruction::PHI || 6834 (I->getOpcode() == Instruction::BitCast && 6835 I->getType()->isPointerTy()) || 6836 hasSingleCopyAfterVectorization(I, VF)); 6837 VectorTy = RetTy; 6838 } else 6839 VectorTy = ToVectorTy(RetTy, VF); 6840 6841 // TODO: We need to estimate the cost of intrinsic calls. 6842 switch (I->getOpcode()) { 6843 case Instruction::GetElementPtr: 6844 // We mark this instruction as zero-cost because the cost of GEPs in 6845 // vectorized code depends on whether the corresponding memory instruction 6846 // is scalarized or not. Therefore, we handle GEPs with the memory 6847 // instruction cost. 6848 return 0; 6849 case Instruction::Br: { 6850 // In cases of scalarized and predicated instructions, there will be VF 6851 // predicated blocks in the vectorized loop. Each branch around these 6852 // blocks requires also an extract of its vector compare i1 element. 6853 bool ScalarPredicatedBB = false; 6854 BranchInst *BI = cast<BranchInst>(I); 6855 if (VF.isVector() && BI->isConditional() && 6856 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 6857 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) 6858 ScalarPredicatedBB = true; 6859 6860 if (ScalarPredicatedBB) { 6861 // Not possible to scalarize scalable vector with predicated instructions. 6862 if (VF.isScalable()) 6863 return InstructionCost::getInvalid(); 6864 // Return cost for branches around scalarized and predicated blocks. 
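      // Informally, the cost returned below is
      //   (cost of extracting one i1 per lane from the vector compare)
      //   + VF * (cost of a scalar branch),
      // e.g. for a fixed VF of 4, one 4-element i1 extraction overhead plus
      // four branches. This restates the expression that follows.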
6865 auto *Vec_i1Ty = 6866 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6867 return ( 6868 TTI.getScalarizationOverhead( 6869 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), 6870 /*Insert*/ false, /*Extract*/ true, CostKind) + 6871 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6872 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6873 // The back-edge branch will remain, as will all scalar branches. 6874 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6875 else 6876 // This branch will be eliminated by if-conversion. 6877 return 0; 6878 // Note: We currently assume zero cost for an unconditional branch inside 6879 // a predicated block since it will become a fall-through, although we 6880 // may decide in the future to call TTI for all branches. 6881 } 6882 case Instruction::PHI: { 6883 auto *Phi = cast<PHINode>(I); 6884 6885 // First-order recurrences are replaced by vector shuffles inside the loop. 6886 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 6887 SmallVector<int> Mask(VF.getKnownMinValue()); 6888 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 6889 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 6890 cast<VectorType>(VectorTy), Mask, CostKind, 6891 VF.getKnownMinValue() - 1); 6892 } 6893 6894 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6895 // converted into select instructions. We require N - 1 selects per phi 6896 // node, where N is the number of incoming values. 6897 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6898 return (Phi->getNumIncomingValues() - 1) * 6899 TTI.getCmpSelInstrCost( 6900 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6901 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6902 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6903 6904 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6905 } 6906 case Instruction::UDiv: 6907 case Instruction::SDiv: 6908 case Instruction::URem: 6909 case Instruction::SRem: 6910 if (VF.isVector() && isPredicatedInst(I)) { 6911 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 6912 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 6913 ScalarCost : SafeDivisorCost; 6914 } 6915 // We've proven all lanes safe to speculate, fall through. 6916 [[fallthrough]]; 6917 case Instruction::Add: 6918 case Instruction::FAdd: 6919 case Instruction::Sub: 6920 case Instruction::FSub: 6921 case Instruction::Mul: 6922 case Instruction::FMul: 6923 case Instruction::FDiv: 6924 case Instruction::FRem: 6925 case Instruction::Shl: 6926 case Instruction::LShr: 6927 case Instruction::AShr: 6928 case Instruction::And: 6929 case Instruction::Or: 6930 case Instruction::Xor: { 6931 // If we're speculating on the stride being 1, the multiplication may 6932 // fold away. We can generalize this for all operations using the notion 6933 // of neutral elements. (TODO) 6934 if (I->getOpcode() == Instruction::Mul && 6935 (PSE.getSCEV(I->getOperand(0))->isOne() || 6936 PSE.getSCEV(I->getOperand(1))->isOne())) 6937 return 0; 6938 6939 // Detect reduction patterns 6940 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 6941 return *RedCost; 6942 6943 // Certain instructions can be cheaper to vectorize if they have a constant 6944 // second vector operand. One example of this are shifts on x86. 
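    // E.g. a shift whose amount is the same for every lane and every
    // iteration is often cheaper than a shift by a loop-varying vector, so
    // the operand-info kind is upgraded to OK_UniformValue below when the
    // second operand is loop invariant.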
6945 Value *Op2 = I->getOperand(1); 6946 auto Op2Info = TTI.getOperandInfo(Op2); 6947 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 6948 Legal->isInvariant(Op2)) 6949 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 6950 6951 SmallVector<const Value *, 4> Operands(I->operand_values()); 6952 auto InstrCost = TTI.getArithmeticInstrCost( 6953 I->getOpcode(), VectorTy, CostKind, 6954 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6955 Op2Info, Operands, I); 6956 6957 // Some targets can replace frem with vector library calls. 6958 InstructionCost VecCallCost = InstructionCost::getInvalid(); 6959 if (I->getOpcode() == Instruction::FRem) { 6960 LibFunc Func; 6961 if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) && 6962 TLI->isFunctionVectorizable(TLI->getName(Func), VF)) { 6963 SmallVector<Type *, 4> OpTypes; 6964 for (auto &Op : I->operands()) 6965 OpTypes.push_back(Op->getType()); 6966 VecCallCost = 6967 TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind); 6968 } 6969 } 6970 return std::min(InstrCost, VecCallCost); 6971 } 6972 case Instruction::FNeg: { 6973 return TTI.getArithmeticInstrCost( 6974 I->getOpcode(), VectorTy, CostKind, 6975 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6976 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6977 I->getOperand(0), I); 6978 } 6979 case Instruction::Select: { 6980 SelectInst *SI = cast<SelectInst>(I); 6981 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6982 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6983 6984 const Value *Op0, *Op1; 6985 using namespace llvm::PatternMatch; 6986 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 6987 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 6988 // select x, y, false --> x & y 6989 // select x, true, y --> x | y 6990 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 6991 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 6992 assert(Op0->getType()->getScalarSizeInBits() == 1 && 6993 Op1->getType()->getScalarSizeInBits() == 1); 6994 6995 SmallVector<const Value *, 2> Operands{Op0, Op1}; 6996 return TTI.getArithmeticInstrCost( 6997 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 6998 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 6999 } 7000 7001 Type *CondTy = SI->getCondition()->getType(); 7002 if (!ScalarCond) 7003 CondTy = VectorType::get(CondTy, VF); 7004 7005 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7006 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7007 Pred = Cmp->getPredicate(); 7008 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7009 CostKind, I); 7010 } 7011 case Instruction::ICmp: 7012 case Instruction::FCmp: { 7013 Type *ValTy = I->getOperand(0)->getType(); 7014 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7015 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7016 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7017 VectorTy = ToVectorTy(ValTy, VF); 7018 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7019 cast<CmpInst>(I)->getPredicate(), CostKind, 7020 I); 7021 } 7022 case Instruction::Store: 7023 case Instruction::Load: { 7024 ElementCount Width = VF; 7025 if (Width.isVector()) { 7026 InstWidening Decision = getWideningDecision(I, Width); 7027 assert(Decision != CM_Unknown && 7028 "CM decision should be taken at this point"); 7029 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 7030 return InstructionCost::getInvalid(); 7031 if (Decision == CM_Scalarize) 7032 Width = ElementCount::getFixed(1); 7033 } 7034 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7035 return getMemoryInstructionCost(I, VF); 7036 } 7037 case Instruction::BitCast: 7038 if (I->getType()->isPointerTy()) 7039 return 0; 7040 [[fallthrough]]; 7041 case Instruction::ZExt: 7042 case Instruction::SExt: 7043 case Instruction::FPToUI: 7044 case Instruction::FPToSI: 7045 case Instruction::FPExt: 7046 case Instruction::PtrToInt: 7047 case Instruction::IntToPtr: 7048 case Instruction::SIToFP: 7049 case Instruction::UIToFP: 7050 case Instruction::Trunc: 7051 case Instruction::FPTrunc: { 7052 // Computes the CastContextHint from a Load/Store instruction. 7053 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7054 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7055 "Expected a load or a store!"); 7056 7057 if (VF.isScalar() || !TheLoop->contains(I)) 7058 return TTI::CastContextHint::Normal; 7059 7060 switch (getWideningDecision(I, VF)) { 7061 case LoopVectorizationCostModel::CM_GatherScatter: 7062 return TTI::CastContextHint::GatherScatter; 7063 case LoopVectorizationCostModel::CM_Interleave: 7064 return TTI::CastContextHint::Interleave; 7065 case LoopVectorizationCostModel::CM_Scalarize: 7066 case LoopVectorizationCostModel::CM_Widen: 7067 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7068 : TTI::CastContextHint::Normal; 7069 case LoopVectorizationCostModel::CM_Widen_Reverse: 7070 return TTI::CastContextHint::Reversed; 7071 case LoopVectorizationCostModel::CM_Unknown: 7072 llvm_unreachable("Instr did not go through cost modelling?"); 7073 case LoopVectorizationCostModel::CM_VectorCall: 7074 case LoopVectorizationCostModel::CM_IntrinsicCall: 7075 llvm_unreachable_internal("Instr has invalid widening decision"); 7076 } 7077 7078 llvm_unreachable("Unhandled case!"); 7079 }; 7080 7081 unsigned Opcode = I->getOpcode(); 7082 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7083 // For Trunc, the context is the only user, which must be a StoreInst. 
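    // Illustrative example (hypothetical IR): for
    //   %t = trunc i32 %x to i16
    //   store i16 %t, ptr %p
    // the hint is taken from the store's widening decision, e.g.
    // CastContextHint::Masked if the store will be emitted as a masked access.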
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
    }

    // Detect reduction patterns.
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (Opcode == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
        // Leave SrcVecTy unchanged - we only shrink the destination element
        // type.
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    return getVectorCallCost(cast<CallInst>(I), VF);
  case Instruction::ExtractValue:
    return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    [[fallthrough]];
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  } // end of switch.
}

void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Find all stores to invariant variables. Since they are going to sink
  // outside the loop we do not need to calculate the cost for them.
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      StoreInst *SI;
      if ((SI = dyn_cast<StoreInst>(&I)) &&
          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
        ValuesToIgnore.insert(&I);
    }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
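  // For example (illustrative, not taken from a specific test): if an i8
  // add-reduction is recognised as being performed in a narrower type than its
  // phi, the zext/sext casts recorded by the RecurrenceDescriptor disappear
  // after vectorization, so they are skipped when costing the vector loop.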
7165 for (const auto &Reduction : Legal->getReductionVars()) { 7166 const RecurrenceDescriptor &RedDes = Reduction.second; 7167 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7168 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7169 } 7170 // Ignore type-casting instructions we identified during induction 7171 // detection. 7172 for (const auto &Induction : Legal->getInductionVars()) { 7173 const InductionDescriptor &IndDes = Induction.second; 7174 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7175 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7176 } 7177 } 7178 7179 void LoopVectorizationCostModel::collectInLoopReductions() { 7180 for (const auto &Reduction : Legal->getReductionVars()) { 7181 PHINode *Phi = Reduction.first; 7182 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7183 7184 // We don't collect reductions that are type promoted (yet). 7185 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7186 continue; 7187 7188 // If the target would prefer this reduction to happen "in-loop", then we 7189 // want to record it as such. 7190 unsigned Opcode = RdxDesc.getOpcode(); 7191 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7192 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7193 TargetTransformInfo::ReductionFlags())) 7194 continue; 7195 7196 // Check that we can correctly put the reductions into the loop, by 7197 // finding the chain of operations that leads from the phi to the loop 7198 // exit value. 7199 SmallVector<Instruction *, 4> ReductionOperations = 7200 RdxDesc.getReductionOpChain(Phi, TheLoop); 7201 bool InLoop = !ReductionOperations.empty(); 7202 7203 if (InLoop) { 7204 InLoopReductions.insert(Phi); 7205 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7206 Instruction *LastChain = Phi; 7207 for (auto *I : ReductionOperations) { 7208 InLoopReductionImmediateChains[I] = LastChain; 7209 LastChain = I; 7210 } 7211 } 7212 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7213 << " reduction for phi: " << *Phi << "\n"); 7214 } 7215 } 7216 7217 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, 7218 DebugLoc DL, const Twine &Name) { 7219 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && 7220 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); 7221 return tryInsertInstruction( 7222 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name)); 7223 } 7224 7225 // This function will select a scalable VF if the target supports scalable 7226 // vectors and a fixed one otherwise. 7227 // TODO: we could return a pair of values that specify the max VF and 7228 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7229 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7230 // doesn't have a cost model that can choose which plan to execute if 7231 // more than one is generated. 7232 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, 7233 LoopVectorizationCostModel &CM) { 7234 unsigned WidestType; 7235 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7236 7237 TargetTransformInfo::RegisterKind RegKind = 7238 TTI.enableScalableVectorization() 7239 ? 
              TargetTransformInfo::RGK_ScalableVector
              : TargetTransformInfo::RGK_FixedWidthVector;

  TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
  unsigned N = RegSize.getKnownMinValue() / WidestType;
  return ElementCount::get(N, RegSize.isScalable());
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: they may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
      reportVectorizationFailure(
          "Scalable vectorization requested but not supported by the target",
          "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          "ScalableVFUnfeasible", ORE, OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
    return std::nullopt;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
7310 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7311 !useMaskedInterleavedAccesses(TTI)) { 7312 LLVM_DEBUG( 7313 dbgs() 7314 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7315 "which requires masked-interleaved support.\n"); 7316 if (CM.InterleaveInfo.invalidateGroups()) 7317 // Invalidating interleave groups also requires invalidating all decisions 7318 // based on them, which includes widening decisions and uniform and scalar 7319 // values. 7320 CM.invalidateCostModelingDecisions(); 7321 } 7322 7323 ElementCount MaxUserVF = 7324 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7325 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7326 if (!UserVF.isZero() && UserVFIsLegal) { 7327 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7328 "VF needs to be a power of two"); 7329 // Collect the instructions (and their associated costs) that will be more 7330 // profitable to scalarize. 7331 CM.collectInLoopReductions(); 7332 if (CM.selectUserVectorizationFactor(UserVF)) { 7333 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7334 buildVPlansWithVPRecipes(UserVF, UserVF); 7335 if (!hasPlanWithVF(UserVF)) { 7336 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF 7337 << ".\n"); 7338 return std::nullopt; 7339 } 7340 7341 LLVM_DEBUG(printPlans(dbgs())); 7342 return {{UserVF, 0, 0}}; 7343 } else 7344 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7345 "InvalidCost", ORE, OrigLoop); 7346 } 7347 7348 // Populate the set of Vectorization Factor Candidates. 7349 ElementCountSet VFCandidates; 7350 for (auto VF = ElementCount::getFixed(1); 7351 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7352 VFCandidates.insert(VF); 7353 for (auto VF = ElementCount::getScalable(1); 7354 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7355 VFCandidates.insert(VF); 7356 7357 CM.collectInLoopReductions(); 7358 for (const auto &VF : VFCandidates) { 7359 // Collect Uniform and Scalar instructions after vectorization with VF. 7360 CM.collectUniformsAndScalars(VF); 7361 7362 // Collect the instructions (and their associated costs) that will be more 7363 // profitable to scalarize. 7364 if (VF.isVector()) 7365 CM.collectInstsToScalarize(VF); 7366 } 7367 7368 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7369 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7370 7371 LLVM_DEBUG(printPlans(dbgs())); 7372 if (!MaxFactors.hasVector()) 7373 return VectorizationFactor::Disabled(); 7374 7375 // Select the optimal vectorization factor. 
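  // For example (illustrative): with MaxFactors = {8, vscale x 4} the
  // candidate set built above is {1, 2, 4, 8} plus
  // {vscale x 1, vscale x 2, vscale x 4}, and selectVectorizationFactor picks
  // the most profitable of those candidates below.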
7376 VectorizationFactor VF = selectVectorizationFactor(VFCandidates); 7377 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7378 if (!hasPlanWithVF(VF.Width)) { 7379 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width 7380 << ".\n"); 7381 return std::nullopt; 7382 } 7383 return VF; 7384 } 7385 7386 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7387 assert(count_if(VPlans, 7388 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7389 1 && 7390 "Best VF has not a single VPlan."); 7391 7392 for (const VPlanPtr &Plan : VPlans) { 7393 if (Plan->hasVF(VF)) 7394 return *Plan.get(); 7395 } 7396 llvm_unreachable("No plan found!"); 7397 } 7398 7399 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7400 SmallVector<Metadata *, 4> MDs; 7401 // Reserve first location for self reference to the LoopID metadata node. 7402 MDs.push_back(nullptr); 7403 bool IsUnrollMetadata = false; 7404 MDNode *LoopID = L->getLoopID(); 7405 if (LoopID) { 7406 // First find existing loop unrolling disable metadata. 7407 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7408 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7409 if (MD) { 7410 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7411 IsUnrollMetadata = 7412 S && S->getString().starts_with("llvm.loop.unroll.disable"); 7413 } 7414 MDs.push_back(LoopID->getOperand(i)); 7415 } 7416 } 7417 7418 if (!IsUnrollMetadata) { 7419 // Add runtime unroll disable metadata. 7420 LLVMContext &Context = L->getHeader()->getContext(); 7421 SmallVector<Metadata *, 1> DisableOperands; 7422 DisableOperands.push_back( 7423 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7424 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7425 MDs.push_back(DisableNode); 7426 MDNode *NewLoopID = MDNode::get(Context, MDs); 7427 // Set operand 0 to refer to the loop id itself. 7428 NewLoopID->replaceOperandWith(0, NewLoopID); 7429 L->setLoopID(NewLoopID); 7430 } 7431 } 7432 7433 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is 7434 // create a merge phi node for it and add it to \p ReductionResumeValues. 7435 static void createAndCollectMergePhiForReduction( 7436 VPInstruction *RedResult, 7437 DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues, 7438 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) { 7439 if (!RedResult || 7440 RedResult->getOpcode() != VPInstruction::ComputeReductionResult) 7441 return; 7442 7443 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0)); 7444 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 7445 7446 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 7447 Value *FinalValue = 7448 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); 7449 auto *ResumePhi = 7450 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 7451 7452 // TODO: bc.merge.rdx should not be created here, instead it should be 7453 // modeled in VPlan. 7454 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); 7455 // Create a phi node that merges control-flow from the backedge-taken check 7456 // block and the middle block. 
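  // Illustrative shape of the merge phi created below (names are examples
  // only):
  //   %bc.merge.rdx = phi i32 [ %rdx.start, %bypass.block ],
  //                           [ %final.rdx, %middle.block ]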
7457 auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", 7458 LoopScalarPreHeader->getTerminator()); 7459 7460 // If we are fixing reductions in the epilogue loop then we should already 7461 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 7462 // we carry over the incoming values correctly. 7463 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 7464 if (Incoming == LoopMiddleBlock) 7465 BCBlockPhi->addIncoming(FinalValue, Incoming); 7466 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) 7467 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 7468 Incoming); 7469 else 7470 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 7471 } 7472 7473 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 7474 // TODO: This fixup should instead be modeled in VPlan. 7475 // Fix the scalar loop reduction variable with the incoming reduction sum 7476 // from the vector body and from the backedge value. 7477 int IncomingEdgeBlockIdx = 7478 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 7479 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 7480 // Pick the other block. 7481 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 7482 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 7483 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 7484 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 7485 7486 ReductionResumeValues[&RdxDesc] = BCBlockPhi; 7487 } 7488 7489 std::pair<DenseMap<const SCEV *, Value *>, 7490 DenseMap<const RecurrenceDescriptor *, Value *>> 7491 LoopVectorizationPlanner::executePlan( 7492 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7493 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, 7494 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7495 assert(BestVPlan.hasVF(BestVF) && 7496 "Trying to execute plan with unsupported VF"); 7497 assert(BestVPlan.hasUF(BestUF) && 7498 "Trying to execute plan with unsupported UF"); 7499 assert( 7500 (IsEpilogueVectorization || !ExpandedSCEVs) && 7501 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7502 7503 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7504 << '\n'); 7505 7506 if (!IsEpilogueVectorization) 7507 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7508 7509 // Perform the actual loop transformation. 7510 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, 7511 OrigLoop->getHeader()->getContext()); 7512 7513 // 0. Generate SCEV-dependent code into the preheader, including TripCount, 7514 // before making any changes to the CFG. 7515 if (!BestVPlan.getPreheader()->empty()) { 7516 State.CFG.PrevBB = OrigLoop->getLoopPreheader(); 7517 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); 7518 BestVPlan.getPreheader()->execute(&State); 7519 } 7520 if (!ILV.getTripCount()) 7521 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); 7522 else 7523 assert(IsEpilogueVectorization && "should only re-use the existing trip " 7524 "count during epilogue vectorization"); 7525 7526 // 1. Set up the skeleton for vectorization, including vector pre-header and 7527 // middle block. The vector loop is created during VPlan execution. 7528 Value *CanonicalIVStartValue; 7529 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7530 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? 
*ExpandedSCEVs 7531 : State.ExpandedSCEVs); 7532 7533 // Only use noalias metadata when using memory checks guaranteeing no overlap 7534 // across all iterations. 7535 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7536 std::unique_ptr<LoopVersioning> LVer = nullptr; 7537 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7538 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7539 7540 // We currently don't use LoopVersioning for the actual loop cloning but we 7541 // still use it to add the noalias metadata. 7542 // TODO: Find a better way to re-use LoopVersioning functionality to add 7543 // metadata. 7544 LVer = std::make_unique<LoopVersioning>( 7545 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7546 PSE.getSE()); 7547 State.LVer = &*LVer; 7548 State.LVer->prepareNoAliasMetadata(); 7549 } 7550 7551 ILV.collectPoisonGeneratingRecipes(State); 7552 7553 ILV.printDebugTracesAtStart(); 7554 7555 //===------------------------------------------------===// 7556 // 7557 // Notice: any optimization or new instruction that go 7558 // into the code below should also be implemented in 7559 // the cost-model. 7560 // 7561 //===------------------------------------------------===// 7562 7563 // 2. Copy and widen instructions from the old loop into the new loop. 7564 BestVPlan.prepareToExecute(ILV.getTripCount(), 7565 ILV.getOrCreateVectorTripCount(nullptr), 7566 CanonicalIVStartValue, State); 7567 7568 BestVPlan.execute(&State); 7569 7570 // 2.5 Collect reduction resume values. 7571 DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues; 7572 auto *ExitVPBB = 7573 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); 7574 for (VPRecipeBase &R : *ExitVPBB) { 7575 createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R), 7576 ReductionResumeValues, State, OrigLoop, 7577 State.CFG.VPBB2IRBB[ExitVPBB]); 7578 } 7579 7580 // 2.6. Maintain Loop Hints 7581 // Keep all loop hints from the original loop on the vector loop (we'll 7582 // replace the vectorizer-specific hints below). 7583 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7584 7585 std::optional<MDNode *> VectorizedLoopID = 7586 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7587 LLVMLoopVectorizeFollowupVectorized}); 7588 7589 VPBasicBlock *HeaderVPBB = 7590 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7591 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7592 if (VectorizedLoopID) 7593 L->setLoopID(*VectorizedLoopID); 7594 else { 7595 // Keep all loop hints from the original loop on the vector loop (we'll 7596 // replace the vectorizer-specific hints below). 7597 if (MDNode *LID = OrigLoop->getLoopID()) 7598 L->setLoopID(LID); 7599 7600 LoopVectorizeHints Hints(L, true, *ORE); 7601 Hints.setAlreadyVectorized(); 7602 } 7603 TargetTransformInfo::UnrollingPreferences UP; 7604 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7605 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) 7606 AddRuntimeUnrollDisableMetaData(L); 7607 7608 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7609 // predication, updating analyses. 
7610 ILV.fixVectorizedLoop(State, BestVPlan); 7611 7612 ILV.printDebugTracesAtEnd(); 7613 7614 return {State.ExpandedSCEVs, ReductionResumeValues}; 7615 } 7616 7617 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7618 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7619 for (const auto &Plan : VPlans) 7620 if (PrintVPlansInDotFormat) 7621 Plan->printDOT(O); 7622 else 7623 Plan->print(O); 7624 } 7625 #endif 7626 7627 //===--------------------------------------------------------------------===// 7628 // EpilogueVectorizerMainLoop 7629 //===--------------------------------------------------------------------===// 7630 7631 /// This function is partially responsible for generating the control flow 7632 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7633 std::pair<BasicBlock *, Value *> 7634 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7635 const SCEV2ValueTy &ExpandedSCEVs) { 7636 createVectorLoopSkeleton(""); 7637 7638 // Generate the code to check the minimum iteration count of the vector 7639 // epilogue (see below). 7640 EPI.EpilogueIterationCountCheck = 7641 emitIterationCountCheck(LoopScalarPreHeader, true); 7642 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7643 7644 // Generate the code to check any assumptions that we've made for SCEV 7645 // expressions. 7646 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7647 7648 // Generate the code that checks at runtime if arrays overlap. We put the 7649 // checks into a separate block to make the more common case of few elements 7650 // faster. 7651 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7652 7653 // Generate the iteration count check for the main loop, *after* the check 7654 // for the epilogue loop, so that the path-length is shorter for the case 7655 // that goes directly through the vector epilogue. The longer-path length for 7656 // the main loop is compensated for, by the gain from vectorizing the larger 7657 // trip count. Note: the branch will get updated later on when we vectorize 7658 // the epilogue. 7659 EPI.MainLoopIterationCountCheck = 7660 emitIterationCountCheck(LoopScalarPreHeader, false); 7661 7662 // Generate the induction variable. 7663 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7664 7665 // Skip induction resume value creation here because they will be created in 7666 // the second pass for the scalar loop. The induction resume values for the 7667 // inductions in the epilogue loop are created before executing the plan for 7668 // the epilogue loop. 7669 7670 return {completeLoopSkeleton(), nullptr}; 7671 } 7672 7673 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7674 LLVM_DEBUG({ 7675 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7676 << "Main Loop VF:" << EPI.MainLoopVF 7677 << ", Main Loop UF:" << EPI.MainLoopUF 7678 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7679 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7680 }); 7681 } 7682 7683 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7684 DEBUG_WITH_TYPE(VerboseDebug, { 7685 dbgs() << "intermediate fn:\n" 7686 << *OrigLoop->getHeader()->getParent() << "\n"; 7687 }); 7688 } 7689 7690 BasicBlock * 7691 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7692 bool ForEpilogue) { 7693 assert(Bypass && "Expected valid bypass basic block."); 7694 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7695 unsigned UFactor = ForEpilogue ? 
EPI.EpilogueUF : UF; 7696 Value *Count = getTripCount(); 7697 // Reuse existing vector loop preheader for TC checks. 7698 // Note that new preheader block is generated for vector loop. 7699 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7700 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7701 7702 // Generate code to check if the loop's trip count is less than VF * UF of the 7703 // main vector loop. 7704 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7705 : VF.isVector()) 7706 ? ICmpInst::ICMP_ULE 7707 : ICmpInst::ICMP_ULT; 7708 7709 Value *CheckMinIters = Builder.CreateICmp( 7710 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7711 "min.iters.check"); 7712 7713 if (!ForEpilogue) 7714 TCCheckBlock->setName("vector.main.loop.iter.check"); 7715 7716 // Create new preheader for vector loop. 7717 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7718 DT, LI, nullptr, "vector.ph"); 7719 7720 if (ForEpilogue) { 7721 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7722 DT->getNode(Bypass)->getIDom()) && 7723 "TC check is expected to dominate Bypass"); 7724 7725 // Update dominator for Bypass & LoopExit. 7726 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7727 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7728 // For loops with multiple exits, there's no edge from the middle block 7729 // to exit blocks (as the epilogue must run) and thus no need to update 7730 // the immediate dominator of the exit blocks. 7731 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7732 7733 LoopBypassBlocks.push_back(TCCheckBlock); 7734 7735 // Save the trip count so we don't have to regenerate it in the 7736 // vec.epilog.iter.check. This is safe to do because the trip count 7737 // generated here dominates the vector epilog iter check. 7738 EPI.TripCount = Count; 7739 } 7740 7741 BranchInst &BI = 7742 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7743 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 7744 setBranchWeights(BI, MinItersBypassWeights); 7745 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 7746 7747 return TCCheckBlock; 7748 } 7749 7750 //===--------------------------------------------------------------------===// 7751 // EpilogueVectorizerEpilogueLoop 7752 //===--------------------------------------------------------------------===// 7753 7754 /// This function is partially responsible for generating the control flow 7755 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7756 std::pair<BasicBlock *, Value *> 7757 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( 7758 const SCEV2ValueTy &ExpandedSCEVs) { 7759 createVectorLoopSkeleton("vec.epilog."); 7760 7761 // Now, compare the remaining count and if there aren't enough iterations to 7762 // execute the vectorized epilogue skip to the scalar part. 7763 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7764 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7765 LoopVectorPreHeader = 7766 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7767 LI, nullptr, "vec.epilog.ph"); 7768 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7769 VecEpilogueIterationCountCheck); 7770 7771 // Adjust the control flow taking the state info from the main loop 7772 // vectorization into account. 
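  // Roughly (illustrative, simplified): the main-loop iteration-count check is
  // redirected to branch to the epilogue's vector preheader instead of to
  // vec.epilog.iter.check, while the saved epilogue iteration-count check is
  // redirected straight to the scalar preheader; the dominator tree is updated
  // to match.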
7773 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7774 "expected this to be saved from the previous pass."); 7775 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7776 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7777 7778 DT->changeImmediateDominator(LoopVectorPreHeader, 7779 EPI.MainLoopIterationCountCheck); 7780 7781 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7782 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7783 7784 if (EPI.SCEVSafetyCheck) 7785 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7786 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7787 if (EPI.MemSafetyCheck) 7788 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7789 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7790 7791 DT->changeImmediateDominator( 7792 VecEpilogueIterationCountCheck, 7793 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7794 7795 DT->changeImmediateDominator(LoopScalarPreHeader, 7796 EPI.EpilogueIterationCountCheck); 7797 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7798 // If there is an epilogue which must run, there's no edge from the 7799 // middle block to exit blocks and thus no need to update the immediate 7800 // dominator of the exit blocks. 7801 DT->changeImmediateDominator(LoopExitBlock, 7802 EPI.EpilogueIterationCountCheck); 7803 7804 // Keep track of bypass blocks, as they feed start values to the induction and 7805 // reduction phis in the scalar loop preheader. 7806 if (EPI.SCEVSafetyCheck) 7807 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7808 if (EPI.MemSafetyCheck) 7809 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7810 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7811 7812 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 7813 // reductions which merge control-flow from the latch block and the middle 7814 // block. Update the incoming values here and move the Phi into the preheader. 7815 SmallVector<PHINode *, 4> PhisInBlock; 7816 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7817 PhisInBlock.push_back(&Phi); 7818 7819 for (PHINode *Phi : PhisInBlock) { 7820 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7821 Phi->replaceIncomingBlockWith( 7822 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7823 VecEpilogueIterationCountCheck); 7824 7825 // If the phi doesn't have an incoming value from the 7826 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 7827 // value and also those from other check blocks. This is needed for 7828 // reduction phis only. 
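    // Illustrative example: a reduction resume phi moved into vec.epilog.ph
    // that had incoming values from the epilogue iteration-count check and the
    // SCEV/memory check blocks keeps only the value arriving via the main
    // vector loop's middle block; the other incomings are dropped just below.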
    if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
  EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from the
  // trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(ExpandedSCEVs,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
               ?
ICmpInst::ICMP_ULE 7881 : ICmpInst::ICMP_ULT; 7882 7883 Value *CheckMinIters = 7884 Builder.CreateICmp(P, Count, 7885 createStepForVF(Builder, Count->getType(), 7886 EPI.EpilogueVF, EPI.EpilogueUF), 7887 "min.epilog.iters.check"); 7888 7889 BranchInst &BI = 7890 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7891 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 7892 unsigned MainLoopStep = UF * VF.getKnownMinValue(); 7893 unsigned EpilogueLoopStep = 7894 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); 7895 // We assume the remaining `Count` is equally distributed in 7896 // [0, MainLoopStep) 7897 // So the probability for `Count < EpilogueLoopStep` should be 7898 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep 7899 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); 7900 const uint32_t Weights[] = {EstimatedSkipCount, 7901 MainLoopStep - EstimatedSkipCount}; 7902 setBranchWeights(BI, Weights); 7903 } 7904 ReplaceInstWithInst(Insert->getTerminator(), &BI); 7905 7906 LoopBypassBlocks.push_back(Insert); 7907 return Insert; 7908 } 7909 7910 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7911 LLVM_DEBUG({ 7912 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7913 << "Epilogue Loop VF:" << EPI.EpilogueVF 7914 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7915 }); 7916 } 7917 7918 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7919 DEBUG_WITH_TYPE(VerboseDebug, { 7920 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7921 }); 7922 } 7923 7924 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7925 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7926 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7927 bool PredicateAtRangeStart = Predicate(Range.Start); 7928 7929 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) 7930 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7931 Range.End = TmpVF; 7932 break; 7933 } 7934 7935 return PredicateAtRangeStart; 7936 } 7937 7938 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7939 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7940 /// of VF's starting at a given VF and extending it as much as possible. Each 7941 /// vectorization decision can potentially shorten this sub-range during 7942 /// buildVPlan(). 7943 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7944 ElementCount MaxVF) { 7945 auto MaxVFTimes2 = MaxVF * 2; 7946 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 7947 VFRange SubRange = {VF, MaxVFTimes2}; 7948 VPlans.push_back(buildVPlan(SubRange)); 7949 VF = SubRange.End; 7950 } 7951 } 7952 7953 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7954 VPlan &Plan) { 7955 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7956 7957 // Look for cached value. 7958 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7959 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7960 if (ECEntryIt != EdgeMaskCache.end()) 7961 return ECEntryIt->second; 7962 7963 VPValue *SrcMask = getBlockInMask(Src); 7964 7965 // The terminator has to be a branch inst! 
7966 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7967 assert(BI && "Unexpected terminator found"); 7968 7969 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7970 return EdgeMaskCache[Edge] = SrcMask; 7971 7972 // If source is an exiting block, we know the exit edge is dynamically dead 7973 // in the vector loop, and thus we don't need to restrict the mask. Avoid 7974 // adding uses of an otherwise potentially dead instruction. 7975 if (OrigLoop->isLoopExiting(Src)) 7976 return EdgeMaskCache[Edge] = SrcMask; 7977 7978 VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); 7979 assert(EdgeMask && "No Edge Mask found for condition"); 7980 7981 if (BI->getSuccessor(0) != Dst) 7982 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 7983 7984 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 7985 // The condition is 'SrcMask && EdgeMask', which is equivalent to 7986 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 7987 // The select version does not introduce new UB if SrcMask is false and 7988 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 7989 VPValue *False = Plan.getVPValueOrAddLiveIn( 7990 ConstantInt::getFalse(BI->getCondition()->getType())); 7991 EdgeMask = 7992 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 7993 } 7994 7995 return EdgeMaskCache[Edge] = EdgeMask; 7996 } 7997 7998 void VPRecipeBuilder::createHeaderMask(VPlan &Plan) { 7999 BasicBlock *Header = OrigLoop->getHeader(); 8000 8001 // When not folding the tail, use nullptr to model all-true mask. 8002 if (!CM.foldTailByMasking()) { 8003 BlockMaskCache[Header] = nullptr; 8004 return; 8005 } 8006 8007 // Introduce the early-exit compare IV <= BTC to form header block mask. 8008 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8009 // constructing the desired canonical IV in the header block as its first 8010 // non-phi instructions. 8011 8012 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); 8013 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8014 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); 8015 HeaderVPBB->insert(IV, NewInsertionPoint); 8016 8017 VPBuilder::InsertPointGuard Guard(Builder); 8018 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8019 VPValue *BlockMask = nullptr; 8020 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 8021 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); 8022 BlockMaskCache[Header] = BlockMask; 8023 } 8024 8025 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { 8026 // Return the cached value. 8027 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); 8028 assert(BCEntryIt != BlockMaskCache.end() && 8029 "Trying to access mask for block without one."); 8030 return BCEntryIt->second; 8031 } 8032 8033 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { 8034 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8035 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); 8036 assert(OrigLoop->getHeader() != BB && 8037 "Loop header must have cached block mask"); 8038 8039 // All-one mask is modelled as no-mask following the convention for masked 8040 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8041 VPValue *BlockMask = nullptr; 8042 // This is the block mask. We OR all incoming edges. 
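  // Illustrative example: for a block BB with two predecessors P0 and P1 the
  // resulting mask is
  //   BlockMask = EdgeMask(P0->BB) | EdgeMask(P1->BB)
  // where each edge mask already folds in the mask of its source block.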
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
      BlockMaskCache[BB] = EdgeMask;
      return;
    }

    if (!BlockMask) { // BlockMask has its initial nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
  }

  BlockMaskCache[BB] = BlockMask;
}

VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  auto willWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = getBlockInMask(I->getParent());

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
  if (Consecutive) {
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Ptr->getUnderlyingValue()->stripPointerCasts());
    auto *VectorPtr = new VPVectorPointerRecipe(
        Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
        I->getDebugLoc());
    Builder.getInsertBlock()->appendRecipe(VectorPtr);
    Ptr = VectorPtr;
  }
  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
                                              Reverse);

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
                                            Consecutive, Reverse);
}

/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe.
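/// For example (illustrative): for an induction
///   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
/// whose step is loop-invariant but not a compile-time constant, a recipe
/// expanding the step's SCEV is created in the plan's preheader and used as
/// the step operand of the widened induction recipe.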
8117 static VPWidenIntOrFpInductionRecipe * 8118 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, 8119 VPValue *Start, const InductionDescriptor &IndDesc, 8120 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, 8121 VFRange &Range) { 8122 assert(IndDesc.getStartValue() == 8123 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8124 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8125 "step must be loop invariant"); 8126 8127 VPValue *Step = 8128 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8129 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8130 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); 8131 } 8132 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8133 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); 8134 } 8135 8136 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8137 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8138 8139 // Check if this is an integer or fp induction. If so, build the recipe that 8140 // produces its scalar and vector values. 8141 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8142 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, 8143 *PSE.getSE(), *OrigLoop, Range); 8144 8145 // Check if this is pointer induction. If so, build the recipe for it. 8146 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8147 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8148 *PSE.getSE()); 8149 return new VPWidenPointerInductionRecipe( 8150 Phi, Operands[0], Step, *II, 8151 LoopVectorizationPlanner::getDecisionAndClampRange( 8152 [&](ElementCount VF) { 8153 return CM.isScalarAfterVectorization(Phi, VF); 8154 }, 8155 Range)); 8156 } 8157 return nullptr; 8158 } 8159 8160 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8161 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8162 // Optimize the special case where the source is a constant integer 8163 // induction variable. Notice that we can only optimize the 'trunc' case 8164 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8165 // (c) other casts depend on pointer size. 8166 8167 // Determine whether \p K is a truncation based on an induction variable that 8168 // can be optimized. 8169 auto isOptimizableIVTruncate = 8170 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8171 return [=](ElementCount VF) -> bool { 8172 return CM.isOptimizableIVTruncate(K, VF); 8173 }; 8174 }; 8175 8176 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8177 isOptimizableIVTruncate(I), Range)) { 8178 8179 auto *Phi = cast<PHINode>(I->getOperand(0)); 8180 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8181 VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue()); 8182 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), 8183 *OrigLoop, Range); 8184 } 8185 return nullptr; 8186 } 8187 8188 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8189 ArrayRef<VPValue *> Operands, 8190 VPlanPtr &Plan) { 8191 // If all incoming values are equal, the incoming VPValue can be used directly 8192 // instead of creating a new VPBlendRecipe. 8193 if (llvm::all_equal(Operands)) 8194 return Operands[0]; 8195 8196 unsigned NumIncoming = Phi->getNumIncomingValues(); 8197 // For in-loop reductions, we do not need to create an additional select. 
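  // Illustrative example: for a phi with incoming values %a (from BB0) and %b
  // (from BB1) that is not part of an in-loop reduction, the blend built below
  // eventually lowers to a single select keyed on the edge mask of one of the
  // incoming edges.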
  VPValue *InLoopVal = nullptr;
  for (unsigned In = 0; In < NumIncoming; In++) {
    PHINode *PhiOp =
        dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
    if (PhiOp && CM.isInLoopReduction(PhiOp)) {
      assert(!InLoopVal && "Found more than one in-loop reduction!");
      InLoopVal = Operands[In];
    }
  }

  assert((!InLoopVal || NumIncoming == 2) &&
         "Found an in-loop reduction for PHI with unexpected number of "
         "incoming values");
  if (InLoopVal)
    return Operands[Operands[0] == InLoopVal ? 1 : 0];

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.
  SmallVector<VPValue *, 2> OperandsWithMask;

  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    OperandsWithMask.push_back(Operands[In]);
    if (EdgeMask)
      OperandsWithMask.push_back(EdgeMask);
  }
  return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
}

VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range,
                                                   VPlanPtr &Plan) {
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));

  // Is it beneficial to perform an intrinsic call compared to a lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
                                 CI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for the vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
8281 // We store a pointer to the variant in the VPWidenCallRecipe, so 8282 // once we have an appropriate variant it's only valid for that VF. 8283 // This will force a different vplan to be generated for each VF that 8284 // finds a valid variant. 8285 if (Variant) 8286 return false; 8287 LoopVectorizationCostModel::CallWideningDecision Decision = 8288 CM.getCallWideningDecision(CI, VF); 8289 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) { 8290 Variant = Decision.Variant; 8291 MaskPos = Decision.MaskPos; 8292 return true; 8293 } 8294 8295 return false; 8296 }, 8297 Range); 8298 if (ShouldUseVectorCall) { 8299 if (MaskPos.has_value()) { 8300 // We have 2 cases that would require a mask: 8301 // 1) The block needs to be predicated, either due to a conditional 8302 // in the scalar loop or use of an active lane mask with 8303 // tail-folding, and we use the appropriate mask for the block. 8304 // 2) No mask is required for the block, but the only available 8305 // vector variant at this VF requires a mask, so we synthesize an 8306 // all-true mask. 8307 VPValue *Mask = nullptr; 8308 if (Legal->isMaskRequired(CI)) 8309 Mask = getBlockInMask(CI->getParent()); 8310 else 8311 Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( 8312 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); 8313 8314 Ops.insert(Ops.begin() + *MaskPos, Mask); 8315 } 8316 8317 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), 8318 Intrinsic::not_intrinsic, CI->getDebugLoc(), 8319 Variant); 8320 } 8321 8322 return nullptr; 8323 } 8324 8325 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8326 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8327 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8328 // Instruction should be widened, unless it is scalar after vectorization, 8329 // scalarization is profitable or it is predicated. 8330 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8331 return CM.isScalarAfterVectorization(I, VF) || 8332 CM.isProfitableToScalarize(I, VF) || 8333 CM.isScalarWithPredication(I, VF); 8334 }; 8335 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8336 Range); 8337 } 8338 8339 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, 8340 ArrayRef<VPValue *> Operands, 8341 VPBasicBlock *VPBB, VPlanPtr &Plan) { 8342 switch (I->getOpcode()) { 8343 default: 8344 return nullptr; 8345 case Instruction::SDiv: 8346 case Instruction::UDiv: 8347 case Instruction::SRem: 8348 case Instruction::URem: { 8349 // If not provably safe, use a select to form a safe divisor before widening the 8350 // div/rem operation itself. Otherwise fall through to general handling below. 
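    // Illustrative example (hypothetical IR): a predicated
    //   %q = udiv i32 %x, %d
    // is widened as
    //   %safe.d = select <VF x i1> %mask, <VF x i32> %d.vec, <VF x i32> <1, ...>
    //   %q.vec  = udiv <VF x i32> %x.vec, %safe.d
    // so masked-off lanes divide by 1 rather than by a potentially zero or
    // poison divisor.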
    if (CM.isPredicatedInst(I)) {
      SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
      VPValue *Mask = getBlockInMask(I->getParent());
      VPValue *One = Plan->getVPValueOrAddLiveIn(
          ConstantInt::get(I->getType(), 1u, false));
      auto *SafeRHS =
          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
                            I->getDebugLoc());
      VPBB->appendRecipe(SafeRHS);
      Ops[1] = SafeRHS;
      return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
    }
    [[fallthrough]];
  }
  case Instruction::Add:
  case Instruction::And:
  case Instruction::AShr:
  case Instruction::FAdd:
  case Instruction::FCmp:
  case Instruction::FDiv:
  case Instruction::FMul:
  case Instruction::FNeg:
  case Instruction::FRem:
  case Instruction::FSub:
  case Instruction::ICmp:
  case Instruction::LShr:
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Select:
  case Instruction::Shl:
  case Instruction::Sub:
  case Instruction::Xor:
  case Instruction::Freeze:
    return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
  };
}

void VPRecipeBuilder::fixHeaderPhis() {
  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
  for (VPHeaderPHIRecipe *R : PhisToFix) {
    auto *PN = cast<PHINode>(R->getUnderlyingValue());
    VPRecipeBase *IncR =
        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
    R->addOperand(IncR->getVPSingleValue());
  }
}

VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
                                                       VFRange &Range,
                                                       VPlan &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors, if one of the operands is variant then we still
      // want to mark the call as uniform, which will generate one instruction
      // for just the first lane of the vector. We can't scalarize the call in
      // the same way as for fixed-width vectors because we don't know how many
      // lanes there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      // 1. For the assume intrinsic, generating the instruction for the first
      //    lane is still better than not generating any at all. For
      //    example, the input may be a splat across all lanes.
      // 2. For the lifetime start/end intrinsics, the pointer operand only
      //    does anything useful when the input comes from a stack object,
      //    which suggests it should always be uniform. For non-stack objects
      //    the effect is to poison the object, which still allows us to
      //    remove the call.
8431 IsUniform = true; 8432 break; 8433 default: 8434 break; 8435 } 8436 } 8437 VPValue *BlockInMask = nullptr; 8438 if (!IsPredicated) { 8439 // Finalize the recipe for Instr, first if it is not predicated. 8440 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8441 } else { 8442 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8443 // Instructions marked for predication are replicated and a mask operand is 8444 // added initially. Masked replicate recipes will later be placed under an 8445 // if-then construct to prevent side-effects. Generate recipes to compute 8446 // the block mask for this region. 8447 BlockInMask = getBlockInMask(I->getParent()); 8448 } 8449 8450 auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), 8451 IsUniform, BlockInMask); 8452 return toVPRecipeResult(Recipe); 8453 } 8454 8455 VPRecipeOrVPValueTy 8456 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8457 ArrayRef<VPValue *> Operands, 8458 VFRange &Range, VPBasicBlock *VPBB, 8459 VPlanPtr &Plan) { 8460 // First, check for specific widening recipes that deal with inductions, Phi 8461 // nodes, calls and memory operations. 8462 VPRecipeBase *Recipe; 8463 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8464 if (Phi->getParent() != OrigLoop->getHeader()) 8465 return tryToBlend(Phi, Operands, Plan); 8466 8467 // Always record recipes for header phis. Later first-order recurrence phis 8468 // can have earlier phis as incoming values. 8469 recordRecipeOf(Phi); 8470 8471 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8472 return toVPRecipeResult(Recipe); 8473 8474 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8475 assert((Legal->isReductionVariable(Phi) || 8476 Legal->isFixedOrderRecurrence(Phi)) && 8477 "can only widen reductions and fixed-order recurrences here"); 8478 VPValue *StartV = Operands[0]; 8479 if (Legal->isReductionVariable(Phi)) { 8480 const RecurrenceDescriptor &RdxDesc = 8481 Legal->getReductionVars().find(Phi)->second; 8482 assert(RdxDesc.getRecurrenceStartValue() == 8483 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8484 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8485 CM.isInLoopReduction(Phi), 8486 CM.useOrderedReductions(RdxDesc)); 8487 } else { 8488 // TODO: Currently fixed-order recurrences are modeled as chains of 8489 // first-order recurrences. If there are no users of the intermediate 8490 // recurrences in the chain, the fixed order recurrence should be modeled 8491 // directly, enabling more efficient codegen. 8492 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8493 } 8494 8495 // Record the incoming value from the backedge, so we can add the incoming 8496 // value from the backedge after all recipes have been created. 8497 auto *Inc = cast<Instruction>( 8498 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 8499 auto RecipeIter = Ingredient2Recipe.find(Inc); 8500 if (RecipeIter == Ingredient2Recipe.end()) 8501 recordRecipeOf(Inc); 8502 8503 PhisToFix.push_back(PhiRecipe); 8504 return toVPRecipeResult(PhiRecipe); 8505 } 8506 8507 if (isa<TruncInst>(Instr) && 8508 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8509 Range, *Plan))) 8510 return toVPRecipeResult(Recipe); 8511 8512 // All widen recipes below deal only with VF > 1. 
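  // If the range has been clamped down to scalar-only VFs, return nullptr so
  // the caller falls back to handleReplication for this instruction.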
8513 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8514 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8515 return nullptr; 8516 8517 if (auto *CI = dyn_cast<CallInst>(Instr)) 8518 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); 8519 8520 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8521 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8522 8523 if (!shouldWiden(Instr, Range)) 8524 return nullptr; 8525 8526 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8527 return toVPRecipeResult(new VPWidenGEPRecipe( 8528 GEP, make_range(Operands.begin(), Operands.end()))); 8529 8530 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8531 return toVPRecipeResult(new VPWidenSelectRecipe( 8532 *SI, make_range(Operands.begin(), Operands.end()))); 8533 } 8534 8535 if (auto *CI = dyn_cast<CastInst>(Instr)) { 8536 return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0], 8537 CI->getType(), *CI)); 8538 } 8539 8540 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); 8541 } 8542 8543 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8544 ElementCount MaxVF) { 8545 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8546 8547 auto MaxVFTimes2 = MaxVF * 2; 8548 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8549 VFRange SubRange = {VF, MaxVFTimes2}; 8550 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { 8551 // Now optimize the initial VPlan. 8552 if (!Plan->hasVF(ElementCount::getFixed(1))) 8553 VPlanTransforms::truncateToMinimalBitwidths( 8554 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); 8555 VPlanTransforms::optimize(*Plan, *PSE.getSE()); 8556 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 8557 VPlans.push_back(std::move(Plan)); 8558 } 8559 VF = SubRange.End; 8560 } 8561 } 8562 8563 // Add the necessary canonical IV and branch recipes required to control the 8564 // loop. 8565 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, 8566 DebugLoc DL) { 8567 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8568 auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); 8569 8570 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8571 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8572 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8573 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8574 Header->insert(CanonicalIVPHI, Header->begin()); 8575 8576 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar 8577 // IV by VF * UF. 8578 auto *CanonicalIVIncrement = 8579 new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, 8580 {HasNUW, false}, DL, "index.next"); 8581 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8582 8583 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8584 EB->appendRecipe(CanonicalIVIncrement); 8585 8586 // Add the BranchOnCount VPInstruction to the latch. 8587 VPInstruction *BranchBack = 8588 new VPInstruction(VPInstruction::BranchOnCount, 8589 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8590 EB->appendRecipe(BranchBack); 8591 } 8592 8593 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8594 // original exit block. 
8595 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, 8596 VPlan &Plan) { 8597 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8598 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8599 // Only handle single-exit loops with unique exit blocks for now. 8600 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8601 return; 8602 8603 // Introduce VPUsers modeling the exit values. 8604 for (PHINode &ExitPhi : ExitBB->phis()) { 8605 Value *IncomingValue = 8606 ExitPhi.getIncomingValueForBlock(ExitingBB); 8607 VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); 8608 Plan.addLiveOut(&ExitPhi, V); 8609 } 8610 } 8611 8612 VPlanPtr 8613 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { 8614 8615 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8616 8617 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8618 8619 // --------------------------------------------------------------------------- 8620 // Pre-construction: record ingredients whose recipes we'll need to further 8621 // process after constructing the initial VPlan. 8622 // --------------------------------------------------------------------------- 8623 8624 // For each interleave group which is relevant for this (possibly trimmed) 8625 // Range, add it to the set of groups to be later applied to the VPlan and add 8626 // placeholders for its members' Recipes which we'll be replacing with a 8627 // single VPInterleaveRecipe. 8628 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8629 auto applyIG = [IG, this](ElementCount VF) -> bool { 8630 bool Result = (VF.isVector() && // Query is illegal for VF == 1 8631 CM.getWideningDecision(IG->getInsertPos(), VF) == 8632 LoopVectorizationCostModel::CM_Interleave); 8633 // For scalable vectors, the only interleave factor currently supported 8634 // is 2 since we require the (de)interleave2 intrinsics instead of 8635 // shufflevectors. 8636 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && 8637 "Unsupported interleave factor for scalable vectors"); 8638 return Result; 8639 }; 8640 if (!getDecisionAndClampRange(applyIG, Range)) 8641 continue; 8642 InterleaveGroups.insert(IG); 8643 for (unsigned i = 0; i < IG->getFactor(); i++) 8644 if (Instruction *Member = IG->getMember(i)) 8645 RecipeBuilder.recordRecipeOf(Member); 8646 }; 8647 8648 // --------------------------------------------------------------------------- 8649 // Build initial VPlan: Scan the body of the loop in a topological order to 8650 // visit each basic block after having visited its predecessor basic blocks. 8651 // --------------------------------------------------------------------------- 8652 8653 // Create initial VPlan skeleton, having a basic block for the pre-header 8654 // which contains SCEV expansions that need to happen before the CFG is 8655 // modified; a basic block for the vector pre-header, followed by a region for 8656 // the vector loop, followed by the middle basic block. The skeleton vector 8657 // loop region contains a header and latch basic blocks. 
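  // Roughly, the initial skeleton built below looks like:
  //   preheader (SCEV expansion) -> vector.ph
  //     -> [ region: vector.body -> ... -> vector.latch ] -> middle block
  // (a sketch of the block layout only; the exact blocks are created by
  // VPlan::createInitialVPlan and the insertions below).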
8658 VPlanPtr Plan = VPlan::createInitialVPlan( 8659 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8660 *PSE.getSE()); 8661 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8662 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8663 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8664 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB); 8665 Plan->getVectorLoopRegion()->setExiting(LatchVPBB); 8666 8667 // Don't use getDecisionAndClampRange here, because we don't know the UF 8668 // so this function is better to be conservative, rather than to split 8669 // it up into different VPlans. 8670 // TODO: Consider using getDecisionAndClampRange here to split up VPlans. 8671 bool IVUpdateMayOverflow = false; 8672 for (ElementCount VF : Range) 8673 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); 8674 8675 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8676 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); 8677 // When not folding the tail, we know that the induction increment will not 8678 // overflow. 8679 bool HasNUW = Style == TailFoldingStyle::None; 8680 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); 8681 8682 // Scan the body of the loop in a topological order to visit each basic block 8683 // after having visited its predecessor basic blocks. 8684 LoopBlocksDFS DFS(OrigLoop); 8685 DFS.perform(LI); 8686 8687 VPBasicBlock *VPBB = HeaderVPBB; 8688 bool NeedsMasks = CM.foldTailByMasking() || 8689 any_of(OrigLoop->blocks(), [this](BasicBlock *BB) { 8690 return Legal->blockNeedsPredication(BB); 8691 }); 8692 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8693 // Relevant instructions from basic block BB will be grouped into VPRecipe 8694 // ingredients and fill a new VPBasicBlock. 8695 if (VPBB != HeaderVPBB) 8696 VPBB->setName(BB->getName()); 8697 Builder.setInsertPoint(VPBB); 8698 8699 if (VPBB == HeaderVPBB) 8700 RecipeBuilder.createHeaderMask(*Plan); 8701 else if (NeedsMasks) 8702 RecipeBuilder.createBlockInMask(BB, *Plan); 8703 8704 // Introduce each ingredient into VPlan. 8705 // TODO: Model and preserve debug intrinsics in VPlan. 8706 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { 8707 Instruction *Instr = &I; 8708 SmallVector<VPValue *, 4> Operands; 8709 auto *Phi = dyn_cast<PHINode>(Instr); 8710 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8711 Operands.push_back(Plan->getVPValueOrAddLiveIn( 8712 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8713 } else { 8714 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8715 Operands = {OpRange.begin(), OpRange.end()}; 8716 } 8717 8718 // Invariant stores inside loop will be deleted and a single store 8719 // with the final reduction value will be added to the exit block 8720 StoreInst *SI; 8721 if ((SI = dyn_cast<StoreInst>(&I)) && 8722 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8723 continue; 8724 8725 auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8726 Instr, Operands, Range, VPBB, Plan); 8727 if (!RecipeOrValue) 8728 RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); 8729 // If Instr can be simplified to an existing VPValue, use it. 
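      // (For example, tryToBlend may return one of the existing incoming
      // VPValues directly when the other incoming value is an in-loop
      // reduction phi.)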
8730 if (isa<VPValue *>(RecipeOrValue)) { 8731 auto *VPV = cast<VPValue *>(RecipeOrValue); 8732 Plan->addVPValue(Instr, VPV); 8733 // If the re-used value is a recipe, register the recipe for the 8734 // instruction, in case the recipe for Instr needs to be recorded. 8735 if (VPRecipeBase *R = VPV->getDefiningRecipe()) 8736 RecipeBuilder.setRecipe(Instr, R); 8737 continue; 8738 } 8739 // Otherwise, add the new recipe. 8740 VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue); 8741 for (auto *Def : Recipe->definedValues()) { 8742 auto *UV = Def->getUnderlyingValue(); 8743 Plan->addVPValue(UV, Def); 8744 } 8745 8746 RecipeBuilder.setRecipe(Instr, Recipe); 8747 if (isa<VPHeaderPHIRecipe>(Recipe)) { 8748 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In 8749 // the following cases, VPHeaderPHIRecipes may be created after non-phi 8750 // recipes and need to be moved to the phi section of HeaderVPBB: 8751 // * tail-folding (non-phi recipes computing the header mask are 8752 // introduced earlier than regular header phi recipes, and should appear 8753 // after them) 8754 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 8755 8756 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || 8757 CM.foldTailByMasking() || isa<TruncInst>(Instr)) && 8758 "unexpected recipe needs moving"); 8759 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8760 } else 8761 VPBB->appendRecipe(Recipe); 8762 } 8763 8764 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8765 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8766 } 8767 8768 // After here, VPBB should not be used. 8769 VPBB = nullptr; 8770 8771 if (CM.requiresScalarEpilogue(Range)) { 8772 // No edge from the middle block to the unique exit block has been inserted 8773 // and there is nothing to fix from vector loop; phis should have incoming 8774 // from scalar loop only. 8775 } else 8776 addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan); 8777 8778 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8779 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8780 "entry block must be set to a VPRegionBlock having a non-empty entry " 8781 "VPBasicBlock"); 8782 RecipeBuilder.fixHeaderPhis(); 8783 8784 // --------------------------------------------------------------------------- 8785 // Transform initial VPlan: Apply previously taken decisions, in order, to 8786 // bring the VPlan to its final state. 8787 // --------------------------------------------------------------------------- 8788 8789 // Adjust the recipes for any inloop reductions. 8790 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start); 8791 8792 // Interleave memory: for each Interleave Group we marked earlier as relevant 8793 // for this VPlan, replace the Recipes widening its memory instructions with a 8794 // single VPInterleaveRecipe at its insertion point. 
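  // For example, a factor-2 group of loads from A[2*i] and A[2*i+1] is
  // replaced by a single wide access at the group's insert position; the
  // per-member recipes created earlier are erased once their uses have been
  // rewired to the interleave recipe's results.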
8795 for (const auto *IG : InterleaveGroups) { 8796 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8797 RecipeBuilder.getRecipe(IG->getInsertPos())); 8798 SmallVector<VPValue *, 4> StoredValues; 8799 for (unsigned i = 0; i < IG->getFactor(); ++i) 8800 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8801 auto *StoreR = 8802 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 8803 StoredValues.push_back(StoreR->getStoredValue()); 8804 } 8805 8806 bool NeedsMaskForGaps = 8807 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); 8808 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8809 Recipe->getMask(), NeedsMaskForGaps); 8810 VPIG->insertBefore(Recipe); 8811 unsigned J = 0; 8812 for (unsigned i = 0; i < IG->getFactor(); ++i) 8813 if (Instruction *Member = IG->getMember(i)) { 8814 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); 8815 if (!Member->getType()->isVoidTy()) { 8816 VPValue *OriginalV = MemberR->getVPSingleValue(); 8817 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8818 J++; 8819 } 8820 MemberR->eraseFromParent(); 8821 } 8822 } 8823 8824 for (ElementCount VF : Range) 8825 Plan->addVF(VF); 8826 Plan->setName("Initial VPlan"); 8827 8828 // Replace VPValues for known constant strides guaranteed by predicate scalar 8829 // evolution. 8830 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 8831 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 8832 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 8833 // Only handle constant strides for now. 8834 if (!ScevStride) 8835 continue; 8836 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); 8837 8838 auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); 8839 // The versioned value may not be used in the loop directly, so just add a 8840 // new live-in in those cases. 8841 Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); 8842 } 8843 8844 // From this point onwards, VPlan-to-VPlan transformations may change the plan 8845 // in ways that accessing values using original IR values is incorrect. 8846 Plan->disableValue2VPValue(); 8847 8848 // Sink users of fixed-order recurrence past the recipe defining the previous 8849 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 8850 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) 8851 return nullptr; 8852 8853 if (useActiveLaneMask(Style)) { 8854 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once 8855 // TailFoldingStyle is visible there. 8856 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); 8857 bool WithoutRuntimeCheck = 8858 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 8859 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, 8860 WithoutRuntimeCheck); 8861 } 8862 return Plan; 8863 } 8864 8865 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8866 // Outer loop handling: They may require CFG and instruction level 8867 // transformations before even evaluating whether vectorization is profitable. 8868 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8869 // the vectorization pipeline. 
8870 assert(!OrigLoop->isInnermost()); 8871 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8872 8873 // Create new empty VPlan 8874 auto Plan = VPlan::createInitialVPlan( 8875 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8876 *PSE.getSE()); 8877 8878 // Build hierarchical CFG 8879 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8880 HCFGBuilder.buildHierarchicalCFG(); 8881 8882 for (ElementCount VF : Range) 8883 Plan->addVF(VF); 8884 8885 VPlanTransforms::VPInstructionsToVPRecipes( 8886 Plan, 8887 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 8888 *PSE.getSE(), *TLI); 8889 8890 // Remove the existing terminator of the exiting block of the top-most region. 8891 // A BranchOnCount will be added instead when adding the canonical IV recipes. 8892 auto *Term = 8893 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); 8894 Term->eraseFromParent(); 8895 8896 // Tail folding is not supported for outer loops, so the induction increment 8897 // is guaranteed to not wrap. 8898 bool HasNUW = true; 8899 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, 8900 DebugLoc()); 8901 return Plan; 8902 } 8903 8904 // Adjust the recipes for reductions. For in-loop reductions the chain of 8905 // instructions leading from the loop exit instr to the phi need to be converted 8906 // to reductions, with one operand being vector and the other being the scalar 8907 // reduction chain. For other reductions, a select is introduced between the phi 8908 // and live-out recipes when folding the tail. 8909 // 8910 // A ComputeReductionResult recipe is added to the middle block, also for 8911 // in-loop reductions which compute their result in-loop, because generating 8912 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 8913 void LoopVectorizationPlanner::adjustRecipesForReductions( 8914 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 8915 ElementCount MinVF) { 8916 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); 8917 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); 8918 // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores 8919 // sank outside of the loop would keep the same order as they had in the 8920 // original loop. 8921 SmallVector<VPReductionPHIRecipe *> ReductionPHIList; 8922 for (VPRecipeBase &R : Header->phis()) { 8923 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 8924 ReductionPHIList.emplace_back(ReductionPhi); 8925 } 8926 bool HasIntermediateStore = false; 8927 stable_sort(ReductionPHIList, 8928 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, 8929 const VPReductionPHIRecipe *R2) { 8930 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; 8931 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; 8932 HasIntermediateStore |= IS1 || IS2; 8933 8934 // If neither of the recipes has an intermediate store, keep the 8935 // order the same. 8936 if (!IS1 && !IS2) 8937 return false; 8938 8939 // If only one of the recipes has an intermediate store, then 8940 // move it towards the beginning of the list. 8941 if (IS1 && !IS2) 8942 return true; 8943 8944 if (!IS1 && IS2) 8945 return false; 8946 8947 // If both recipes have an intermediate store, then the recipe 8948 // with the later store should be processed earlier. So it 8949 // should go to the beginning of the list. 
8950 return DT->dominates(IS2, IS1); 8951 }); 8952 8953 if (HasIntermediateStore && ReductionPHIList.size() > 1) 8954 for (VPRecipeBase *R : ReductionPHIList) 8955 R->moveBefore(*Header, Header->getFirstNonPhi()); 8956 8957 for (VPRecipeBase &R : Header->phis()) { 8958 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 8959 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) 8960 continue; 8961 8962 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 8963 RecurKind Kind = RdxDesc.getRecurrenceKind(); 8964 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 8965 "AnyOf reductions are not allowed for in-loop reductions"); 8966 8967 // Collect the chain of "link" recipes for the reduction starting at PhiR. 8968 SetVector<VPSingleDefRecipe *> Worklist; 8969 Worklist.insert(PhiR); 8970 for (unsigned I = 0; I != Worklist.size(); ++I) { 8971 VPSingleDefRecipe *Cur = Worklist[I]; 8972 for (VPUser *U : Cur->users()) { 8973 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U); 8974 if (!UserRecipe) { 8975 assert(isa<VPLiveOut>(U) && 8976 "U must either be a VPSingleDef or VPLiveOut"); 8977 continue; 8978 } 8979 Worklist.insert(UserRecipe); 8980 } 8981 } 8982 8983 // Visit operation "Links" along the reduction chain top-down starting from 8984 // the phi until LoopExitValue. We keep track of the previous item 8985 // (PreviousLink) to tell which of the two operands of a Link will remain 8986 // scalar and which will be reduced. For minmax by select(cmp), Link will be 8987 // the select instructions. 8988 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. 8989 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { 8990 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); 8991 8992 // Index of the first operand which holds a non-mask vector operand. 8993 unsigned IndexOfFirstOperand; 8994 // Recognize a call to the llvm.fmuladd intrinsic. 8995 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 8996 VPValue *VecOp; 8997 VPBasicBlock *LinkVPBB = CurrentLink->getParent(); 8998 if (IsFMulAdd) { 8999 assert( 9000 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && 9001 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9002 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || 9003 isa<VPWidenCallRecipe>(CurrentLink)) && 9004 CurrentLink->getOperand(2) == PreviousLink && 9005 "expected a call where the previous link is the added operand"); 9006 9007 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9008 // need to create an fmul recipe (multiplying the first two operands of 9009 // the fmuladd together) to use as the vector operand for the fadd 9010 // reduction. 
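        // In other words, fmuladd(a, b, prev) in the reduction chain is
        // handled as prev + (a * b): the FMul below computes a * b and the
        // reduction recipe created later accumulates it into the chain.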
        VPInstruction *FMulRecipe = new VPInstruction(
            Instruction::FMul,
            {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
            CurrentLinkI->getFastMathFlags());
        LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
        VecOp = FMulRecipe;
      } else {
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
          if (isa<VPWidenRecipe>(CurrentLink)) {
            assert(isa<CmpInst>(CurrentLinkI) &&
                   "need to have the compare of the select");
            continue;
          }
          assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
                 "must be a select recipe");
          IndexOfFirstOperand = 1;
        } else {
          assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
                 "Expected to replace a VPWidenSC");
          IndexOfFirstOperand = 0;
        }
        // Note that for non-commutable operands (cmp-selects), the semantics of
        // the cmp-select are captured in the recurrence kind.
        unsigned VecOpId =
            CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
                ? IndexOfFirstOperand + 1
                : IndexOfFirstOperand;
        VecOp = CurrentLink->getOperand(VecOpId);
        assert(VecOp != PreviousLink &&
               CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
                                       (VecOpId - IndexOfFirstOperand)) ==
                   PreviousLink &&
               "PreviousLink must be the operand other than VecOp");
      }

      BasicBlock *BB = CurrentLinkI->getParent();
      VPValue *CondOp = nullptr;
      if (CM.blockNeedsPredicationForAnyReason(BB)) {
        VPBuilder::InsertPointGuard Guard(Builder);
        Builder.setInsertPoint(CurrentLink);
        CondOp = RecipeBuilder.getBlockInMask(BB);
      }

      VPReductionRecipe *RedRecipe = new VPReductionRecipe(
          RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);
      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of its inputs, including CondOp.
      // Note that this transformation may leave behind dead recipes (including
      // CurrentLink), which will be cleaned by a later VPlan transform.
      LinkVPBB->appendRecipe(RedRecipe);
      CurrentLink->replaceAllUsesWith(RedRecipe);
      PreviousLink = RedRecipe;
    }
  }
  Builder.setInsertPoint(&*LatchVPBB->begin());
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;

    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    // If the tail is folded by masking, introduce selects between the phi
    // and the live-out instruction of each reduction, at the beginning of the
    // dedicated latch block.
    auto *OrigExitingVPV = PhiR->getBackedgeValue();
    auto *NewExitingVPV = PhiR->getBackedgeValue();
    if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
      VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
      assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
             "reduction recipe must be defined before latch");
      Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
      std::optional<FastMathFlags> FMFs =
          PhiTy->isFloatingPointTy()
              ?
std::make_optional(RdxDesc.getFastMathFlags()) 9086 : std::nullopt; 9087 NewExitingVPV = 9088 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); 9089 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { 9090 return isa<VPInstruction>(&U) && 9091 cast<VPInstruction>(&U)->getOpcode() == 9092 VPInstruction::ComputeReductionResult; 9093 }); 9094 if (PreferPredicatedReductionSelect || 9095 TTI.preferPredicatedReductionSelect( 9096 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, 9097 TargetTransformInfo::ReductionFlags())) 9098 PhiR->setOperand(1, NewExitingVPV); 9099 } 9100 9101 // If the vector reduction can be performed in a smaller type, we truncate 9102 // then extend the loop exit value to enable InstCombine to evaluate the 9103 // entire expression in the smaller type. 9104 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); 9105 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 9106 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 9107 Type *RdxTy = RdxDesc.getRecurrenceType(); 9108 auto *Trunc = 9109 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); 9110 auto *Extnd = 9111 RdxDesc.isSigned() 9112 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) 9113 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); 9114 9115 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); 9116 Extnd->insertAfter(Trunc); 9117 if (PhiR->getOperand(1) == NewExitingVPV) 9118 PhiR->setOperand(1, Extnd->getVPSingleValue()); 9119 NewExitingVPV = Extnd; 9120 } 9121 9122 // We want code in the middle block to appear to execute on the location of 9123 // the scalar loop's latch terminator because: (a) it is all compiler 9124 // generated, (b) these instructions are always executed after evaluating 9125 // the latch conditional branch, and (c) other passes may add new 9126 // predecessors which terminate on this line. This is the easiest way to 9127 // ensure we don't accidentally cause an extra step back into the loop while 9128 // debugging. 9129 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); 9130 9131 // TODO: At the moment ComputeReductionResult also drives creation of the 9132 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here 9133 // even for in-loop reductions, until the reduction resume value handling is 9134 // also modeled in VPlan. 
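    // The ComputeReductionResult recipe is appended to the middle block (the
    // single successor of the vector loop region), so it runs once after the
    // vector loop has finished.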
9135 auto *FinalReductionResult = new VPInstruction( 9136 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); 9137 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()) 9138 ->appendRecipe(FinalReductionResult); 9139 OrigExitingVPV->replaceUsesWithIf( 9140 FinalReductionResult, 9141 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); }); 9142 } 9143 9144 VPlanTransforms::clearReductionWrapFlags(*Plan); 9145 } 9146 9147 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9148 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9149 VPSlotTracker &SlotTracker) const { 9150 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9151 IG->getInsertPos()->printAsOperand(O, false); 9152 O << ", "; 9153 getAddr()->printAsOperand(O, SlotTracker); 9154 VPValue *Mask = getMask(); 9155 if (Mask) { 9156 O << ", "; 9157 Mask->printAsOperand(O, SlotTracker); 9158 } 9159 9160 unsigned OpIdx = 0; 9161 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9162 if (!IG->getMember(i)) 9163 continue; 9164 if (getNumStoreOperands() > 0) { 9165 O << "\n" << Indent << " store "; 9166 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9167 O << " to index " << i; 9168 } else { 9169 O << "\n" << Indent << " "; 9170 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9171 O << " = load from index " << i; 9172 } 9173 ++OpIdx; 9174 } 9175 } 9176 #endif 9177 9178 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9179 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9180 "Not a pointer induction according to InductionDescriptor!"); 9181 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9182 "Unexpected type."); 9183 9184 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9185 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9186 9187 if (onlyScalarsGenerated(State.VF)) { 9188 // This is the normalized GEP that starts counting at zero. 9189 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9190 CanonicalIV, IndDesc.getStep()->getType()); 9191 // Determine the number of scalars we need to generate for each unroll 9192 // iteration. If the instruction is uniform, we only need to generate the 9193 // first lane. Otherwise, we generate all VF values. 9194 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9195 assert((IsUniform || !State.VF.isScalable()) && 9196 "Cannot scalarize a scalable VF"); 9197 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 9198 9199 for (unsigned Part = 0; Part < State.UF; ++Part) { 9200 Value *PartStart = 9201 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9202 9203 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9204 Value *Idx = State.Builder.CreateAdd( 9205 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9206 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9207 9208 Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); 9209 Value *SclrGep = emitTransformedIndex( 9210 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, 9211 IndDesc.getKind(), IndDesc.getInductionBinOp()); 9212 SclrGep->setName("next.gep"); 9213 State.set(this, SclrGep, VPIteration(Part, Lane)); 9214 } 9215 } 9216 return; 9217 } 9218 9219 Type *PhiType = IndDesc.getStep()->getType(); 9220 9221 // Build a pointer phi 9222 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9223 Type *ScStValueType = ScalarStartValue->getType(); 9224 PHINode *NewPointerPhi = 9225 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9226 9227 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9228 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9229 9230 // A pointer induction, performed by using a gep 9231 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9232 9233 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 9234 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9235 Value *NumUnrolledElems = 9236 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9237 Value *InductionGEP = GetElementPtrInst::Create( 9238 State.Builder.getInt8Ty(), NewPointerPhi, 9239 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9240 InductionLoc); 9241 // Add induction update using an incorrect block temporarily. The phi node 9242 // will be fixed after VPlan execution. Note that at this point the latch 9243 // block cannot be used, as it does not exist yet. 9244 // TODO: Model increment value in VPlan, by turning the recipe into a 9245 // multi-def and a subclass of VPHeaderPHIRecipe. 9246 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9247 9248 // Create UF many actual address geps that use the pointer 9249 // phi as base and a vectorized version of the step value 9250 // (<step*0, ..., step*N>) as offset. 9251 for (unsigned Part = 0; Part < State.UF; ++Part) { 9252 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9253 Value *StartOffsetScalar = 9254 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9255 Value *StartOffset = 9256 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9257 // Create a vector of consecutive numbers from zero to VF. 9258 StartOffset = State.Builder.CreateAdd( 9259 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9260 9261 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && 9262 "scalar step must be the same across all parts"); 9263 Value *GEP = State.Builder.CreateGEP( 9264 State.Builder.getInt8Ty(), NewPointerPhi, 9265 State.Builder.CreateMul( 9266 StartOffset, 9267 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9268 "vector.gep")); 9269 State.set(this, GEP, Part); 9270 } 9271 } 9272 9273 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9274 assert(!State.Instance && "VPDerivedIVRecipe being replicated."); 9275 9276 // Fast-math-flags propagate from the original induction instruction. 
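  // The guard below saves and restores the builder's fast-math flags, so the
  // FMF from FPBinOp (if any) only applies to the index computation emitted
  // here.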
9277 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9278 if (FPBinOp) 9279 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); 9280 9281 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9282 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9283 Value *DerivedIV = emitTransformedIndex( 9284 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, 9285 Kind, cast_if_present<BinaryOperator>(FPBinOp)); 9286 DerivedIV->setName("offset.idx"); 9287 if (TruncResultTy) { 9288 assert(TruncResultTy != DerivedIV->getType() && 9289 Step->getType()->isIntegerTy() && 9290 "Truncation requires an integer step"); 9291 DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy); 9292 } 9293 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); 9294 9295 State.set(this, DerivedIV, VPIteration(0, 0)); 9296 } 9297 9298 void VPInterleaveRecipe::execute(VPTransformState &State) { 9299 assert(!State.Instance && "Interleave group being replicated."); 9300 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9301 getStoredValues(), getMask(), 9302 NeedsMaskForGaps); 9303 } 9304 9305 void VPReductionRecipe::execute(VPTransformState &State) { 9306 assert(!State.Instance && "Reduction being replicated."); 9307 Value *PrevInChain = State.get(getChainOp(), 0); 9308 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9309 bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc); 9310 // Propagate the fast-math flags carried by the underlying instruction. 9311 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9312 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 9313 for (unsigned Part = 0; Part < State.UF; ++Part) { 9314 Value *NewVecOp = State.get(getVecOp(), Part); 9315 if (VPValue *Cond = getCondOp()) { 9316 Value *NewCond = State.VF.isVector() ? State.get(Cond, Part) 9317 : State.get(Cond, {Part, 0}); 9318 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 9319 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); 9320 Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, 9321 RdxDesc.getFastMathFlags()); 9322 if (State.VF.isVector()) { 9323 Iden = 9324 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9325 } 9326 9327 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden); 9328 NewVecOp = Select; 9329 } 9330 Value *NewRed; 9331 Value *NextInChain; 9332 if (IsOrdered) { 9333 if (State.VF.isVector()) 9334 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, 9335 PrevInChain); 9336 else 9337 NewRed = State.Builder.CreateBinOp( 9338 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, 9339 NewVecOp); 9340 PrevInChain = NewRed; 9341 } else { 9342 PrevInChain = State.get(getChainOp(), Part); 9343 NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); 9344 } 9345 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9346 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 9347 NewRed, PrevInChain); 9348 } else if (IsOrdered) 9349 NextInChain = NewRed; 9350 else 9351 NextInChain = State.Builder.CreateBinOp( 9352 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); 9353 State.set(this, NextInChain, Part); 9354 } 9355 } 9356 9357 void VPReplicateRecipe::execute(VPTransformState &State) { 9358 Instruction *UI = getUnderlyingInstr(); 9359 if (State.Instance) { // Generate a single instance. 
9360 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9361 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); 9362 // Insert scalar instance packing it into a vector. 9363 if (State.VF.isVector() && shouldPack()) { 9364 // If we're constructing lane 0, initialize to start from poison. 9365 if (State.Instance->Lane.isFirstLane()) { 9366 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9367 Value *Poison = PoisonValue::get( 9368 VectorType::get(UI->getType(), State.VF)); 9369 State.set(this, Poison, State.Instance->Part); 9370 } 9371 State.packScalarIntoVectorValue(this, *State.Instance); 9372 } 9373 return; 9374 } 9375 9376 if (IsUniform) { 9377 // If the recipe is uniform across all parts (instead of just per VF), only 9378 // generate a single instance. 9379 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) && 9380 all_of(operands(), [](VPValue *Op) { 9381 return Op->isDefinedOutsideVectorRegions(); 9382 })) { 9383 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); 9384 if (user_begin() != user_end()) { 9385 for (unsigned Part = 1; Part < State.UF; ++Part) 9386 State.set(this, State.get(this, VPIteration(0, 0)), 9387 VPIteration(Part, 0)); 9388 } 9389 return; 9390 } 9391 9392 // Uniform within VL means we need to generate lane 0 only for each 9393 // unrolled copy. 9394 for (unsigned Part = 0; Part < State.UF; ++Part) 9395 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); 9396 return; 9397 } 9398 9399 // A store of a loop varying value to a uniform address only needs the last 9400 // copy of the store. 9401 if (isa<StoreInst>(UI) && 9402 vputils::isUniformAfterVectorization(getOperand(1))) { 9403 auto Lane = VPLane::getLastLaneForVF(State.VF); 9404 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), 9405 State); 9406 return; 9407 } 9408 9409 // Generate scalar instances for all VF lanes of all UF parts. 9410 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9411 const unsigned EndLane = State.VF.getKnownMinValue(); 9412 for (unsigned Part = 0; Part < State.UF; ++Part) 9413 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9414 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); 9415 } 9416 9417 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9418 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9419 9420 // Attempt to issue a wide load. 9421 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9422 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9423 9424 assert((LI || SI) && "Invalid Load/Store instruction"); 9425 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9426 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9427 9428 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9429 9430 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9431 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9432 bool CreateGatherScatter = !isConsecutive(); 9433 9434 auto &Builder = State.Builder; 9435 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9436 bool isMaskRequired = getMask(); 9437 if (isMaskRequired) { 9438 // Mask reversal is only needed for non-all-one (null) masks, as reverse of 9439 // a null all-one mask is a null mask. 
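    // (An all-true mask is represented by the absence of a mask VPValue, so
    // only "real" masks ever reach this point and may need reversing.)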
9440 for (unsigned Part = 0; Part < State.UF; ++Part) { 9441 Value *Mask = State.get(getMask(), Part); 9442 if (isReverse()) 9443 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 9444 BlockInMaskParts[Part] = Mask; 9445 } 9446 } 9447 9448 // Handle Stores: 9449 if (SI) { 9450 State.setDebugLocFrom(SI->getDebugLoc()); 9451 9452 for (unsigned Part = 0; Part < State.UF; ++Part) { 9453 Instruction *NewSI = nullptr; 9454 Value *StoredVal = State.get(StoredValue, Part); 9455 if (CreateGatherScatter) { 9456 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9457 Value *VectorGep = State.get(getAddr(), Part); 9458 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9459 MaskPart); 9460 } else { 9461 if (isReverse()) { 9462 // If we store to reverse consecutive memory locations, then we need 9463 // to reverse the order of elements in the stored value. 9464 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9465 // We don't want to update the value in the map as it might be used in 9466 // another expression. So don't call resetVectorValue(StoredVal). 9467 } 9468 auto *VecPtr = State.get(getAddr(), Part); 9469 if (isMaskRequired) 9470 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9471 BlockInMaskParts[Part]); 9472 else 9473 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9474 } 9475 State.addMetadata(NewSI, SI); 9476 } 9477 return; 9478 } 9479 9480 // Handle loads. 9481 assert(LI && "Must have a load instruction"); 9482 State.setDebugLocFrom(LI->getDebugLoc()); 9483 for (unsigned Part = 0; Part < State.UF; ++Part) { 9484 Value *NewLI; 9485 if (CreateGatherScatter) { 9486 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9487 Value *VectorGep = State.get(getAddr(), Part); 9488 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9489 nullptr, "wide.masked.gather"); 9490 State.addMetadata(NewLI, LI); 9491 } else { 9492 auto *VecPtr = State.get(getAddr(), Part); 9493 if (isMaskRequired) 9494 NewLI = Builder.CreateMaskedLoad( 9495 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9496 PoisonValue::get(DataTy), "wide.masked.load"); 9497 else 9498 NewLI = 9499 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9500 9501 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9502 State.addMetadata(NewLI, LI); 9503 if (Reverse) 9504 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9505 } 9506 9507 State.set(getVPSingleValue(), NewLI, Part); 9508 } 9509 } 9510 9511 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9512 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9513 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9514 // for predication. 9515 static ScalarEpilogueLowering getScalarEpilogueLowering( 9516 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9517 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9518 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9519 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9520 // don't look at hints or options, and don't request a scalar epilogue. 
9521 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9522 // LoopAccessInfo (due to code dependency and not being able to reliably get 9523 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9524 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9525 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9526 // back to the old way and vectorize with versioning when forced. See D81345.) 9527 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9528 PGSOQueryType::IRPass) && 9529 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9530 return CM_ScalarEpilogueNotAllowedOptSize; 9531 9532 // 2) If set, obey the directives 9533 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9534 switch (PreferPredicateOverEpilogue) { 9535 case PreferPredicateTy::ScalarEpilogue: 9536 return CM_ScalarEpilogueAllowed; 9537 case PreferPredicateTy::PredicateElseScalarEpilogue: 9538 return CM_ScalarEpilogueNotNeededUsePredicate; 9539 case PreferPredicateTy::PredicateOrDontVectorize: 9540 return CM_ScalarEpilogueNotAllowedUsePredicate; 9541 }; 9542 } 9543 9544 // 3) If set, obey the hints 9545 switch (Hints.getPredicate()) { 9546 case LoopVectorizeHints::FK_Enabled: 9547 return CM_ScalarEpilogueNotNeededUsePredicate; 9548 case LoopVectorizeHints::FK_Disabled: 9549 return CM_ScalarEpilogueAllowed; 9550 }; 9551 9552 // 4) if the TTI hook indicates this is profitable, request predication. 9553 TailFoldingInfo TFI(TLI, &LVL, IAI); 9554 if (TTI->preferPredicateOverEpilogue(&TFI)) 9555 return CM_ScalarEpilogueNotNeededUsePredicate; 9556 9557 return CM_ScalarEpilogueAllowed; 9558 } 9559 9560 // Process the loop in the VPlan-native vectorization path. This path builds 9561 // VPlan upfront in the vectorization pipeline, which allows to apply 9562 // VPlan-to-VPlan transformations from the very beginning without modifying the 9563 // input LLVM IR. 9564 static bool processLoopInVPlanNativePath( 9565 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9566 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9567 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9568 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9569 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9570 LoopVectorizationRequirements &Requirements) { 9571 9572 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9573 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9574 return false; 9575 } 9576 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9577 Function *F = L->getHeader()->getParent(); 9578 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9579 9580 ScalarEpilogueLowering SEL = 9581 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 9582 9583 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9584 &Hints, IAI); 9585 // Use the planner for outer loop vectorization. 9586 // TODO: CM is not used at this point inside the planner. Turn CM into an 9587 // optional argument if we don't need it in the future. 9588 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, 9589 ORE); 9590 9591 // Get user vectorization factor. 9592 ElementCount UserVF = Hints.getWidth(); 9593 9594 CM.collectElementTypesForWidening(); 9595 9596 // Plan how to best vectorize, return the best VF and its cost. 
9597 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9598 9599 // If we are stress testing VPlan builds, do not attempt to generate vector 9600 // code. Masked vector code generation support will follow soon. 9601 // Also, do not attempt to vectorize if no vector code will be produced. 9602 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 9603 return false; 9604 9605 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 9606 9607 { 9608 bool AddBranchWeights = 9609 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 9610 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 9611 F->getParent()->getDataLayout(), AddBranchWeights); 9612 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 9613 VF.Width, 1, LVL, &CM, BFI, PSI, Checks); 9614 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9615 << L->getHeader()->getParent()->getName() << "\"\n"); 9616 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 9617 } 9618 9619 reportVectorization(ORE, L, VF, 1); 9620 9621 // Mark the loop as already vectorized to avoid vectorizing again. 9622 Hints.setAlreadyVectorized(); 9623 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9624 return true; 9625 } 9626 9627 // Emit a remark if there are stores to floats that required a floating point 9628 // extension. If the vectorized loop was generated with floating point there 9629 // will be a performance penalty from the conversion overhead and the change in 9630 // the vector width. 9631 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9632 SmallVector<Instruction *, 4> Worklist; 9633 for (BasicBlock *BB : L->getBlocks()) { 9634 for (Instruction &Inst : *BB) { 9635 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9636 if (S->getValueOperand()->getType()->isFloatTy()) 9637 Worklist.push_back(S); 9638 } 9639 } 9640 } 9641 9642 // Traverse the floating point stores upwards searching, for floating point 9643 // conversions. 9644 SmallPtrSet<const Instruction *, 4> Visited; 9645 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9646 while (!Worklist.empty()) { 9647 auto *I = Worklist.pop_back_val(); 9648 if (!L->contains(I)) 9649 continue; 9650 if (!Visited.insert(I).second) 9651 continue; 9652 9653 // Emit a remark if the floating point store required a floating 9654 // point conversion. 9655 // TODO: More work could be done to identify the root cause such as a 9656 // constant or a function return type and point the user to it. 9657 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9658 ORE->emit([&]() { 9659 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9660 I->getDebugLoc(), L->getHeader()) 9661 << "floating point conversion changes vector width. " 9662 << "Mixed floating point precision requires an up/down " 9663 << "cast that will negatively impact performance."; 9664 }); 9665 9666 for (Use &Op : I->operands()) 9667 if (auto *OpI = dyn_cast<Instruction>(Op)) 9668 Worklist.push_back(OpI); 9669 } 9670 } 9671 9672 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 9673 VectorizationFactor &VF, 9674 std::optional<unsigned> VScale, Loop *L, 9675 ScalarEvolution &SE, 9676 ScalarEpilogueLowering SEL) { 9677 InstructionCost CheckCost = Checks.getCost(); 9678 if (!CheckCost.isValid()) 9679 return false; 9680 9681 // When interleaving only scalar and vector cost will be equal, which in turn 9682 // would lead to a divide by 0. Fall back to hard threshold. 
9683 if (VF.Width.isScalar()) { 9684 if (CheckCost > VectorizeMemoryCheckThreshold) { 9685 LLVM_DEBUG( 9686 dbgs() 9687 << "LV: Interleaving only is not profitable due to runtime checks\n"); 9688 return false; 9689 } 9690 return true; 9691 } 9692 9693 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 9694 double ScalarC = *VF.ScalarCost.getValue(); 9695 if (ScalarC == 0) 9696 return true; 9697 9698 // First, compute the minimum iteration count required so that the vector 9699 // loop outperforms the scalar loop. 9700 // The total cost of the scalar loop is 9701 // ScalarC * TC 9702 // where 9703 // * TC is the actual trip count of the loop. 9704 // * ScalarC is the cost of a single scalar iteration. 9705 // 9706 // The total cost of the vector loop is 9707 // RtC + VecC * (TC / VF) + EpiC 9708 // where 9709 // * RtC is the cost of the generated runtime checks 9710 // * VecC is the cost of a single vector iteration. 9711 // * TC is the actual trip count of the loop 9712 // * VF is the vectorization factor 9713 // * EpiCost is the cost of the generated epilogue, including the cost 9714 // of the remaining scalar operations. 9715 // 9716 // Vectorization is profitable once the total vector cost is less than the 9717 // total scalar cost: 9718 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 9719 // 9720 // Now we can compute the minimum required trip count TC as 9721 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC 9722 // 9723 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 9724 // the computations are performed on doubles, not integers and the result 9725 // is rounded up, hence we get an upper estimate of the TC. 9726 unsigned IntVF = VF.Width.getKnownMinValue(); 9727 if (VF.Width.isScalable()) { 9728 unsigned AssumedMinimumVscale = 1; 9729 if (VScale) 9730 AssumedMinimumVscale = *VScale; 9731 IntVF *= AssumedMinimumVscale; 9732 } 9733 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; 9734 double RtC = *CheckCost.getValue(); 9735 double MinTC1 = RtC / (ScalarC - VecCOverVF); 9736 9737 // Second, compute a minimum iteration count so that the cost of the 9738 // runtime checks is only a fraction of the total scalar loop cost. This 9739 // adds a loop-dependent bound on the overhead incurred if the runtime 9740 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 9741 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 9742 // cost, compute 9743 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 9744 double MinTC2 = RtC * 10 / ScalarC; 9745 9746 // Now pick the larger minimum. If it is not a multiple of VF and a scalar 9747 // epilogue is allowed, choose the next closest multiple of VF. This should 9748 // partly compensate for ignoring the epilogue cost. 9749 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); 9750 if (SEL == CM_ScalarEpilogueAllowed) 9751 MinTC = alignTo(MinTC, IntVF); 9752 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); 9753 9754 LLVM_DEBUG( 9755 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 9756 << VF.MinProfitableTripCount << "\n"); 9757 9758 // Skip vectorization if the expected trip count is less than the minimum 9759 // required trip count. 
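  // Worked example with hypothetical costs (not taken from any particular
  // target): ScalarC = 4, VecC = 10, VF = 4 (so VecC / VF = 2.5), RtC = 30.
  //   MinTC1 = 30 / (4 - 2.5) = 20   (break-even point against the scalar loop)
  //   MinTC2 = 30 * 10 / 4    = 75   (bounds the checks to ~1/10 of the scalar cost)
  // MinTC = ceil(max(20, 75)) = 75, aligned up to 76 when a scalar epilogue is
  // allowed so that it is a multiple of VF. An expected trip count of, say, 50
  // would therefore be rejected by the check below.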
9760 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { 9761 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 9762 VF.MinProfitableTripCount)) { 9763 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 9764 "trip count < minimum profitable VF (" 9765 << *ExpectedTC << " < " << VF.MinProfitableTripCount 9766 << ")\n"); 9767 9768 return false; 9769 } 9770 } 9771 return true; 9772 } 9773 9774 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9775 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9776 !EnableLoopInterleaving), 9777 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9778 !EnableLoopVectorization) {} 9779 9780 bool LoopVectorizePass::processLoop(Loop *L) { 9781 assert((EnableVPlanNativePath || L->isInnermost()) && 9782 "VPlan-native path is not enabled. Only process inner loops."); 9783 9784 #ifndef NDEBUG 9785 const std::string DebugLocStr = getDebugLocString(L); 9786 #endif /* NDEBUG */ 9787 9788 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 9789 << L->getHeader()->getParent()->getName() << "' from " 9790 << DebugLocStr << "\n"); 9791 9792 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 9793 9794 LLVM_DEBUG( 9795 dbgs() << "LV: Loop hints:" 9796 << " force=" 9797 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9798 ? "disabled" 9799 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9800 ? "enabled" 9801 : "?")) 9802 << " width=" << Hints.getWidth() 9803 << " interleave=" << Hints.getInterleave() << "\n"); 9804 9805 // Function containing loop 9806 Function *F = L->getHeader()->getParent(); 9807 9808 // Looking at the diagnostic output is the only way to determine if a loop 9809 // was vectorized (other than looking at the IR or machine code), so it 9810 // is important to generate an optimization remark for each loop. Most of 9811 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9812 // generated as OptimizationRemark and OptimizationRemarkMissed are 9813 // less verbose reporting vectorized loops and unvectorized loops that may 9814 // benefit from vectorization, respectively. 9815 9816 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9817 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9818 return false; 9819 } 9820 9821 PredicatedScalarEvolution PSE(*SE, *L); 9822 9823 // Check if it is legal to vectorize the loop. 9824 LoopVectorizationRequirements Requirements; 9825 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 9826 &Requirements, &Hints, DB, AC, BFI, PSI); 9827 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9828 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9829 Hints.emitRemarkWithHints(); 9830 return false; 9831 } 9832 9833 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9834 // here. They may require CFG and instruction level transformations before 9835 // even evaluating whether vectorization is profitable. Since we cannot modify 9836 // the incoming IR, we need to build VPlan upfront in the vectorization 9837 // pipeline. 
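  // Illustrative example of a candidate for this path (assuming the pass was
  // run with -enable-vplan-native-path and the annotation sits on the outer
  // loop):
  //
  //   #pragma clang loop vectorize(enable)
  //   for (int i = 0; i < N; ++i)        // outer loop, taken below
  //     for (int j = 0; j < M; ++j)
  //       A[i][j] = B[i][j] + C[i][j];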
9838   if (!L->isInnermost())
9839     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9840                                         ORE, BFI, PSI, Hints, Requirements);
9841
9842   assert(L->isInnermost() && "Inner loop expected.");
9843
9844   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9845   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9846
9847   // If an override option has been passed in for interleaved accesses, use it.
9848   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9849     UseInterleaved = EnableInterleavedMemAccesses;
9850
9851   // Analyze interleaved memory accesses.
9852   if (UseInterleaved)
9853     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9854
9855   // Check the function attributes and profiles to find out if this function
9856   // should be optimized for size.
9857   ScalarEpilogueLowering SEL =
9858       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9859
9860   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9861   // count by optimizing for size, to minimize overheads.
9862   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9863   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9864     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9865                       << "This loop is worth vectorizing only if no scalar "
9866                       << "iteration overheads are incurred.");
9867     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9868       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9869     else {
9870       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9871         LLVM_DEBUG(dbgs() << "\n");
9872         // Predicate tail-folded loops are efficient even when the loop
9873         // iteration count is low. However, setting the epilogue policy to
9874         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9875         // with runtime checks. It's more effective to let
9876         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9877         // for the loop.
9878         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9879           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9880       } else {
9881         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9882                              "small to be worth vectorizing.\n");
9883         reportVectorizationFailure(
9884             "The trip count is below the minimal threshold value.",
9885             "loop trip count is too low, avoiding vectorization",
9886             "LowTripCount", ORE, L);
9887         Hints.emitRemarkWithHints();
9888         return false;
9889       }
9890     }
9891   }
9892
9893   // Check the function attributes to see if implicit floats or vectors are
9894   // allowed.
9895   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9896     reportVectorizationFailure(
9897         "Can't vectorize when the NoImplicitFloat attribute is used",
9898         "loop not vectorized due to NoImplicitFloat attribute",
9899         "NoImplicitFloat", ORE, L);
9900     Hints.emitRemarkWithHints();
9901     return false;
9902   }
9903
9904   // Check if the target supports potentially unsafe FP vectorization.
9905   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9906   // for the target we're vectorizing for, to make sure none of the
9907   // additional fp-math flags can help.
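  // A classic example is a target whose vector FP unit flushes denormals to
  // zero while its scalar unit does not (e.g. ARMv7 NEON); widening FP
  // operations there can change observable results, which is what the
  // conservative check below guards against.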
9908 if (Hints.isPotentiallyUnsafe() && 9909 TTI->isFPVectorizationPotentiallyUnsafe()) { 9910 reportVectorizationFailure( 9911 "Potentially unsafe FP op prevents vectorization", 9912 "loop not vectorized due to unsafe FP support.", 9913 "UnsafeFP", ORE, L); 9914 Hints.emitRemarkWithHints(); 9915 return false; 9916 } 9917 9918 bool AllowOrderedReductions; 9919 // If the flag is set, use that instead and override the TTI behaviour. 9920 if (ForceOrderedReductions.getNumOccurrences() > 0) 9921 AllowOrderedReductions = ForceOrderedReductions; 9922 else 9923 AllowOrderedReductions = TTI->enableOrderedReductions(); 9924 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 9925 ORE->emit([&]() { 9926 auto *ExactFPMathInst = Requirements.getExactFPInst(); 9927 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 9928 ExactFPMathInst->getDebugLoc(), 9929 ExactFPMathInst->getParent()) 9930 << "loop not vectorized: cannot prove it is safe to reorder " 9931 "floating-point operations"; 9932 }); 9933 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 9934 "reorder floating-point operations\n"); 9935 Hints.emitRemarkWithHints(); 9936 return false; 9937 } 9938 9939 // Use the cost model. 9940 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9941 F, &Hints, IAI); 9942 // Use the planner for vectorization. 9943 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, 9944 ORE); 9945 9946 // Get user vectorization factor and interleave count. 9947 ElementCount UserVF = Hints.getWidth(); 9948 unsigned UserIC = Hints.getInterleave(); 9949 9950 // Plan how to best vectorize, return the best VF and its cost. 9951 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9952 9953 VectorizationFactor VF = VectorizationFactor::Disabled(); 9954 unsigned IC = 1; 9955 9956 bool AddBranchWeights = 9957 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 9958 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 9959 F->getParent()->getDataLayout(), AddBranchWeights); 9960 if (MaybeVF) { 9961 VF = *MaybeVF; 9962 // Select the interleave count. 9963 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 9964 9965 unsigned SelectedIC = std::max(IC, UserIC); 9966 // Optimistically generate runtime checks if they are needed. Drop them if 9967 // they turn out to not be profitable. 9968 if (VF.Width.isVector() || SelectedIC > 1) 9969 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 9970 9971 // Check if it is profitable to vectorize with runtime checks. 9972 bool ForceVectorization = 9973 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 9974 if (!ForceVectorization && 9975 !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, 9976 *PSE.getSE(), SEL)) { 9977 ORE->emit([&]() { 9978 return OptimizationRemarkAnalysisAliasing( 9979 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 9980 L->getHeader()) 9981 << "loop not vectorized: cannot prove it is safe to reorder " 9982 "memory operations"; 9983 }); 9984 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 9985 Hints.emitRemarkWithHints(); 9986 return false; 9987 } 9988 } 9989 9990 // Identify the diagnostic messages that should be produced. 
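  // These strings feed the remark infrastructure; e.g. compiling with
  // -Rpass-missed=loop-vectorize (or -Rpass-analysis=loop-vectorize) might
  // surface them as something like the following, for a loop that is neither
  // vectorized nor interleaved (illustrative output only):
  //
  //   remark: ...: the cost-model indicates that vectorization is not beneficial
  //   remark: ...: the cost-model indicates that interleaving is not beneficial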
9991   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9992   bool VectorizeLoop = true, InterleaveLoop = true;
9993   if (VF.Width.isScalar()) {
9994     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9995     VecDiagMsg = std::make_pair(
9996         "VectorizationNotBeneficial",
9997         "the cost-model indicates that vectorization is not beneficial");
9998     VectorizeLoop = false;
9999   }
10000
10001   if (!MaybeVF && UserIC > 1) {
10002     // Tell the user interleaving was avoided up-front, despite being explicitly
10003     // requested.
10004     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10005                          "interleaving should be avoided up front\n");
10006     IntDiagMsg = std::make_pair(
10007         "InterleavingAvoided",
10008         "Ignoring UserIC, because interleaving was avoided up front");
10009     InterleaveLoop = false;
10010   } else if (IC == 1 && UserIC <= 1) {
10011     // Tell the user interleaving is not beneficial.
10012     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10013     IntDiagMsg = std::make_pair(
10014         "InterleavingNotBeneficial",
10015         "the cost-model indicates that interleaving is not beneficial");
10016     InterleaveLoop = false;
10017     if (UserIC == 1) {
10018       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10019       IntDiagMsg.second +=
10020           " and is explicitly disabled or interleave count is set to 1";
10021     }
10022   } else if (IC > 1 && UserIC == 1) {
10023     // Tell the user interleaving is beneficial but explicitly disabled.
10024     LLVM_DEBUG(
10025         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10026     IntDiagMsg = std::make_pair(
10027         "InterleavingBeneficialButDisabled",
10028         "the cost-model indicates that interleaving is beneficial "
10029         "but is explicitly disabled or interleave count is set to 1");
10030     InterleaveLoop = false;
10031   }
10032
10033   // Override IC if user provided an interleave count.
10034   IC = UserIC > 0 ? UserIC : IC;
10035
10036   // Emit diagnostic messages, if any.
10037   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10038   if (!VectorizeLoop && !InterleaveLoop) {
10039     // Do not vectorize or interleave the loop.
10040     ORE->emit([&]() {
10041       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10042                                       L->getStartLoc(), L->getHeader())
10043              << VecDiagMsg.second;
10044     });
10045     ORE->emit([&]() {
10046       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10047                                       L->getStartLoc(), L->getHeader())
10048              << IntDiagMsg.second;
10049     });
10050     return false;
10051   } else if (!VectorizeLoop && InterleaveLoop) {
10052     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10053     ORE->emit([&]() {
10054       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10055                                         L->getStartLoc(), L->getHeader())
10056              << VecDiagMsg.second;
10057     });
10058   } else if (VectorizeLoop && !InterleaveLoop) {
10059     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10060                       << ") in " << DebugLocStr << '\n');
10061     ORE->emit([&]() {
10062       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10063                                         L->getStartLoc(), L->getHeader())
10064              << IntDiagMsg.second;
10065     });
10066   } else if (VectorizeLoop && InterleaveLoop) {
10067     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10068                       << ") in " << DebugLocStr << '\n');
10069     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10070   }
10071
10072   bool DisableRuntimeUnroll = false;
10073   MDNode *OrigLoopID = L->getLoopID();
10074   {
10075     using namespace ore;
10076     if (!VectorizeLoop) {
10077       assert(IC > 1 && "interleave count should not be 1 or 0");
10078       // If we decided that it is not profitable to vectorize the loop, then
10079       // interleave it.
10080       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10081                                  &CM, BFI, PSI, Checks);
10082
10083       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10084       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10085
10086       ORE->emit([&]() {
10087         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10088                                   L->getHeader())
10089                << "interleaved loop (interleaved count: "
10090                << NV("InterleaveCount", IC) << ")";
10091       });
10092     } else {
10093       // If we decided that it is *profitable* to vectorize the loop, then do it.
10094
10095       // Consider vectorizing the epilogue too if it's profitable.
10096       VectorizationFactor EpilogueVF =
10097           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10098       if (EpilogueVF.Width.isVector()) {
10099
10100         // The first pass vectorizes the main loop and creates a scalar epilogue
10101         // to be vectorized by executing the plan (potentially with a different
10102         // factor) again shortly afterwards.
10103         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10104         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10105                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10106
10107         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10108         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10109             EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
10110         ++LoopsVectorized;
10111
10112         // Second pass vectorizes the epilogue and adjusts the control flow
10113         // edges from the first pass.
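        // Conceptually the final layout contains three loops (sketch only;
        // only "vec.epilog.vector.body" below is a name actually set by this
        // code):
        //
        //   vector.body             main loop, VF = EPI.MainLoopVF, UF = EPI.MainLoopUF
        //   vec.epilog.vector.body  epilogue loop, VF = EPI.EpilogueVF, UF = EPI.EpilogueUF
        //   scalar remainder        handles the last few iterations and acts as
        //                           the fall-back when the iteration-count
        //                           checks skip one of the vector loops.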
10114 EPI.MainLoopVF = EPI.EpilogueVF; 10115 EPI.MainLoopUF = EPI.EpilogueUF; 10116 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10117 ORE, EPI, &LVL, &CM, BFI, PSI, 10118 Checks); 10119 10120 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10121 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10122 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10123 Header->setName("vec.epilog.vector.body"); 10124 10125 // Re-use the trip count and steps expanded for the main loop, as 10126 // skeleton creation needs it as a value that dominates both the scalar 10127 // and vector epilogue loops 10128 // TODO: This is a workaround needed for epilogue vectorization and it 10129 // should be removed once induction resume value creation is done 10130 // directly in VPlan. 10131 EpilogILV.setTripCount(MainILV.getTripCount()); 10132 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { 10133 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R); 10134 auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn( 10135 ExpandedSCEVs.find(ExpandR->getSCEV())->second); 10136 ExpandR->replaceAllUsesWith(ExpandedVal); 10137 ExpandR->eraseFromParent(); 10138 } 10139 10140 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, 10141 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated 10142 // before vectorizing the epilogue loop. 10143 for (VPRecipeBase &R : Header->phis()) { 10144 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10145 continue; 10146 10147 Value *ResumeV = nullptr; 10148 // TODO: Move setting of resume values to prepareToExecute. 10149 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10150 ResumeV = ReductionResumeValues 10151 .find(&ReductionPhi->getRecurrenceDescriptor()) 10152 ->second; 10153 } else { 10154 // Create induction resume values for both widened pointer and 10155 // integer/fp inductions and update the start value of the induction 10156 // recipes to use the resume value. 10157 PHINode *IndPhi = nullptr; 10158 const InductionDescriptor *ID; 10159 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) { 10160 IndPhi = cast<PHINode>(Ind->getUnderlyingValue()); 10161 ID = &Ind->getInductionDescriptor(); 10162 } else { 10163 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R); 10164 IndPhi = WidenInd->getPHINode(); 10165 ID = &WidenInd->getInductionDescriptor(); 10166 } 10167 10168 ResumeV = MainILV.createInductionResumeValue( 10169 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), 10170 {EPI.MainLoopIterationCountCheck}); 10171 } 10172 assert(ResumeV && "Must have a resume value"); 10173 VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV); 10174 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10175 } 10176 10177 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10178 DT, true, &ExpandedSCEVs); 10179 ++LoopsEpilogueVectorized; 10180 10181 if (!MainILV.areSafetyChecksAdded()) 10182 DisableRuntimeUnroll = true; 10183 } else { 10184 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10185 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10186 PSI, Checks); 10187 10188 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10189 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10190 ++LoopsVectorized; 10191 10192 // Add metadata to disable runtime unrolling a scalar loop when there 10193 // are no runtime checks about strides and memory. A scalar loop that is 10194 // rarely used is not worth unrolling. 
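        // When that happens, AddRuntimeUnrollDisableMetaData later attaches
        // loop metadata to the scalar remainder loop roughly of the form
        // (illustrative IR):
        //
        //   br i1 %cond, label %exit, label %scalar.body, !llvm.loop !0
        //   !0 = distinct !{!0, !1}
        //   !1 = !{!"llvm.loop.unroll.runtime.disable"}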
10195       if (!LB.areSafetyChecksAdded())
10196         DisableRuntimeUnroll = true;
10197     }
10198     // Report the vectorization decision.
10199     reportVectorization(ORE, L, VF, IC);
10200   }
10201
10202   if (ORE->allowExtraAnalysis(LV_NAME))
10203     checkMixedPrecision(L, ORE);
10204   }
10205
10206   std::optional<MDNode *> RemainderLoopID =
10207       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10208                                       LLVMLoopVectorizeFollowupEpilogue});
10209   if (RemainderLoopID) {
10210     L->setLoopID(*RemainderLoopID);
10211   } else {
10212     if (DisableRuntimeUnroll)
10213       AddRuntimeUnrollDisableMetaData(L);
10214
10215     // Mark the loop as already vectorized to avoid vectorizing again.
10216     Hints.setAlreadyVectorized();
10217   }
10218
10219   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10220   return true;
10221 }
10222
10223 LoopVectorizeResult LoopVectorizePass::runImpl(
10224     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10225     DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10226     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10227     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10228   SE = &SE_;
10229   LI = &LI_;
10230   TTI = &TTI_;
10231   DT = &DT_;
10232   BFI = BFI_;
10233   TLI = TLI_;
10234   AC = &AC_;
10235   LAIs = &LAIs_;
10236   DB = &DB_;
10237   ORE = &ORE_;
10238   PSI = PSI_;
10239
10240   // Don't attempt if
10241   // 1. the target claims to have no vector registers, and
10242   // 2. interleaving won't help ILP.
10243   //
10244   // The second condition is necessary because, even if the target has no
10245   // vector registers, loop vectorization may still enable scalar
10246   // interleaving.
10247   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10248       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10249     return LoopVectorizeResult(false, false);
10250
10251   bool Changed = false, CFGChanged = false;
10252
10253   // The vectorizer requires loops to be in simplified form.
10254   // Since simplification may add new inner loops, it has to run before the
10255   // legality and profitability checks. This means running the loop vectorizer
10256   // will simplify all loops, regardless of whether anything ends up being
10257   // vectorized.
10258   for (const auto &L : *LI)
10259     Changed |= CFGChanged |=
10260         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10261
10262   // Build up a worklist of inner-loops to vectorize. This is necessary as
10263   // the act of vectorizing or partially unrolling a loop creates new loops
10264   // and can invalidate iterators across the loops.
10265   SmallVector<Loop *, 8> Worklist;
10266
10267   for (Loop *L : *LI)
10268     collectSupportedLoops(*L, LI, ORE, Worklist);
10269
10270   LoopsAnalyzed += Worklist.size();
10271
10272   // Now walk the identified inner loops.
10273   while (!Worklist.empty()) {
10274     Loop *L = Worklist.pop_back_val();
10275
10276     // For the inner loops we actually process, form LCSSA to simplify the
10277     // transform.
10278     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10279
10280     Changed |= CFGChanged |= processLoop(L);
10281
10282     if (Changed) {
10283       LAIs->clear();
10284
10285 #ifndef NDEBUG
10286       if (VerifySCEV)
10287         SE->verify();
10288 #endif
10289     }
10290   }
10291
10292
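  // At this point Changed covers every IR modification made above (including
  // the loop-simplify and LCSSA clean-ups), while CFGChanged only tracks the
  // calls that may restructure control flow; the caller uses the pair to
  // decide which analyses can be preserved.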
10293   return LoopVectorizeResult(Changed, CFGChanged);
10294 }
10295
10296 PreservedAnalyses LoopVectorizePass::run(Function &F,
10297                                          FunctionAnalysisManager &AM) {
10298   auto &LI = AM.getResult<LoopAnalysis>(F);
10299   // If there are no loops in the function, return before computing other
10300   // expensive analyses.
10301   if (LI.empty())
10302     return PreservedAnalyses::all();
10303   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10304   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10305   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10306   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10307   auto &AC = AM.getResult<AssumptionAnalysis>(F);
10308   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10309   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10310
10311   LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10312   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10313   ProfileSummaryInfo *PSI =
10314       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10315   BlockFrequencyInfo *BFI = nullptr;
10316   if (PSI && PSI->hasProfileSummary())
10317     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10318   LoopVectorizeResult Result =
10319       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10320   if (!Result.MadeAnyChange)
10321     return PreservedAnalyses::all();
10322   PreservedAnalyses PA;
10323
10324   if (isAssignmentTrackingEnabled(*F.getParent())) {
10325     for (auto &BB : F)
10326       RemoveRedundantDbgInstrs(&BB);
10327   }
10328
10329   // We currently do not preserve loopinfo/dominator analyses with outer loop
10330   // vectorization. Until this is addressed, mark these analyses as preserved
10331   // only for non-VPlan-native path.
10332   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10333   if (!EnableVPlanNativePath) {
10334     PA.preserve<LoopAnalysis>();
10335     PA.preserve<DominatorTreeAnalysis>();
10336     PA.preserve<ScalarEvolutionAnalysis>();
10337   }
10338
10339   if (Result.MadeCFGChange) {
10340     // Making CFG changes likely means a loop got vectorized. Indicate that
10341     // extra simplification passes should be run.
10342     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10343     // be run if runtime checks have been added.
10344     AM.getResult<ShouldRunExtraVectorPasses>(F);
10345     PA.preserve<ShouldRunExtraVectorPasses>();
10346   } else {
10347     PA.preserveSet<CFGAnalyses>();
10348   }
10349   return PA;
10350 }
10351
10352 void LoopVectorizePass::printPipeline(
10353     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10354   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10355       OS, MapClassName2PassName);
10356
10357   OS << '<';
10358   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10359   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10360   OS << '>';
10361 }
10362
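//===----------------------------------------------------------------------===//
//
// Usage note (illustrative, not exhaustive): besides running as part of the
// default optimization pipelines, the pass can be invoked directly through the
// new pass manager, e.g.:
//
//   opt -passes='loop-vectorize' input.ll -S
//   opt -passes='loop-vectorize<no-interleave-forced-only;vectorize-forced-only>' \
//       input.ll -S
//
// The angle-bracket parameters correspond to the two options printed by
// printPipeline above (InterleaveOnlyWhenForced / VectorizeOnlyWhenForced).
//
//===----------------------------------------------------------------------===//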