//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has several parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
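// As an illustrative sketch only (the C-like pseudo code, VF = 4, and the
// a[i:i+4] slice notation are assumptions for exposition, not code produced
// verbatim by this pass), combining consecutive iterations into one 'wide'
// iteration turns
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + 1;
//
// into a vector body whose induction variable steps by the vector width,
// followed by a scalar epilogue for the remaining n % 4 iterations:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i:i+4] = b[i:i+4] + <1, 1, 1, 1>;  // one 'wide' iteration
//   for (; i < n; i++)
//     a[i] = b[i] + 1;                     // scalar remainder (epilogue)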
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 143 #include <algorithm> 144 #include <cassert> 145 #include <cmath> 146 #include <cstdint> 147 #include <functional> 148 #include <iterator> 149 #include <limits> 150 #include <map> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold( 202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks")); 204 205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 206 // that predication is preferred, and this lists all options. I.e., the 207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 208 // and predicate the instructions accordingly. 
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(
            TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
            "data-and-control-without-rt-check",
            "Similar to data-and-control, but remove the runtime check")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
} // namespace llvm

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
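// For example (illustrative, assuming a typical data layout): i1 has a type
// size of 1 bit but an alloc size of 8 bits, and x86_fp80 has a type size of
// 80 bits but an alloc size of 96 or 128 bits, so both are irregular; i32,
// with matching 32-bit sizes, is not.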
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
                                                   Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return std::nullopt;
}

/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
    return Builder.CreateIntrinsic(
        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
        /*FMFSource=*/nullptr, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}
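// As an illustrative sketch (values chosen for exposition): interleaving the
// two fixed-length vectors <a0, a1> and <b0, b1> (Factor = 2, NumElts = 2)
// first concatenates them into <a0, a1, b0, b1> and then applies the shuffle
// mask <0, 2, 1, 3> returned by createInterleaveMask(2, 2), producing
// <a0, b0, a1, b1>.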
namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops. \p ExpandedSCEVs is used to
  /// look up SCEV expansions for expressions needed during skeleton creation.
  virtual std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and
  /// more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance,
                            VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask, bool NeedsMaskForGaps);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a new phi node for the induction variable \p OrigPhi to resume
  /// iteration count in the scalar epilogue, from where the vectorized loop
  /// left off. \p Step is the SCEV-expanded induction step to use. In cases
  /// where the loop skeleton is more complicated (i.e., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  PHINode *createInductionResumeValue(
      PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
      ArrayRef<BasicBlock *> BypassBlocks,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
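  // For example (illustrative values): for a loop 'for (i = 0; i < n; ++i)'
  // whose main vector loop processes VF * UF = 8 iterations at a time, the
  // resume phi for i in the scalar preheader receives the value n - (n % 8) on
  // the edge from the middle block, so the scalar epilogue continues exactly
  // where the vector loop left off.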
  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan, VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);
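  // Illustrative example only (the C-like snippet is an assumption for
  // exposition): given
  //
  //   if (cond[i])
  //     sum += a[i];
  //
  // if the load of a[i] is widened to an unpredicated consecutive vector load,
  // poison-generating flags (such as 'inbounds' on the address computation
  // feeding it) must be dropped, because the wide load also reads lanes for
  // which cond[i] is false and which the scalar loop would never have
  // accessed.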
  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
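// A simplified sketch of how the blocks recorded above fit together (this is
// illustrative only; the exact CFG is produced by the strategy classes
// declared below): the main iteration-count check and the SCEV/memory safety
// checks guard the main vector loop (MainLoopVF x MainLoopUF); a second
// iteration-count check then decides whether the remaining iterations run in
// the vector epilogue loop (EpilogueVF x EpilogueUF) or fall through to the
// scalar remainder loop.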
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
      const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return DebugLoc();

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I->getDebugLoc();

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst->getDebugLoc();
  }

  return I->getDebugLoc();
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}
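// For example (illustrative values): with Ty = i64, VF = <vscale x 4> and
// Step = 2, createStepForVF emits the runtime value vscale * 8, while for a
// fixed VF = 4 it folds to the constant i64 8; getRuntimeVF corresponds to the
// Step == 1 case.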
const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                Loop *OrigLoop) {
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  ScalarEvolution &SE = *PSE.getSE();
  return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {
  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. If the underlying instruction has poison-generating flags,
      // drop them directly.
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
        RecWithFlags->dropPoisonGeneratingFlags();
      } else {
        Instruction *Instr = dyn_cast_or_null<Instruction>(
            CurRec->getVPSingleValue()->getUnderlyingValue());
        (void)Instr;
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");
      }

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe
  // or VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(State.Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

using InstructionVFPair = std::pair<Instruction *, ElementCount>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(I))
      return false;

    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }
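  // For example (illustrative): the pointer operand of a consecutive widened
  // load is uniform after vectorization, because only the address of the first
  // lane is needed to form the wide load, even though the scalar loop computes
  // a distinct address on every iteration.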
1326 if (EnableVPlanNativePath)
1327 return false;
1328
1329 auto UniformsPerVF = Uniforms.find(VF);
1330 assert(UniformsPerVF != Uniforms.end() &&
1331 "VF not yet analyzed for uniformity");
1332 return UniformsPerVF->second.count(I);
1333 }
1334
1335 /// Returns true if \p I is known to be scalar after vectorization.
1336 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1337 if (VF.isScalar())
1338 return true;
1339
1340 // Cost model is not run in the VPlan-native path - return conservative
1341 // result until this changes.
1342 if (EnableVPlanNativePath)
1343 return false;
1344
1345 auto ScalarsPerVF = Scalars.find(VF);
1346 assert(ScalarsPerVF != Scalars.end() &&
1347 "Scalar values are not calculated for VF");
1348 return ScalarsPerVF->second.count(I);
1349 }
1350
1351 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1352 /// for vectorization factor \p VF.
1353 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1354 return VF.isVector() && MinBWs.contains(I) &&
1355 !isProfitableToScalarize(I, VF) &&
1356 !isScalarAfterVectorization(I, VF);
1357 }
1358
1359 /// Decision that was taken during cost calculation for memory instruction.
1360 enum InstWidening {
1361 CM_Unknown,
1362 CM_Widen, // For consecutive accesses with stride +1.
1363 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1364 CM_Interleave,
1365 CM_GatherScatter,
1366 CM_Scalarize,
1367 CM_VectorCall,
1368 CM_IntrinsicCall
1369 };
1370
1371 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1372 /// instruction \p I and vector width \p VF.
1373 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1374 InstructionCost Cost) {
1375 assert(VF.isVector() && "Expected VF >=2");
1376 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1377 }
1378
1379 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1380 /// interleaving group \p Grp and vector width \p VF.
1381 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1382 ElementCount VF, InstWidening W,
1383 InstructionCost Cost) {
1384 assert(VF.isVector() && "Expected VF >=2");
1385 /// Broadcast this decision to all instructions inside the group.
1386 /// But the cost will be assigned to one instruction only.
1387 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1388 if (auto *I = Grp->getMember(i)) {
1389 if (Grp->getInsertPos() == I)
1390 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1391 else
1392 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1393 }
1394 }
1395 }
1396
1397 /// Return the cost model decision for the given instruction \p I and vector
1398 /// width \p VF. Return CM_Unknown if this instruction did not pass
1399 /// through the cost modeling.
1400 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1401 assert(VF.isVector() && "Expected VF to be a vector VF");
1402 // Cost model is not run in the VPlan-native path - return conservative
1403 // result until this changes.
1404 if (EnableVPlanNativePath)
1405 return CM_GatherScatter;
1406
1407 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1408 auto Itr = WideningDecisions.find(InstOnVF);
1409 if (Itr == WideningDecisions.end())
1410 return CM_Unknown;
1411 return Itr->second.first;
1412 }
1413
1414 /// Return the vectorization cost for the given instruction \p I and vector
1415 /// width \p VF.
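/// A minimal usage sketch (illustrative only; `CM` is a hypothetical
/// LoopVectorizationCostModel instance, and the decision for {I, VF} is
/// assumed to have already been recorded via setCostBasedWideningDecision):
///   if (CM.getWideningDecision(I, VF) != LoopVectorizationCostModel::CM_Unknown)
///     Cost += CM.getWideningCost(I, VF);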
1416 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1417 assert(VF.isVector() && "Expected VF >=2"); 1418 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1419 assert(WideningDecisions.contains(InstOnVF) && 1420 "The cost is not calculated"); 1421 return WideningDecisions[InstOnVF].second; 1422 } 1423 1424 struct CallWideningDecision { 1425 InstWidening Kind; 1426 Function *Variant; 1427 Intrinsic::ID IID; 1428 std::optional<unsigned> MaskPos; 1429 InstructionCost Cost; 1430 }; 1431 1432 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, 1433 Function *Variant, Intrinsic::ID IID, 1434 std::optional<unsigned> MaskPos, 1435 InstructionCost Cost) { 1436 assert(!VF.isScalar() && "Expected vector VF"); 1437 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID, 1438 MaskPos, Cost}; 1439 } 1440 1441 CallWideningDecision getCallWideningDecision(CallInst *CI, 1442 ElementCount VF) const { 1443 assert(!VF.isScalar() && "Expected vector VF"); 1444 return CallWideningDecisions.at(std::make_pair(CI, VF)); 1445 } 1446 1447 /// Return True if instruction \p I is an optimizable truncate whose operand 1448 /// is an induction variable. Such a truncate will be removed by adding a new 1449 /// induction variable with the destination type. 1450 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1451 // If the instruction is not a truncate, return false. 1452 auto *Trunc = dyn_cast<TruncInst>(I); 1453 if (!Trunc) 1454 return false; 1455 1456 // Get the source and destination types of the truncate. 1457 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1458 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1459 1460 // If the truncate is free for the given types, return false. Replacing a 1461 // free truncate with an induction variable would add an induction variable 1462 // update instruction to each iteration of the loop. We exclude from this 1463 // check the primary induction variable since it will need an update 1464 // instruction regardless. 1465 Value *Op = Trunc->getOperand(0); 1466 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1467 return false; 1468 1469 // If the truncated value is not an induction variable, return false. 1470 return Legal->isInductionPhi(Op); 1471 } 1472 1473 /// Collects the instructions to scalarize for each predicated instruction in 1474 /// the loop. 1475 void collectInstsToScalarize(ElementCount VF); 1476 1477 /// Collect Uniform and Scalar values for the given \p VF. 1478 /// The sets depend on CM decision for Load/Store instructions 1479 /// that may be vectorized as interleave, gather-scatter or scalarized. 1480 /// Also make a decision on what to do about call instructions in the loop 1481 /// at that VF -- scalarize, call a known vector routine, or call a 1482 /// vector intrinsic. 1483 void collectUniformsAndScalars(ElementCount VF) { 1484 // Do the analysis once. 1485 if (VF.isScalar() || Uniforms.contains(VF)) 1486 return; 1487 setCostBasedWideningDecision(VF); 1488 setVectorizedCallDecision(VF); 1489 collectLoopUniforms(VF); 1490 collectLoopScalars(VF); 1491 } 1492 1493 /// Returns true if the target machine supports masked store operation 1494 /// for the given \p DataType and kind of access to \p Ptr. 
1495 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1496 return Legal->isConsecutivePtr(DataType, Ptr) && 1497 TTI.isLegalMaskedStore(DataType, Alignment); 1498 } 1499 1500 /// Returns true if the target machine supports masked load operation 1501 /// for the given \p DataType and kind of access to \p Ptr. 1502 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1503 return Legal->isConsecutivePtr(DataType, Ptr) && 1504 TTI.isLegalMaskedLoad(DataType, Alignment); 1505 } 1506 1507 /// Returns true if the target machine can represent \p V as a masked gather 1508 /// or scatter operation. 1509 bool isLegalGatherOrScatter(Value *V, ElementCount VF) { 1510 bool LI = isa<LoadInst>(V); 1511 bool SI = isa<StoreInst>(V); 1512 if (!LI && !SI) 1513 return false; 1514 auto *Ty = getLoadStoreType(V); 1515 Align Align = getLoadStoreAlignment(V); 1516 if (VF.isVector()) 1517 Ty = VectorType::get(Ty, VF); 1518 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1519 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1520 } 1521 1522 /// Returns true if the target machine supports all of the reduction 1523 /// variables found for the given VF. 1524 bool canVectorizeReductions(ElementCount VF) const { 1525 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1526 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1527 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1528 })); 1529 } 1530 1531 /// Given costs for both strategies, return true if the scalar predication 1532 /// lowering should be used for div/rem. This incorporates an override 1533 /// option so it is not simply a cost comparison. 1534 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1535 InstructionCost SafeDivisorCost) const { 1536 switch (ForceSafeDivisor) { 1537 case cl::BOU_UNSET: 1538 return ScalarCost < SafeDivisorCost; 1539 case cl::BOU_TRUE: 1540 return false; 1541 case cl::BOU_FALSE: 1542 return true; 1543 }; 1544 llvm_unreachable("impossible case value"); 1545 } 1546 1547 /// Returns true if \p I is an instruction which requires predication and 1548 /// for which our chosen predication strategy is scalarization (i.e. we 1549 /// don't have an alternate strategy such as masking available). 1550 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1551 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1552 1553 /// Returns true if \p I is an instruction that needs to be predicated 1554 /// at runtime. The result is independent of the predication mechanism. 1555 /// Superset of instructions that return true for isScalarWithPredication. 1556 bool isPredicatedInst(Instruction *I) const; 1557 1558 /// Return the costs for our two available strategies for lowering a 1559 /// div/rem operation which requires speculating at least one lane. 1560 /// First result is for scalarization (will be invalid for scalable 1561 /// vectors); second is for the safe-divisor strategy. 1562 std::pair<InstructionCost, InstructionCost> 1563 getDivRemSpeculationCost(Instruction *I, 1564 ElementCount VF) const; 1565 1566 /// Returns true if \p I is a memory instruction with consecutive memory 1567 /// access that can be widened. 1568 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1569 1570 /// Returns true if \p I is a memory instruction in an interleaved-group 1571 /// of memory accesses that can be vectorized with wide vector loads/stores 1572 /// and shuffles. 
1573 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); 1574 1575 /// Check if \p Instr belongs to any interleaved access group. 1576 bool isAccessInterleaved(Instruction *Instr) { 1577 return InterleaveInfo.isInterleaved(Instr); 1578 } 1579 1580 /// Get the interleaved access group that \p Instr belongs to. 1581 const InterleaveGroup<Instruction> * 1582 getInterleavedAccessGroup(Instruction *Instr) { 1583 return InterleaveInfo.getInterleaveGroup(Instr); 1584 } 1585 1586 /// Returns true if we're required to use a scalar epilogue for at least 1587 /// the final iteration of the original loop. 1588 bool requiresScalarEpilogue(bool IsVectorizing) const { 1589 if (!isScalarEpilogueAllowed()) 1590 return false; 1591 // If we might exit from anywhere but the latch, must run the exiting 1592 // iteration in scalar form. 1593 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1594 return true; 1595 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); 1596 } 1597 1598 /// Returns true if we're required to use a scalar epilogue for at least 1599 /// the final iteration of the original loop for all VFs in \p Range. 1600 /// A scalar epilogue must either be required for all VFs in \p Range or for 1601 /// none. 1602 bool requiresScalarEpilogue(VFRange Range) const { 1603 auto RequiresScalarEpilogue = [this](ElementCount VF) { 1604 return requiresScalarEpilogue(VF.isVector()); 1605 }; 1606 bool IsRequired = all_of(Range, RequiresScalarEpilogue); 1607 assert( 1608 (IsRequired || none_of(Range, RequiresScalarEpilogue)) && 1609 "all VFs in range must agree on whether a scalar epilogue is required"); 1610 return IsRequired; 1611 } 1612 1613 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1614 /// loop hint annotation. 1615 bool isScalarEpilogueAllowed() const { 1616 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1617 } 1618 1619 /// Returns the TailFoldingStyle that is best for the current loop. 1620 TailFoldingStyle 1621 getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { 1622 if (!CanFoldTailByMasking) 1623 return TailFoldingStyle::None; 1624 1625 if (ForceTailFoldingStyle.getNumOccurrences()) 1626 return ForceTailFoldingStyle; 1627 1628 return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); 1629 } 1630 1631 /// Returns true if all loop blocks should be masked to fold tail loop. 1632 bool foldTailByMasking() const { 1633 return getTailFoldingStyle() != TailFoldingStyle::None; 1634 } 1635 1636 /// Returns true if the instructions in this block requires predication 1637 /// for any reason, e.g. because tail folding now requires a predicate 1638 /// or because the block in the original loop was predicated. 1639 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1640 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1641 } 1642 1643 /// Returns true if the Phi is part of an inloop reduction. 1644 bool isInLoopReduction(PHINode *Phi) const { 1645 return InLoopReductions.contains(Phi); 1646 } 1647 1648 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1649 /// with factor VF. Return the cost of the instruction, including 1650 /// scalarization overhead if it's needed. 1651 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1652 1653 /// Estimate cost of a call instruction CI if it were vectorized with factor 1654 /// VF. Return the cost of the instruction, including scalarization overhead 1655 /// if it's needed. 
1656 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; 1657 1658 /// Invalidates decisions already taken by the cost model. 1659 void invalidateCostModelingDecisions() { 1660 WideningDecisions.clear(); 1661 CallWideningDecisions.clear(); 1662 Uniforms.clear(); 1663 Scalars.clear(); 1664 } 1665 1666 /// The vectorization cost is a combination of the cost itself and a boolean 1667 /// indicating whether any of the contributing operations will actually 1668 /// operate on vector values after type legalization in the backend. If this 1669 /// latter value is false, then all operations will be scalarized (i.e. no 1670 /// vectorization has actually taken place). 1671 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1672 1673 /// Returns the expected execution cost. The unit of the cost does 1674 /// not matter because we use the 'cost' units to compare different 1675 /// vector widths. The cost that is returned is *not* normalized by 1676 /// the factor width. If \p Invalid is not nullptr, this function 1677 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1678 /// each instruction that has an Invalid cost for the given VF. 1679 VectorizationCostTy 1680 expectedCost(ElementCount VF, 1681 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1682 1683 bool hasPredStores() const { return NumPredStores > 0; } 1684 1685 /// Returns true if epilogue vectorization is considered profitable, and 1686 /// false otherwise. 1687 /// \p VF is the vectorization factor chosen for the original loop. 1688 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1689 1690 private: 1691 unsigned NumPredStores = 0; 1692 1693 /// \return An upper bound for the vectorization factors for both 1694 /// fixed and scalable vectorization, where the minimum-known number of 1695 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1696 /// disabled or unsupported, then the scalable part will be equal to 1697 /// ElementCount::getScalable(0). 1698 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, 1699 ElementCount UserVF, 1700 bool FoldTailByMasking); 1701 1702 /// \return the maximized element count based on the targets vector 1703 /// registers and the loop trip-count, but limited to a maximum safe VF. 1704 /// This is a helper function of computeFeasibleMaxVF. 1705 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, 1706 unsigned SmallestType, 1707 unsigned WidestType, 1708 ElementCount MaxSafeVF, 1709 bool FoldTailByMasking); 1710 1711 /// \return the maximum legal scalable VF, based on the safe max number 1712 /// of elements. 1713 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1714 1715 /// Returns the execution time cost of an instruction for a given vector 1716 /// width. Vector width of one means scalar. 1717 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1718 1719 /// The cost-computation logic from getInstructionCost which provides 1720 /// the vector type as an output parameter. 1721 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1722 Type *&VectorTy); 1723 1724 /// Return the cost of instructions in an inloop reduction pattern, if I is 1725 /// part of that pattern. 1726 std::optional<InstructionCost> 1727 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1728 TTI::TargetCostKind CostKind) const; 1729 1730 /// Calculate vectorization cost of memory instruction \p I. 
1731 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1732
1733 /// The cost computation for a scalarized memory instruction.
1734 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1735
1736 /// The cost computation for an interleaving group of memory instructions.
1737 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1738
1739 /// The cost computation for a Gather/Scatter instruction.
1740 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1741
1742 /// The cost computation for widening instruction \p I with consecutive
1743 /// memory access.
1744 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1745
1746 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1747 /// Load: scalar load + broadcast.
1748 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1749 /// element)
1750 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1751
1752 /// Estimate the overhead of scalarizing an instruction. This is a
1753 /// convenience wrapper for the type-based getScalarizationOverhead API.
1754 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1755 TTI::TargetCostKind CostKind) const;
1756
1757 /// Returns true if an artificially high cost for emulated masked memrefs
1758 /// should be used.
1759 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1760
1761 /// Map of scalar integer values to the smallest bitwidth they can be legally
1762 /// represented as. The vector equivalents of these values should be truncated
1763 /// to this type.
1764 MapVector<Instruction *, uint64_t> MinBWs;
1765
1766 /// A type representing the costs for instructions if they were to be
1767 /// scalarized rather than vectorized. The entries are Instruction-Cost
1768 /// pairs.
1769 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1770
1771 /// A set containing all BasicBlocks that are known to be present after
1772 /// vectorization as a predicated block.
1773 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1774 PredicatedBBsAfterVectorization;
1775
1776 /// Records whether it is allowed to have the original scalar loop execute at
1777 /// least once. This may be needed as a fallback loop in case runtime
1778 /// aliasing/dependence checks fail, or to handle the tail/remainder
1779 /// iterations when the trip count is unknown or doesn't divide evenly by the VF,
1780 /// or as a peel-loop to handle gaps in interleave-groups.
1781 /// Under optsize and when the trip count is very small we don't allow any
1782 /// iterations to execute in the scalar loop.
1783 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1784
1785 /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1786 bool CanFoldTailByMasking = false;
1787
1788 /// A map holding scalar costs for different vectorization factors. The
1789 /// presence of a cost for an instruction in the mapping indicates that the
1790 /// instruction will be scalarized when vectorizing with the associated
1791 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1792 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1793
1794 /// Holds the instructions known to be uniform after vectorization.
1795 /// The data is collected per VF.
1796 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1797
1798 /// Holds the instructions known to be scalar after vectorization.
1799 /// The data is collected per VF. 1800 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1801 1802 /// Holds the instructions (address computations) that are forced to be 1803 /// scalarized. 1804 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1805 1806 /// PHINodes of the reductions that should be expanded in-loop. 1807 SmallPtrSet<PHINode *, 4> InLoopReductions; 1808 1809 /// A Map of inloop reduction operations and their immediate chain operand. 1810 /// FIXME: This can be removed once reductions can be costed correctly in 1811 /// VPlan. This was added to allow quick lookup of the inloop operations. 1812 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1813 1814 /// Returns the expected difference in cost from scalarizing the expression 1815 /// feeding a predicated instruction \p PredInst. The instructions to 1816 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1817 /// non-negative return value implies the expression will be scalarized. 1818 /// Currently, only single-use chains are considered for scalarization. 1819 InstructionCost computePredInstDiscount(Instruction *PredInst, 1820 ScalarCostsTy &ScalarCosts, 1821 ElementCount VF); 1822 1823 /// Collect the instructions that are uniform after vectorization. An 1824 /// instruction is uniform if we represent it with a single scalar value in 1825 /// the vectorized loop corresponding to each vector iteration. Examples of 1826 /// uniform instructions include pointer operands of consecutive or 1827 /// interleaved memory accesses. Note that although uniformity implies an 1828 /// instruction will be scalar, the reverse is not true. In general, a 1829 /// scalarized instruction will be represented by VF scalar values in the 1830 /// vectorized loop, each corresponding to an iteration of the original 1831 /// scalar loop. 1832 void collectLoopUniforms(ElementCount VF); 1833 1834 /// Collect the instructions that are scalar after vectorization. An 1835 /// instruction is scalar if it is known to be uniform or will be scalarized 1836 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1837 /// to the list if they are used by a load/store instruction that is marked as 1838 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1839 /// VF values in the vectorized loop, each corresponding to an iteration of 1840 /// the original scalar loop. 1841 void collectLoopScalars(ElementCount VF); 1842 1843 /// Keeps cost model vectorization decision and cost for instructions. 1844 /// Right now it is used for memory instructions only. 1845 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1846 std::pair<InstWidening, InstructionCost>>; 1847 1848 DecisionList WideningDecisions; 1849 1850 using CallDecisionList = 1851 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; 1852 1853 CallDecisionList CallWideningDecisions; 1854 1855 /// Returns true if \p V is expected to be vectorized and it needs to be 1856 /// extracted. 1857 bool needsExtract(Value *V, ElementCount VF) const { 1858 Instruction *I = dyn_cast<Instruction>(V); 1859 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1860 TheLoop->isLoopInvariant(I)) 1861 return false; 1862 1863 // Assume we can vectorize V (and hence we need extraction) if the 1864 // scalars are not computed yet. 
This can happen, because it is called 1865 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1866 // the scalars are collected. That should be a safe assumption in most 1867 // cases, because we check if the operands have vectorizable types 1868 // beforehand in LoopVectorizationLegality. 1869 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); 1870 }; 1871 1872 /// Returns a range containing only operands needing to be extracted. 1873 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1874 ElementCount VF) const { 1875 return SmallVector<Value *, 4>(make_filter_range( 1876 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1877 } 1878 1879 public: 1880 /// The loop that we evaluate. 1881 Loop *TheLoop; 1882 1883 /// Predicated scalar evolution analysis. 1884 PredicatedScalarEvolution &PSE; 1885 1886 /// Loop Info analysis. 1887 LoopInfo *LI; 1888 1889 /// Vectorization legality. 1890 LoopVectorizationLegality *Legal; 1891 1892 /// Vector target information. 1893 const TargetTransformInfo &TTI; 1894 1895 /// Target Library Info. 1896 const TargetLibraryInfo *TLI; 1897 1898 /// Demanded bits analysis. 1899 DemandedBits *DB; 1900 1901 /// Assumption cache. 1902 AssumptionCache *AC; 1903 1904 /// Interface to emit optimization remarks. 1905 OptimizationRemarkEmitter *ORE; 1906 1907 const Function *TheFunction; 1908 1909 /// Loop Vectorize Hint. 1910 const LoopVectorizeHints *Hints; 1911 1912 /// The interleave access information contains groups of interleaved accesses 1913 /// with the same stride and close to each other. 1914 InterleavedAccessInfo &InterleaveInfo; 1915 1916 /// Values to ignore in the cost model. 1917 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1918 1919 /// Values to ignore in the cost model when VF > 1. 1920 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1921 1922 /// All element types found in the loop. 1923 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1924 }; 1925 } // end namespace llvm 1926 1927 namespace { 1928 /// Helper struct to manage generating runtime checks for vectorization. 1929 /// 1930 /// The runtime checks are created up-front in temporary blocks to allow better 1931 /// estimating the cost and un-linked from the existing IR. After deciding to 1932 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1933 /// temporary blocks are completely removed. 1934 class GeneratedRTChecks { 1935 /// Basic block which contains the generated SCEV checks, if any. 1936 BasicBlock *SCEVCheckBlock = nullptr; 1937 1938 /// The value representing the result of the generated SCEV checks. If it is 1939 /// nullptr, either no SCEV checks have been generated or they have been used. 1940 Value *SCEVCheckCond = nullptr; 1941 1942 /// Basic block which contains the generated memory runtime checks, if any. 1943 BasicBlock *MemCheckBlock = nullptr; 1944 1945 /// The value representing the result of the generated memory runtime checks. 1946 /// If it is nullptr, either no memory runtime checks have been generated or 1947 /// they have been used. 
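/// It is reset to nullptr by emitMemRuntimeChecks once the checks have been
/// wired into the generated code, which keeps the destructor below from
/// erasing the now-used check block.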
1948 Value *MemRuntimeCheckCond = nullptr;
1949
1950 DominatorTree *DT;
1951 LoopInfo *LI;
1952 TargetTransformInfo *TTI;
1953
1954 SCEVExpander SCEVExp;
1955 SCEVExpander MemCheckExp;
1956
1957 bool CostTooHigh = false;
1958 const bool AddBranchWeights;
1959
1960 public:
1961 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1962 TargetTransformInfo *TTI, const DataLayout &DL,
1963 bool AddBranchWeights)
1964 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1965 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1966
1967 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1968 /// accurately estimate the cost of the runtime checks. The blocks are
1969 /// un-linked from the IR and are added back during vector code generation. If
1970 /// there is no vector code generation, the check blocks are removed
1971 /// completely.
1972 void Create(Loop *L, const LoopAccessInfo &LAI,
1973 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1974
1975 // Hard cutoff to limit compile-time increase in case a very large number of
1976 // runtime checks need to be generated.
1977 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1978 // profile info.
1979 CostTooHigh =
1980 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1981 if (CostTooHigh)
1982 return;
1983
1984 BasicBlock *LoopHeader = L->getHeader();
1985 BasicBlock *Preheader = L->getLoopPreheader();
1986
1987 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1988 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1989 // may be used by SCEVExpander. The blocks will be un-linked from their
1990 // predecessors and removed from LI & DT at the end of the function.
1991 if (!UnionPred.isAlwaysTrue()) {
1992 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1993 nullptr, "vector.scevcheck");
1994
1995 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1996 &UnionPred, SCEVCheckBlock->getTerminator());
1997 }
1998
1999 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2000 if (RtPtrChecking.Need) {
2001 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2002 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2003 "vector.memcheck");
2004
2005 auto DiffChecks = RtPtrChecking.getDiffChecks();
2006 if (DiffChecks) {
2007 Value *RuntimeVF = nullptr;
2008 MemRuntimeCheckCond = addDiffRuntimeChecks(
2009 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
2010 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
2011 if (!RuntimeVF)
2012 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
2013 return RuntimeVF;
2014 },
2015 IC);
2016 } else {
2017 MemRuntimeCheckCond = addRuntimeChecks(
2018 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
2019 MemCheckExp, VectorizerParams::HoistRuntimeChecks);
2020 }
2021 assert(MemRuntimeCheckCond &&
2022 "no RT checks generated although RtPtrChecking "
2023 "claimed checks are required");
2024 }
2025
2026 if (!MemCheckBlock && !SCEVCheckBlock)
2027 return;
2028
2029 // Unhook the temporary blocks with the checks and update various places
2030 // accordingly.
2031 if (SCEVCheckBlock) 2032 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2033 if (MemCheckBlock) 2034 MemCheckBlock->replaceAllUsesWith(Preheader); 2035 2036 if (SCEVCheckBlock) { 2037 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2038 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2039 Preheader->getTerminator()->eraseFromParent(); 2040 } 2041 if (MemCheckBlock) { 2042 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2043 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2044 Preheader->getTerminator()->eraseFromParent(); 2045 } 2046 2047 DT->changeImmediateDominator(LoopHeader, Preheader); 2048 if (MemCheckBlock) { 2049 DT->eraseNode(MemCheckBlock); 2050 LI->removeBlock(MemCheckBlock); 2051 } 2052 if (SCEVCheckBlock) { 2053 DT->eraseNode(SCEVCheckBlock); 2054 LI->removeBlock(SCEVCheckBlock); 2055 } 2056 } 2057 2058 InstructionCost getCost() { 2059 if (SCEVCheckBlock || MemCheckBlock) 2060 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 2061 2062 if (CostTooHigh) { 2063 InstructionCost Cost; 2064 Cost.setInvalid(); 2065 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 2066 return Cost; 2067 } 2068 2069 InstructionCost RTCheckCost = 0; 2070 if (SCEVCheckBlock) 2071 for (Instruction &I : *SCEVCheckBlock) { 2072 if (SCEVCheckBlock->getTerminator() == &I) 2073 continue; 2074 InstructionCost C = 2075 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2076 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2077 RTCheckCost += C; 2078 } 2079 if (MemCheckBlock) 2080 for (Instruction &I : *MemCheckBlock) { 2081 if (MemCheckBlock->getTerminator() == &I) 2082 continue; 2083 InstructionCost C = 2084 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2085 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2086 RTCheckCost += C; 2087 } 2088 2089 if (SCEVCheckBlock || MemCheckBlock) 2090 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2091 << "\n"); 2092 2093 return RTCheckCost; 2094 } 2095 2096 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2097 /// unused. 2098 ~GeneratedRTChecks() { 2099 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2100 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2101 if (!SCEVCheckCond) 2102 SCEVCleaner.markResultUsed(); 2103 2104 if (!MemRuntimeCheckCond) 2105 MemCheckCleaner.markResultUsed(); 2106 2107 if (MemRuntimeCheckCond) { 2108 auto &SE = *MemCheckExp.getSE(); 2109 // Memory runtime check generation creates compares that use expanded 2110 // values. Remove them before running the SCEVExpanderCleaners. 2111 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2112 if (MemCheckExp.isInsertedInstruction(&I)) 2113 continue; 2114 SE.forgetValue(&I); 2115 I.eraseFromParent(); 2116 } 2117 } 2118 MemCheckCleaner.cleanup(); 2119 SCEVCleaner.cleanup(); 2120 2121 if (SCEVCheckCond) 2122 SCEVCheckBlock->eraseFromParent(); 2123 if (MemRuntimeCheckCond) 2124 MemCheckBlock->eraseFromParent(); 2125 } 2126 2127 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2128 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2129 /// depending on the generated condition. 2130 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2131 BasicBlock *LoopVectorPreHeader, 2132 BasicBlock *LoopExitBlock) { 2133 if (!SCEVCheckCond) 2134 return nullptr; 2135 2136 Value *Cond = SCEVCheckCond; 2137 // Mark the check as used, to prevent it from being removed during cleanup. 
2138 SCEVCheckCond = nullptr; 2139 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2140 if (C->isZero()) 2141 return nullptr; 2142 2143 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2144 2145 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2146 // Create new preheader for vector loop. 2147 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2148 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2149 2150 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2151 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2152 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2153 SCEVCheckBlock); 2154 2155 DT->addNewBlock(SCEVCheckBlock, Pred); 2156 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2157 2158 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); 2159 if (AddBranchWeights) 2160 setBranchWeights(BI, SCEVCheckBypassWeights); 2161 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); 2162 return SCEVCheckBlock; 2163 } 2164 2165 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2166 /// the branches to branch to the vector preheader or \p Bypass, depending on 2167 /// the generated condition. 2168 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2169 BasicBlock *LoopVectorPreHeader) { 2170 // Check if we generated code that checks in runtime if arrays overlap. 2171 if (!MemRuntimeCheckCond) 2172 return nullptr; 2173 2174 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2175 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2176 MemCheckBlock); 2177 2178 DT->addNewBlock(MemCheckBlock, Pred); 2179 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2180 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2181 2182 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2183 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2184 2185 BranchInst &BI = 2186 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); 2187 if (AddBranchWeights) { 2188 setBranchWeights(BI, MemCheckBypassWeights); 2189 } 2190 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); 2191 MemCheckBlock->getTerminator()->setDebugLoc( 2192 Pred->getTerminator()->getDebugLoc()); 2193 2194 // Mark the check as used, to prevent it from being removed during cleanup. 2195 MemRuntimeCheckCond = nullptr; 2196 return MemCheckBlock; 2197 } 2198 }; 2199 } // namespace 2200 2201 static bool useActiveLaneMask(TailFoldingStyle Style) { 2202 return Style == TailFoldingStyle::Data || 2203 Style == TailFoldingStyle::DataAndControlFlow || 2204 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2205 } 2206 2207 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2208 return Style == TailFoldingStyle::DataAndControlFlow || 2209 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2210 } 2211 2212 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2213 // vectorization. The loop needs to be annotated with #pragma omp simd 2214 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2215 // vector length information is not provided, vectorization is not considered 2216 // explicit. Interleave hints are not allowed either. These limitations will be 2217 // relaxed in the future. 2218 // Please, note that we are currently forced to abuse the pragma 'clang 2219 // vectorize' semantics. 
This pragma provides *auto-vectorization hints* 2220 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2221 // provides *explicit vectorization hints* (LV can bypass legal checks and 2222 // assume that vectorization is legal). However, both hints are implemented 2223 // using the same metadata (llvm.loop.vectorize, processed by 2224 // LoopVectorizeHints). This will be fixed in the future when the native IR 2225 // representation for pragma 'omp simd' is introduced. 2226 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2227 OptimizationRemarkEmitter *ORE) { 2228 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2229 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2230 2231 // Only outer loops with an explicit vectorization hint are supported. 2232 // Unannotated outer loops are ignored. 2233 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2234 return false; 2235 2236 Function *Fn = OuterLp->getHeader()->getParent(); 2237 if (!Hints.allowVectorization(Fn, OuterLp, 2238 true /*VectorizeOnlyWhenForced*/)) { 2239 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2240 return false; 2241 } 2242 2243 if (Hints.getInterleave() > 1) { 2244 // TODO: Interleave support is future work. 2245 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2246 "outer loops.\n"); 2247 Hints.emitRemarkWithHints(); 2248 return false; 2249 } 2250 2251 return true; 2252 } 2253 2254 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2255 OptimizationRemarkEmitter *ORE, 2256 SmallVectorImpl<Loop *> &V) { 2257 // Collect inner loops and outer loops without irreducible control flow. For 2258 // now, only collect outer loops that have explicit vectorization hints. If we 2259 // are stress testing the VPlan H-CFG construction, we collect the outermost 2260 // loop of every loop nest. 2261 if (L.isInnermost() || VPlanBuildStressTest || 2262 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2263 LoopBlocksRPO RPOT(&L); 2264 RPOT.perform(LI); 2265 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2266 V.push_back(&L); 2267 // TODO: Collect inner loops inside marked outer loops in case 2268 // vectorization fails for the outer loop. Do not invoke 2269 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2270 // already known to be reducible. We can use an inherited attribute for 2271 // that. 2272 return; 2273 } 2274 } 2275 for (Loop *InnerL : L) 2276 collectSupportedLoops(*InnerL, LI, ORE, V); 2277 } 2278 2279 //===----------------------------------------------------------------------===// 2280 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2281 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2282 //===----------------------------------------------------------------------===// 2283 2284 /// Compute the transformed value of Index at offset StartValue using step 2285 /// StepValue. 2286 /// For integer induction, returns StartValue + Index * StepValue. 2287 /// For pointer induction, returns StartValue[Index * StepValue]. 2288 /// FIXME: The newly created binary instructions should contain nsw/nuw 2289 /// flags, which can be found from the original scalar operations. 
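/// For example (illustrative), an integer induction with StartValue 16 and
/// StepValue 4 maps Index 3 to 16 + 3 * 4 = 28.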
2290 static Value * 2291 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, 2292 Value *Step, 2293 InductionDescriptor::InductionKind InductionKind, 2294 const BinaryOperator *InductionBinOp) { 2295 Type *StepTy = Step->getType(); 2296 Value *CastedIndex = StepTy->isIntegerTy() 2297 ? B.CreateSExtOrTrunc(Index, StepTy) 2298 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2299 if (CastedIndex != Index) { 2300 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2301 Index = CastedIndex; 2302 } 2303 2304 // Note: the IR at this point is broken. We cannot use SE to create any new 2305 // SCEV and then expand it, hoping that SCEV's simplification will give us 2306 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2307 // lead to various SCEV crashes. So all we can do is to use builder and rely 2308 // on InstCombine for future simplifications. Here we handle some trivial 2309 // cases only. 2310 auto CreateAdd = [&B](Value *X, Value *Y) { 2311 assert(X->getType() == Y->getType() && "Types don't match!"); 2312 if (auto *CX = dyn_cast<ConstantInt>(X)) 2313 if (CX->isZero()) 2314 return Y; 2315 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2316 if (CY->isZero()) 2317 return X; 2318 return B.CreateAdd(X, Y); 2319 }; 2320 2321 // We allow X to be a vector type, in which case Y will potentially be 2322 // splatted into a vector with the same element count. 2323 auto CreateMul = [&B](Value *X, Value *Y) { 2324 assert(X->getType()->getScalarType() == Y->getType() && 2325 "Types don't match!"); 2326 if (auto *CX = dyn_cast<ConstantInt>(X)) 2327 if (CX->isOne()) 2328 return Y; 2329 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2330 if (CY->isOne()) 2331 return X; 2332 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2333 if (XVTy && !isa<VectorType>(Y->getType())) 2334 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2335 return B.CreateMul(X, Y); 2336 }; 2337 2338 switch (InductionKind) { 2339 case InductionDescriptor::IK_IntInduction: { 2340 assert(!isa<VectorType>(Index->getType()) && 2341 "Vector indices not supported for integer inductions yet"); 2342 assert(Index->getType() == StartValue->getType() && 2343 "Index type does not match StartValue type"); 2344 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2345 return B.CreateSub(StartValue, Index); 2346 auto *Offset = CreateMul(Index, Step); 2347 return CreateAdd(StartValue, Offset); 2348 } 2349 case InductionDescriptor::IK_PtrInduction: { 2350 return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step)); 2351 } 2352 case InductionDescriptor::IK_FpInduction: { 2353 assert(!isa<VectorType>(Index->getType()) && 2354 "Vector indices not supported for FP inductions yet"); 2355 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2356 assert(InductionBinOp && 2357 (InductionBinOp->getOpcode() == Instruction::FAdd || 2358 InductionBinOp->getOpcode() == Instruction::FSub) && 2359 "Original bin op should be defined for FP induction"); 2360 2361 Value *MulExp = B.CreateFMul(Step, Index); 2362 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2363 "induction"); 2364 } 2365 case InductionDescriptor::IK_NoInduction: 2366 return nullptr; 2367 } 2368 llvm_unreachable("invalid enum"); 2369 } 2370 2371 std::optional<unsigned> getMaxVScale(const Function &F, 2372 const TargetTransformInfo &TTI) { 2373 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) 2374 return MaxVScale; 2375 2376 if (F.hasFnAttribute(Attribute::VScaleRange)) 2377 
return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2378 2379 return std::nullopt; 2380 } 2381 2382 /// For the given VF and UF and maximum trip count computed for the loop, return 2383 /// whether the induction variable might overflow in the vectorized loop. If not, 2384 /// then we know a runtime overflow check always evaluates to false and can be 2385 /// removed. 2386 static bool isIndvarOverflowCheckKnownFalse( 2387 const LoopVectorizationCostModel *Cost, 2388 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2389 // Always be conservative if we don't know the exact unroll factor. 2390 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); 2391 2392 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2393 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2394 2395 // We know the runtime overflow check is known false iff the (max) trip-count 2396 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2397 // the vector loop induction variable. 2398 if (unsigned TC = 2399 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { 2400 uint64_t MaxVF = VF.getKnownMinValue(); 2401 if (VF.isScalable()) { 2402 std::optional<unsigned> MaxVScale = 2403 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2404 if (!MaxVScale) 2405 return false; 2406 MaxVF *= *MaxVScale; 2407 } 2408 2409 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2410 } 2411 2412 return false; 2413 } 2414 2415 // Return whether we allow using masked interleave-groups (for dealing with 2416 // strided loads/stores that reside in predicated blocks, or for dealing 2417 // with gaps). 2418 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2419 // If an override option has been passed in for interleaved accesses, use it. 2420 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2421 return EnableMaskedInterleavedMemAccesses; 2422 2423 return TTI.enableMaskedInterleavedAccessVectorization(); 2424 } 2425 2426 // Try to vectorize the interleave group that \p Instr belongs to. 2427 // 2428 // E.g. Translate following interleaved load group (factor = 3): 2429 // for (i = 0; i < N; i+=3) { 2430 // R = Pic[i]; // Member of index 0 2431 // G = Pic[i+1]; // Member of index 1 2432 // B = Pic[i+2]; // Member of index 2 2433 // ... // do something to R, G, B 2434 // } 2435 // To: 2436 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2437 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2438 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2439 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2440 // 2441 // Or translate following interleaved store group (factor = 3): 2442 // for (i = 0; i < N; i+=3) { 2443 // ... 
do something to R, G, B
2444 // Pic[i] = R; // Member of index 0
2445 // Pic[i+1] = G; // Member of index 1
2446 // Pic[i+2] = B; // Member of index 2
2447 // }
2448 // To:
2449 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2450 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2451 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2452 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2453 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2454 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2455 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2456 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2457 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2458 Instruction *Instr = Group->getInsertPos();
2459 const DataLayout &DL = Instr->getModule()->getDataLayout();
2460
2461 // Prepare for the vector type of the interleaved load/store.
2462 Type *ScalarTy = getLoadStoreType(Instr);
2463 unsigned InterleaveFactor = Group->getFactor();
2464 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2465
2466 // Prepare for the new pointers.
2467 SmallVector<Value *, 2> AddrParts;
2468 unsigned Index = Group->getIndex(Instr);
2469
2470 // TODO: extend the masked interleaved-group support to reversed access.
2471 assert((!BlockInMask || !Group->isReverse()) &&
2472 "Reversed masked interleave-group not supported.");
2473
2474 Value *Idx;
2475 // If the group is reverse, adjust the index to refer to the last vector lane
2476 // instead of the first. We adjust the index from the first vector lane,
2477 // rather than directly getting the pointer for lane VF - 1, because the
2478 // pointer operand of the interleaved access is supposed to be uniform. For
2479 // uniform instructions, we're only required to generate a value for the
2480 // first vector lane in each unroll iteration.
2481 if (Group->isReverse()) {
2482 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2483 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2484 Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
2485 Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
2486 Idx = Builder.CreateNeg(Idx);
2487 } else
2488 Idx = Builder.getInt32(-Index);
2489
2490 for (unsigned Part = 0; Part < UF; Part++) {
2491 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2492 if (auto *I = dyn_cast<Instruction>(AddrPart))
2493 State.setDebugLocFrom(I->getDebugLoc());
2494
2495 // Note that the current instruction could be at any member index. We need
2496 // to adjust the address to that of the member at index 0.
2497 //
2498 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2499 // b = A[i]; // Member of index 0
2500 // The current pointer points to A[i+1]; adjust it to A[i].
2501 //
2502 // E.g. A[i+1] = a; // Member of index 1
2503 // A[i] = b; // Member of index 0
2504 // A[i+2] = c; // Member of index 2 (Current instruction)
2505 // The current pointer points to A[i+2]; adjust it to A[i].
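// (Descriptive note: the GEP created below applies the element offset held in
// Idx, computed above, to each per-part address, which performs exactly this
// adjustment.)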
2506 2507 bool InBounds = false; 2508 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2509 InBounds = gep->isInBounds(); 2510 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); 2511 AddrParts.push_back(AddrPart); 2512 } 2513 2514 State.setDebugLocFrom(Instr->getDebugLoc()); 2515 Value *PoisonVec = PoisonValue::get(VecTy); 2516 2517 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( 2518 unsigned Part, Value *MaskForGaps) -> Value * { 2519 if (VF.isScalable()) { 2520 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2521 assert(InterleaveFactor == 2 && 2522 "Unsupported deinterleave factor for scalable vectors"); 2523 auto *BlockInMaskPart = State.get(BlockInMask, Part); 2524 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; 2525 auto *MaskTy = 2526 VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); 2527 return Builder.CreateIntrinsic( 2528 MaskTy, Intrinsic::experimental_vector_interleave2, Ops, 2529 /*FMFSource=*/nullptr, "interleaved.mask"); 2530 } 2531 2532 if (!BlockInMask) 2533 return MaskForGaps; 2534 2535 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2536 Value *ShuffledMask = Builder.CreateShuffleVector( 2537 BlockInMaskPart, 2538 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2539 "interleaved.mask"); 2540 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2541 MaskForGaps) 2542 : ShuffledMask; 2543 }; 2544 2545 // Vectorize the interleaved load group. 2546 if (isa<LoadInst>(Instr)) { 2547 Value *MaskForGaps = nullptr; 2548 if (NeedsMaskForGaps) { 2549 MaskForGaps = 2550 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2551 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2552 } 2553 2554 // For each unroll part, create a wide load for the group. 2555 SmallVector<Value *, 2> NewLoads; 2556 for (unsigned Part = 0; Part < UF; Part++) { 2557 Instruction *NewLoad; 2558 if (BlockInMask || MaskForGaps) { 2559 assert(useMaskedInterleavedAccesses(*TTI) && 2560 "masked interleaved groups are not allowed."); 2561 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2562 NewLoad = 2563 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2564 GroupMask, PoisonVec, "wide.masked.vec"); 2565 } 2566 else 2567 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2568 Group->getAlign(), "wide.vec"); 2569 Group->addMetadata(NewLoad); 2570 NewLoads.push_back(NewLoad); 2571 } 2572 2573 if (VecTy->isScalableTy()) { 2574 assert(InterleaveFactor == 2 && 2575 "Unsupported deinterleave factor for scalable vectors"); 2576 2577 for (unsigned Part = 0; Part < UF; ++Part) { 2578 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2579 // so must use intrinsics to deinterleave. 2580 Value *DI = Builder.CreateIntrinsic( 2581 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], 2582 /*FMFSource=*/nullptr, "strided.vec"); 2583 unsigned J = 0; 2584 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2585 Instruction *Member = Group->getMember(I); 2586 2587 if (!Member) 2588 continue; 2589 2590 Value *StridedVec = Builder.CreateExtractValue(DI, I); 2591 // If this member has different type, cast the result type. 
2592 if (Member->getType() != ScalarTy) { 2593 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2594 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2595 } 2596 2597 if (Group->isReverse()) 2598 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2599 2600 State.set(VPDefs[J], StridedVec, Part); 2601 ++J; 2602 } 2603 } 2604 2605 return; 2606 } 2607 2608 // For each member in the group, shuffle out the appropriate data from the 2609 // wide loads. 2610 unsigned J = 0; 2611 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2612 Instruction *Member = Group->getMember(I); 2613 2614 // Skip the gaps in the group. 2615 if (!Member) 2616 continue; 2617 2618 auto StrideMask = 2619 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2620 for (unsigned Part = 0; Part < UF; Part++) { 2621 Value *StridedVec = Builder.CreateShuffleVector( 2622 NewLoads[Part], StrideMask, "strided.vec"); 2623 2624 // If this member has different type, cast the result type. 2625 if (Member->getType() != ScalarTy) { 2626 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2627 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2628 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2629 } 2630 2631 if (Group->isReverse()) 2632 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2633 2634 State.set(VPDefs[J], StridedVec, Part); 2635 } 2636 ++J; 2637 } 2638 return; 2639 } 2640 2641 // The sub vector type for current instruction. 2642 auto *SubVT = VectorType::get(ScalarTy, VF); 2643 2644 // Vectorize the interleaved store group. 2645 Value *MaskForGaps = 2646 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2647 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2648 "masked interleaved groups are not allowed."); 2649 assert((!MaskForGaps || !VF.isScalable()) && 2650 "masking gaps for scalable vectors is not yet supported."); 2651 for (unsigned Part = 0; Part < UF; Part++) { 2652 // Collect the stored vector from each member. 2653 SmallVector<Value *, 4> StoredVecs; 2654 unsigned StoredIdx = 0; 2655 for (unsigned i = 0; i < InterleaveFactor; i++) { 2656 assert((Group->getMember(i) || MaskForGaps) && 2657 "Fail to get a member from an interleaved store group"); 2658 Instruction *Member = Group->getMember(i); 2659 2660 // Skip the gaps in the group. 2661 if (!Member) { 2662 Value *Undef = PoisonValue::get(SubVT); 2663 StoredVecs.push_back(Undef); 2664 continue; 2665 } 2666 2667 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2668 ++StoredIdx; 2669 2670 if (Group->isReverse()) 2671 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2672 2673 // If this member has different type, cast it to a unified type. 2674 2675 if (StoredVec->getType() != SubVT) 2676 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2677 2678 StoredVecs.push_back(StoredVec); 2679 } 2680 2681 // Interleave all the smaller vectors into one wider vector. 
2682 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); 2683 Instruction *NewStoreInstr; 2684 if (BlockInMask || MaskForGaps) { 2685 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2686 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2687 Group->getAlign(), GroupMask); 2688 } else 2689 NewStoreInstr = 2690 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2691 2692 Group->addMetadata(NewStoreInstr); 2693 } 2694 } 2695 2696 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2697 VPReplicateRecipe *RepRecipe, 2698 const VPIteration &Instance, 2699 VPTransformState &State) { 2700 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2701 2702 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2703 // the first lane and part. 2704 if (isa<NoAliasScopeDeclInst>(Instr)) 2705 if (!Instance.isFirstIteration()) 2706 return; 2707 2708 // Does this instruction return a value ? 2709 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2710 2711 Instruction *Cloned = Instr->clone(); 2712 if (!IsVoidRetTy) { 2713 Cloned->setName(Instr->getName() + ".cloned"); 2714 #if !defined(NDEBUG) 2715 // Verify that VPlan type inference results agree with the type of the 2716 // generated values. 2717 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && 2718 "inferred type and type from generated instructions do not match"); 2719 #endif 2720 } 2721 2722 RepRecipe->setFlags(Cloned); 2723 2724 if (auto DL = Instr->getDebugLoc()) 2725 State.setDebugLocFrom(DL); 2726 2727 // Replace the operands of the cloned instructions with their scalar 2728 // equivalents in the new loop. 2729 for (const auto &I : enumerate(RepRecipe->operands())) { 2730 auto InputInstance = Instance; 2731 VPValue *Operand = I.value(); 2732 if (vputils::isUniformAfterVectorization(Operand)) 2733 InputInstance.Lane = VPLane::getFirstLane(); 2734 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2735 } 2736 State.addNewMetadata(Cloned, Instr); 2737 2738 // Place the cloned scalar in the new loop. 2739 State.Builder.Insert(Cloned); 2740 2741 State.set(RepRecipe, Cloned, Instance); 2742 2743 // If we just cloned a new assumption, add it the assumption cache. 2744 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2745 AC->registerAssumption(II); 2746 2747 // End if-block. 2748 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); 2749 if (IfPredicateInstr) 2750 PredicatedInstructions.push_back(Cloned); 2751 } 2752 2753 Value * 2754 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2755 if (VectorTripCount) 2756 return VectorTripCount; 2757 2758 Value *TC = getTripCount(); 2759 IRBuilder<> Builder(InsertBlock->getTerminator()); 2760 2761 Type *Ty = TC->getType(); 2762 // This is where we can make the step a runtime constant. 2763 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2764 2765 // If the tail is to be folded by masking, round the number of iterations N 2766 // up to a multiple of Step instead of rounding down. This is done by first 2767 // adding Step-1 and then rounding down. Note that it's ok if this addition 2768 // overflows: the vector induction variable will eventually wrap to zero given 2769 // that it starts at zero and its Step is a power of two; the loop will then 2770 // exit, with the last early-exit vector comparison also producing all-true. 
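// (Illustrative example: with Step = 4 and N = 10, N is first bumped to 13;
// the URem below then leaves a remainder of 1, giving a vector trip count
// n.vec of 12, i.e. N rounded up to a multiple of Step.)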
2771 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2772 // is accounted for in emitIterationCountCheck that adds an overflow check. 2773 if (Cost->foldTailByMasking()) { 2774 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2775 "VF*UF must be a power of 2 when folding tail by masking"); 2776 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2777 TC = Builder.CreateAdd( 2778 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2779 } 2780 2781 // Now we need to generate the expression for the part of the loop that the 2782 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2783 // iterations are not required for correctness, or N - Step, otherwise. Step 2784 // is equal to the vectorization factor (number of SIMD elements) times the 2785 // unroll factor (number of SIMD instructions). 2786 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2787 2788 // There are cases where we *must* run at least one iteration in the remainder 2789 // loop. See the cost model for when this can happen. If the step evenly 2790 // divides the trip count, we set the remainder to be equal to the step. If 2791 // the step does not evenly divide the trip count, no adjustment is necessary 2792 // since there will already be scalar iterations. Note that the minimum 2793 // iterations check ensures that N >= Step. 2794 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2795 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2796 R = Builder.CreateSelect(IsZero, Step, R); 2797 } 2798 2799 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2800 2801 return VectorTripCount; 2802 } 2803 2804 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2805 const DataLayout &DL) { 2806 // Verify that V is a vector type with same number of elements as DstVTy. 2807 auto *DstFVTy = cast<VectorType>(DstVTy); 2808 auto VF = DstFVTy->getElementCount(); 2809 auto *SrcVecTy = cast<VectorType>(V->getType()); 2810 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2811 Type *SrcElemTy = SrcVecTy->getElementType(); 2812 Type *DstElemTy = DstFVTy->getElementType(); 2813 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2814 "Vector elements must have same size"); 2815 2816 // Do a direct cast if element types are castable. 2817 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2818 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2819 } 2820 // V cannot be directly casted to desired vector type. 2821 // May happen when V is a floating point vector but DstVTy is a vector of 2822 // pointers or vice-versa. Handle this using a two-step bitcast using an 2823 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2824 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2825 "Only one type should be a pointer type"); 2826 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2827 "Only one type should be a floating point type"); 2828 Type *IntTy = 2829 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2830 auto *VecIntTy = VectorType::get(IntTy, VF); 2831 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2832 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2833 } 2834 2835 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2836 Value *Count = getTripCount(); 2837 // Reuse existing vector loop preheader for TC checks. 
2838 // Note that new preheader block is generated for vector loop. 2839 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2840 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2841 2842 // Generate code to check if the loop's trip count is less than VF * UF, or 2843 // equal to it in case a scalar epilogue is required; this implies that the 2844 // vector trip count is zero. This check also covers the case where adding one 2845 // to the backedge-taken count overflowed leading to an incorrect trip count 2846 // of zero. In this case we will also jump to the scalar loop. 2847 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2848 : ICmpInst::ICMP_ULT; 2849 2850 // If tail is to be folded, vector loop takes care of all iterations. 2851 Type *CountTy = Count->getType(); 2852 Value *CheckMinIters = Builder.getFalse(); 2853 auto CreateStep = [&]() -> Value * { 2854 // Create step with max(MinProTripCount, UF * VF). 2855 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2856 return createStepForVF(Builder, CountTy, VF, UF); 2857 2858 Value *MinProfTC = 2859 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2860 if (!VF.isScalable()) 2861 return MinProfTC; 2862 return Builder.CreateBinaryIntrinsic( 2863 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2864 }; 2865 2866 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2867 if (Style == TailFoldingStyle::None) 2868 CheckMinIters = 2869 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2870 else if (VF.isScalable() && 2871 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2872 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2873 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2874 // an overflow to zero when updating induction variables and so an 2875 // additional overflow check is required before entering the vector loop. 2876 2877 // Get the maximum unsigned value for the type. 2878 Value *MaxUIntTripCount = 2879 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2880 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2881 2882 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2883 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2884 } 2885 2886 // Create new preheader for vector loop. 2887 LoopVectorPreHeader = 2888 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2889 "vector.ph"); 2890 2891 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2892 DT->getNode(Bypass)->getIDom()) && 2893 "TC check is expected to dominate Bypass"); 2894 2895 // Update dominator for Bypass & LoopExit (if needed). 2896 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2897 if (!Cost->requiresScalarEpilogue(VF.isVector())) 2898 // If there is an epilogue which must run, there's no edge from the 2899 // middle block to exit blocks and thus no need to update the immediate 2900 // dominator of the exit blocks. 
2901 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2902 2903 BranchInst &BI = 2904 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 2905 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 2906 setBranchWeights(BI, MinItersBypassWeights); 2907 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 2908 LoopBypassBlocks.push_back(TCCheckBlock); 2909 } 2910 2911 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2912 BasicBlock *const SCEVCheckBlock = 2913 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 2914 if (!SCEVCheckBlock) 2915 return nullptr; 2916 2917 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2918 (OptForSizeBasedOnProfile && 2919 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2920 "Cannot SCEV check stride or overflow when optimizing for size"); 2921 2922 2923 // Update dominator only if this is first RT check. 2924 if (LoopBypassBlocks.empty()) { 2925 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2926 if (!Cost->requiresScalarEpilogue(VF.isVector())) 2927 // If there is an epilogue which must run, there's no edge from the 2928 // middle block to exit blocks and thus no need to update the immediate 2929 // dominator of the exit blocks. 2930 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2931 } 2932 2933 LoopBypassBlocks.push_back(SCEVCheckBlock); 2934 AddedSafetyChecks = true; 2935 return SCEVCheckBlock; 2936 } 2937 2938 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2939 // VPlan-native path does not do any analysis for runtime checks currently. 2940 if (EnableVPlanNativePath) 2941 return nullptr; 2942 2943 BasicBlock *const MemCheckBlock = 2944 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2945 2946 // Check if we generated code that checks in runtime if arrays overlap. We put 2947 // the checks into a separate block to make the more common case of few 2948 // elements faster. 
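// Illustrative shorthand of one such pairwise overlap check (the actual IR
// comes from the runtime-check machinery built on LoopAccessAnalysis and may
// differ): two accessed ranges [A, A.end) and [B, B.end) are independent if
//   A.end <= B || B.end <= A
// and the vector loop is only entered when all such checks pass.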
2949 if (!MemCheckBlock) 2950 return nullptr; 2951 2952 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2953 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2954 "Cannot emit memory checks when optimizing for size, unless forced " 2955 "to vectorize."); 2956 ORE->emit([&]() { 2957 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2958 OrigLoop->getStartLoc(), 2959 OrigLoop->getHeader()) 2960 << "Code-size may be reduced by not forcing " 2961 "vectorization, or by source-code modifications " 2962 "eliminating the need for runtime checks " 2963 "(e.g., adding 'restrict')."; 2964 }); 2965 } 2966 2967 LoopBypassBlocks.push_back(MemCheckBlock); 2968 2969 AddedSafetyChecks = true; 2970 2971 return MemCheckBlock; 2972 } 2973 2974 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 2975 LoopScalarBody = OrigLoop->getHeader(); 2976 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2977 assert(LoopVectorPreHeader && "Invalid loop structure"); 2978 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 2979 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && 2980 "multiple exit loop without required epilogue?"); 2981 2982 LoopMiddleBlock = 2983 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2984 LI, nullptr, Twine(Prefix) + "middle.block"); 2985 LoopScalarPreHeader = 2986 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2987 nullptr, Twine(Prefix) + "scalar.ph"); 2988 2989 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 2990 2991 // Set up the middle block terminator. Two cases: 2992 // 1) If we know that we must execute the scalar epilogue, emit an 2993 // unconditional branch. 2994 // 2) Otherwise, we must have a single unique exit block (due to how we 2995 // implement the multiple exit case). In this case, set up a conditional 2996 // branch from the middle block to the loop scalar preheader, and the 2997 // exit block. completeLoopSkeleton will update the condition to use an 2998 // iteration check, if required to decide whether to execute the remainder. 2999 BranchInst *BrInst = 3000 Cost->requiresScalarEpilogue(VF.isVector()) 3001 ? BranchInst::Create(LoopScalarPreHeader) 3002 : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3003 Builder.getTrue()); 3004 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3005 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3006 3007 // Update dominator for loop exit. During skeleton creation, only the vector 3008 // pre-header and the middle block are created. The vector loop is entirely 3009 // created during VPlan execution. 3010 if (!Cost->requiresScalarEpilogue(VF.isVector())) 3011 // If there is an epilogue which must run, there's no edge from the 3012 // middle block to exit blocks and thus no need to update the immediate 3013 // dominator of the exit blocks.
3014 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3015 } 3016 3017 PHINode *InnerLoopVectorizer::createInductionResumeValue( 3018 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, 3019 ArrayRef<BasicBlock *> BypassBlocks, 3020 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3021 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3022 assert(VectorTripCount && "Expected valid arguments"); 3023 3024 Instruction *OldInduction = Legal->getPrimaryInduction(); 3025 Value *&EndValue = IVEndValues[OrigPhi]; 3026 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3027 if (OrigPhi == OldInduction) { 3028 // We know what the end value is. 3029 EndValue = VectorTripCount; 3030 } else { 3031 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3032 3033 // Fast-math-flags propagate from the original induction instruction. 3034 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3035 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3036 3037 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), 3038 Step, II.getKind(), II.getInductionBinOp()); 3039 EndValue->setName("ind.end"); 3040 3041 // Compute the end value for the additional bypass (if applicable). 3042 if (AdditionalBypass.first) { 3043 B.SetInsertPoint(AdditionalBypass.first, 3044 AdditionalBypass.first->getFirstInsertionPt()); 3045 EndValueFromAdditionalBypass = 3046 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(), 3047 Step, II.getKind(), II.getInductionBinOp()); 3048 EndValueFromAdditionalBypass->setName("ind.end"); 3049 } 3050 } 3051 3052 // Create phi nodes to merge from the backedge-taken check block. 3053 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3054 LoopScalarPreHeader->getTerminator()); 3055 // Copy original phi DL over to the new one. 3056 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3057 3058 // The new PHI merges the original incoming value, in case of a bypass, 3059 // or the value at the end of the vectorized loop. 3060 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3061 3062 // Fix the scalar body counter (PHI node). 3063 // The old induction's phi node in the scalar body needs the truncated 3064 // value. 3065 for (BasicBlock *BB : BypassBlocks) 3066 BCResumeVal->addIncoming(II.getStartValue(), BB); 3067 3068 if (AdditionalBypass.first) 3069 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3070 EndValueFromAdditionalBypass); 3071 return BCResumeVal; 3072 } 3073 3074 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 3075 /// expansion results. 3076 static Value *getExpandedStep(const InductionDescriptor &ID, 3077 const SCEV2ValueTy &ExpandedSCEVs) { 3078 const SCEV *Step = ID.getStep(); 3079 if (auto *C = dyn_cast<SCEVConstant>(Step)) 3080 return C->getValue(); 3081 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 3082 return U->getValue(); 3083 auto I = ExpandedSCEVs.find(Step); 3084 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 3085 return I->second; 3086 } 3087 3088 void InnerLoopVectorizer::createInductionResumeValues( 3089 const SCEV2ValueTy &ExpandedSCEVs, 3090 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3091 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3092 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3093 "Inconsistent information about additional bypass."); 3094 // We are going to resume the execution of the scalar loop. 
3095 // Go over all of the induction variables that we found and fix the 3096 // PHIs that are left in the scalar version of the loop. 3097 // The starting values of PHI nodes depend on the counter of the last 3098 // iteration in the vectorized loop. 3099 // If we come from a bypass edge then we need to start from the original 3100 // start value. 3101 for (const auto &InductionEntry : Legal->getInductionVars()) { 3102 PHINode *OrigPhi = InductionEntry.first; 3103 const InductionDescriptor &II = InductionEntry.second; 3104 PHINode *BCResumeVal = createInductionResumeValue( 3105 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, 3106 AdditionalBypass); 3107 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3108 } 3109 } 3110 3111 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { 3112 // The trip counts should be cached by now. 3113 Value *Count = getTripCount(); 3114 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3115 3116 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3117 3118 // Add a check in the middle block to see if we have completed 3119 // all of the iterations in the first vector loop. Three cases: 3120 // 1) If we require a scalar epilogue, there is no conditional branch as 3121 // we unconditionally branch to the scalar preheader. Do nothing. 3122 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3123 // Thus if tail is to be folded, we know we don't need to run the 3124 // remainder and we can use the previous value for the condition (true). 3125 // 3) Otherwise, construct a runtime check. 3126 if (!Cost->requiresScalarEpilogue(VF.isVector()) && 3127 !Cost->foldTailByMasking()) { 3128 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3129 // of the corresponding compare because they may have ended up with 3130 // different line numbers and we want to avoid awkward line stepping while 3131 // debugging. Eg. if the compare has got a line number inside the loop. 3132 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant 3133 // operands. Perform simplification directly on VPlan once the branch is 3134 // modeled there. 3135 IRBuilder<> B(LoopMiddleBlock->getTerminator()); 3136 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); 3137 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); 3138 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator()); 3139 BI.setCondition(CmpN); 3140 if (hasBranchWeightMD(*ScalarLatchTerm)) { 3141 // Assume that `Count % VectorTripCount` is equally distributed. 3142 unsigned TripCount = UF * VF.getKnownMinValue(); 3143 assert(TripCount > 0 && "trip count should not be zero"); 3144 const uint32_t Weights[] = {1, TripCount - 1}; 3145 setBranchWeights(BI, Weights); 3146 } 3147 } 3148 3149 #ifdef EXPENSIVE_CHECKS 3150 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3151 #endif 3152 3153 return LoopVectorPreHeader; 3154 } 3155 3156 std::pair<BasicBlock *, Value *> 3157 InnerLoopVectorizer::createVectorizedLoopSkeleton( 3158 const SCEV2ValueTy &ExpandedSCEVs) { 3159 /* 3160 In this function we generate a new loop. The new loop will contain 3161 the vectorized instructions while the old loop will continue to run the 3162 scalar remainder. 3163 3164 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 3165 / | preheader are expanded here. Eventually all required SCEV 3166 / | expansion should happen here. 
3167 / v 3168 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3169 | / | 3170 | / v 3171 || [ ] <-- vector pre header. 3172 |/ | 3173 | v 3174 | [ ] \ 3175 | [ ]_| <-- vector loop (created during VPlan execution). 3176 | | 3177 | v 3178 \ -[ ] <--- middle-block. 3179 \/ | 3180 /\ v 3181 | ->[ ] <--- new preheader. 3182 | | 3183 (opt) v <-- edge from middle to exit iff epilogue is not required. 3184 | [ ] \ 3185 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3186 \ | 3187 \ v 3188 >[ ] <-- exit block(s). 3189 ... 3190 */ 3191 3192 // Create an empty vector loop, and prepare basic blocks for the runtime 3193 // checks. 3194 createVectorLoopSkeleton(""); 3195 3196 // Now, compare the new count to zero. If it is zero skip the vector loop and 3197 // jump to the scalar loop. This check also covers the case where the 3198 // backedge-taken count is uint##_max: adding one to it will overflow leading 3199 // to an incorrect trip count of zero. In this (rare) case we will also jump 3200 // to the scalar loop. 3201 emitIterationCountCheck(LoopScalarPreHeader); 3202 3203 // Generate the code to check any assumptions that we've made for SCEV 3204 // expressions. 3205 emitSCEVChecks(LoopScalarPreHeader); 3206 3207 // Generate the code that checks in runtime if arrays overlap. We put the 3208 // checks into a separate block to make the more common case of few elements 3209 // faster. 3210 emitMemRuntimeChecks(LoopScalarPreHeader); 3211 3212 // Emit phis for the new starting index of the scalar loop. 3213 createInductionResumeValues(ExpandedSCEVs); 3214 3215 return {completeLoopSkeleton(), nullptr}; 3216 } 3217 3218 // Fix up external users of the induction variable. At this point, we are 3219 // in LCSSA form, with all external PHIs that use the IV having one input value, 3220 // coming from the remainder loop. We need those PHIs to also have a correct 3221 // value for the IV when arriving directly from the middle block. 3222 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3223 const InductionDescriptor &II, 3224 Value *VectorTripCount, Value *EndValue, 3225 BasicBlock *MiddleBlock, 3226 BasicBlock *VectorHeader, VPlan &Plan, 3227 VPTransformState &State) { 3228 // There are two kinds of external IV usages - those that use the value 3229 // computed in the last iteration (the PHI) and those that use the penultimate 3230 // value (the value that feeds into the phi from the loop latch). 3231 // We allow both, but they, obviously, have different values. 3232 3233 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3234 3235 DenseMap<Value *, Value *> MissingVals; 3236 3237 // An external user of the last iteration's value should see the value that 3238 // the remainder loop uses to initialize its own IV. 3239 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3240 for (User *U : PostInc->users()) { 3241 Instruction *UI = cast<Instruction>(U); 3242 if (!OrigLoop->contains(UI)) { 3243 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3244 MissingVals[UI] = EndValue; 3245 } 3246 } 3247 3248 // An external user of the penultimate value need to see EndValue - Step. 3249 // The simplest way to get this is to recompute it from the constituent SCEVs, 3250 // that is Start + (Step * (CRD - 1)). 
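// Illustrative shorthand for an integer induction (an exposition-only
// sketch, not the exact IR emitted): with vector trip count %n.vec,
//   %cmo        = sub %n.vec, 1              ; "CRD - 1"
//   %ind.escape = Start + Step * %cmo        ; value the phi held in the
//                                            ; final vector iteration
// which is what such penultimate-value users observe when control reaches
// the exit block directly from the middle block.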
3251 for (User *U : OrigPhi->users()) { 3252 auto *UI = cast<Instruction>(U); 3253 if (!OrigLoop->contains(UI)) { 3254 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3255 IRBuilder<> B(MiddleBlock->getTerminator()); 3256 3257 // Fast-math-flags propagate from the original induction instruction. 3258 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3259 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3260 3261 Value *CountMinusOne = B.CreateSub( 3262 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3263 CountMinusOne->setName("cmo"); 3264 3265 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); 3266 assert(StepVPV && "step must have been expanded during VPlan execution"); 3267 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() 3268 : State.get(StepVPV, {0, 0}); 3269 Value *Escape = 3270 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, 3271 II.getKind(), II.getInductionBinOp()); 3272 Escape->setName("ind.escape"); 3273 MissingVals[UI] = Escape; 3274 } 3275 } 3276 3277 for (auto &I : MissingVals) { 3278 PHINode *PHI = cast<PHINode>(I.first); 3279 // One corner case we have to handle is two IVs "chasing" each-other, 3280 // that is %IV2 = phi [...], [ %IV1, %latch ] 3281 // In this case, if IV1 has an external use, we need to avoid adding both 3282 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3283 // don't already have an incoming value for the middle block. 3284 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3285 PHI->addIncoming(I.second, MiddleBlock); 3286 Plan.removeLiveOut(PHI); 3287 } 3288 } 3289 } 3290 3291 namespace { 3292 3293 struct CSEDenseMapInfo { 3294 static bool canHandle(const Instruction *I) { 3295 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3296 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3297 } 3298 3299 static inline Instruction *getEmptyKey() { 3300 return DenseMapInfo<Instruction *>::getEmptyKey(); 3301 } 3302 3303 static inline Instruction *getTombstoneKey() { 3304 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3305 } 3306 3307 static unsigned getHashValue(const Instruction *I) { 3308 assert(canHandle(I) && "Unknown instruction!"); 3309 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3310 I->value_op_end())); 3311 } 3312 3313 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3314 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3315 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3316 return LHS == RHS; 3317 return LHS->isIdenticalTo(RHS); 3318 } 3319 }; 3320 3321 } // end anonymous namespace 3322 3323 ///Perform cse of induction variable instructions. 3324 static void cse(BasicBlock *BB) { 3325 // Perform simple cse. 3326 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3327 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3328 if (!CSEDenseMapInfo::canHandle(&In)) 3329 continue; 3330 3331 // Check if we can replace this instruction with any of the 3332 // visited instructions. 3333 if (Instruction *V = CSEMap.lookup(&In)) { 3334 In.replaceAllUsesWith(V); 3335 In.eraseFromParent(); 3336 continue; 3337 } 3338 3339 CSEMap[&In] = &In; 3340 } 3341 } 3342 3343 InstructionCost 3344 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3345 ElementCount VF) const { 3346 // We only need to calculate a cost if the VF is scalar; for actual vectors 3347 // we should already have a pre-calculated cost at each VF. 
3348 if (!VF.isScalar()) 3349 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; 3350 3351 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3352 Type *RetTy = CI->getType(); 3353 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 3354 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) 3355 return *RedCost; 3356 3357 SmallVector<Type *, 4> Tys; 3358 for (auto &ArgOp : CI->args()) 3359 Tys.push_back(ArgOp->getType()); 3360 3361 InstructionCost ScalarCallCost = 3362 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); 3363 3364 // If this is an intrinsic we may have a lower cost for it. 3365 if (getVectorIntrinsicIDForCall(CI, TLI)) { 3366 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 3367 return std::min(ScalarCallCost, IntrinsicCost); 3368 } 3369 return ScalarCallCost; 3370 } 3371 3372 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3373 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3374 return Elt; 3375 return VectorType::get(Elt, VF); 3376 } 3377 3378 InstructionCost 3379 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3380 ElementCount VF) const { 3381 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3382 assert(ID && "Expected intrinsic call!"); 3383 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3384 FastMathFlags FMF; 3385 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3386 FMF = FPMO->getFastMathFlags(); 3387 3388 SmallVector<const Value *> Arguments(CI->args()); 3389 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3390 SmallVector<Type *> ParamTys; 3391 std::transform(FTy->param_begin(), FTy->param_end(), 3392 std::back_inserter(ParamTys), 3393 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3394 3395 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3396 dyn_cast<IntrinsicInst>(CI)); 3397 return TTI.getIntrinsicInstrCost(CostAttrs, 3398 TargetTransformInfo::TCK_RecipThroughput); 3399 } 3400 3401 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3402 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3403 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3404 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3405 } 3406 3407 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3408 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3409 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3410 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3411 } 3412 3413 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3414 VPlan &Plan) { 3415 // Fix widened non-induction PHIs by setting up the PHI operands. 3416 if (EnableVPlanNativePath) 3417 fixNonInductionPHIs(Plan, State); 3418 3419 // At this point every instruction in the original loop is widened to a 3420 // vector form. Now we need to fix the recurrences in the loop. These PHI 3421 // nodes are currently empty because we did not want to introduce cycles. 3422 // This is the second stage of vectorizing recurrences. Note that fixing 3423 // reduction phis are already modeled in VPlan. 3424 // TODO: Also model fixing fixed-order recurrence phis in VPlan. 
3425 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); 3426 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); 3427 for (VPRecipeBase &R : HeaderVPBB->phis()) { 3428 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3429 fixFixedOrderRecurrence(FOR, State); 3430 } 3431 3432 // Forget the original basic block. 3433 PSE.getSE()->forgetLoop(OrigLoop); 3434 PSE.getSE()->forgetBlockAndLoopDispositions(); 3435 3436 // After vectorization, the exit blocks of the original loop will have 3437 // additional predecessors. Invalidate SCEVs for the exit phis in case SE 3438 // looked through single-entry phis. 3439 SmallVector<BasicBlock *> ExitBlocks; 3440 OrigLoop->getExitBlocks(ExitBlocks); 3441 for (BasicBlock *Exit : ExitBlocks) 3442 for (PHINode &PN : Exit->phis()) 3443 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); 3444 3445 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); 3446 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3447 if (Cost->requiresScalarEpilogue(VF.isVector())) { 3448 // No edge from the middle block to the unique exit block has been inserted 3449 // and there is nothing to fix from vector loop; phis should have incoming 3450 // from scalar loop only. 3451 } else { 3452 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking 3453 // the cost model. 3454 3455 // If we inserted an edge from the middle block to the unique exit block, 3456 // update uses outside the loop (phis) to account for the newly inserted 3457 // edge. 3458 3459 // Fix-up external users of the induction variables. 3460 for (const auto &Entry : Legal->getInductionVars()) 3461 fixupIVUsers(Entry.first, Entry.second, 3462 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3463 IVEndValues[Entry.first], LoopMiddleBlock, 3464 VectorLoop->getHeader(), Plan, State); 3465 } 3466 3467 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3468 // in the exit block, so update the builder. 3469 State.Builder.SetInsertPoint(State.CFG.ExitBB, 3470 State.CFG.ExitBB->getFirstNonPHIIt()); 3471 for (const auto &KV : Plan.getLiveOuts()) 3472 KV.second->fixPhi(Plan, State); 3473 3474 for (Instruction *PI : PredicatedInstructions) 3475 sinkScalarOperands(&*PI); 3476 3477 // Remove redundant induction instructions. 3478 cse(VectorLoop->getHeader()); 3479 3480 // Set/update profile weights for the vector and remainder loops as original 3481 // loop iterations are now distributed among them. Note that the original loop 3482 // represented by LoopScalarBody becomes the remainder loop after vectorization. 3483 // 3484 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 3485 // end up getting a slightly roughened result but that should be OK since 3486 // profile is not inherently precise anyway. Note also that a possible bypass of 3487 // vector code caused by legality checks is ignored, assigning all the weight 3488 // to the vector loop, optimistically. 3489 // 3490 // For scalable vectorization we can't know at compile time how many iterations 3491 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3492 // vscale of '1'.
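// Illustrative arithmetic (numbers chosen purely for exposition, not
// computed verbatim by the call below): if the profile estimated roughly
// 996 iterations per loop entry and VF * UF == 8, the vector loop is
// credited with about 996 / 8 == 124 iterations per entry and the scalar
// remainder loop with the leftover 996 % 8 == 4.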
3493 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3494 LI->getLoopFor(LoopScalarBody), 3495 VF.getKnownMinValue() * UF); 3496 } 3497 3498 void InnerLoopVectorizer::fixFixedOrderRecurrence( 3499 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3500 // This is the second phase of vectorizing first-order recurrences. An 3501 // overview of the transformation is described below. Suppose we have the 3502 // following loop. 3503 // 3504 // for (int i = 0; i < n; ++i) 3505 // b[i] = a[i] - a[i - 1]; 3506 // 3507 // There is a first-order recurrence on "a". For this loop, the shorthand 3508 // scalar IR looks like: 3509 // 3510 // scalar.ph: 3511 // s_init = a[-1] 3512 // br scalar.body 3513 // 3514 // scalar.body: 3515 // i = phi [0, scalar.ph], [i+1, scalar.body] 3516 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3517 // s2 = a[i] 3518 // b[i] = s2 - s1 3519 // br cond, scalar.body, ... 3520 // 3521 // In this example, s1 is a recurrence because it's value depends on the 3522 // previous iteration. In the first phase of vectorization, we created a 3523 // vector phi v1 for s1. We now complete the vectorization and produce the 3524 // shorthand vector IR shown below (for VF = 4, UF = 1). 3525 // 3526 // vector.ph: 3527 // v_init = vector(..., ..., ..., a[-1]) 3528 // br vector.body 3529 // 3530 // vector.body 3531 // i = phi [0, vector.ph], [i+4, vector.body] 3532 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3533 // v2 = a[i, i+1, i+2, i+3]; 3534 // v3 = vector(v1(3), v2(0, 1, 2)) 3535 // b[i, i+1, i+2, i+3] = v2 - v3 3536 // br cond, vector.body, middle.block 3537 // 3538 // middle.block: 3539 // x = v2(3) 3540 // br scalar.ph 3541 // 3542 // scalar.ph: 3543 // s_init = phi [x, middle.block], [a[-1], otherwise] 3544 // br scalar.body 3545 // 3546 // After execution completes the vector loop, we extract the next value of 3547 // the recurrence (x) to use as the initial value in the scalar loop. 3548 3549 // Extract the last vector element in the middle block. This will be the 3550 // initial value for the recurrence when jumping to the scalar loop. 3551 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3552 Value *Incoming = State.get(PreviousDef, UF - 1); 3553 auto *ExtractForScalar = Incoming; 3554 auto *IdxTy = Builder.getInt32Ty(); 3555 Value *RuntimeVF = nullptr; 3556 if (VF.isVector()) { 3557 auto *One = ConstantInt::get(IdxTy, 1); 3558 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3559 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3560 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3561 ExtractForScalar = 3562 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); 3563 } 3564 3565 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin()); 3566 assert(PhiR->getNumUsers() == 1 && 3567 RecurSplice->getOpcode() == 3568 VPInstruction::FirstOrderRecurrenceSplice && 3569 "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); 3570 SmallVector<VPLiveOut *> LiveOuts; 3571 for (VPUser *U : RecurSplice->users()) 3572 if (auto *LiveOut = dyn_cast<VPLiveOut>(U)) 3573 LiveOuts.push_back(LiveOut); 3574 3575 if (!LiveOuts.empty()) { 3576 // Extract the second last element in the middle block if the 3577 // Phi is used outside the loop. We need to extract the phi itself 3578 // and not the last element (the phi update in the current iteration). This 3579 // will be the value when jumping to the exit block from the 3580 // LoopMiddleBlock, when the scalar loop is not run at all. 
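// In the shorthand example above (VF = 4, UF = 1) this is the difference
// between x = v2(3), the resume value fed to the scalar loop, and v2(2),
// the value the recurrence phi itself held in the final vector iteration,
// which is what an exit-block user of the phi must see.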
3581 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3582 if (VF.isVector()) { 3583 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3584 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3585 Incoming, Idx, "vector.recur.extract.for.phi"); 3586 } else { 3587 assert(UF > 1 && "VF and UF cannot both be 1"); 3588 // When loop is unrolled without vectorizing, initialize 3589 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled 3590 // value of `Incoming`. This is analogous to the vectorized case above: 3591 // extracting the second last element when VF > 1. 3592 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3593 } 3594 3595 for (VPLiveOut *LiveOut : LiveOuts) { 3596 assert(!Cost->requiresScalarEpilogue(VF.isVector())); 3597 PHINode *LCSSAPhi = LiveOut->getPhi(); 3598 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3599 State.Plan->removeLiveOut(LCSSAPhi); 3600 } 3601 } 3602 3603 // Fix the initial value of the original recurrence in the scalar loop. 3604 Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin()); 3605 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3606 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3607 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3608 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3609 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3610 Start->addIncoming(Incoming, BB); 3611 } 3612 3613 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3614 Phi->setName("scalar.recur"); 3615 } 3616 3617 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3618 // The basic block and loop containing the predicated instruction. 3619 auto *PredBB = PredInst->getParent(); 3620 auto *VectorLoop = LI->getLoopFor(PredBB); 3621 3622 // Initialize a worklist with the operands of the predicated instruction. 3623 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3624 3625 // Holds instructions that we need to analyze again. An instruction may be 3626 // reanalyzed if we don't yet know if we can sink it or not. 3627 SmallVector<Instruction *, 8> InstsToReanalyze; 3628 3629 // Returns true if a given use occurs in the predicated block. Phi nodes use 3630 // their operands in their corresponding predecessor blocks. 3631 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3632 auto *I = cast<Instruction>(U.getUser()); 3633 BasicBlock *BB = I->getParent(); 3634 if (auto *Phi = dyn_cast<PHINode>(I)) 3635 BB = Phi->getIncomingBlock( 3636 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3637 return BB == PredBB; 3638 }; 3639 3640 // Iteratively sink the scalarized operands of the predicated instruction 3641 // into the block we created for it. When an instruction is sunk, it's 3642 // operands are then added to the worklist. The algorithm ends after one pass 3643 // through the worklist doesn't sink a single instruction. 3644 bool Changed; 3645 do { 3646 // Add the instructions that need to be reanalyzed to the worklist, and 3647 // reset the changed indicator. 3648 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3649 InstsToReanalyze.clear(); 3650 Changed = false; 3651 3652 while (!Worklist.empty()) { 3653 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3654 3655 // We can't sink an instruction if it is a phi node, is not in the loop, 3656 // may have side effects or may read from memory. 
3657 // TODO: Could do more granular checking to allow sinking a load past non-store instructions. 3658 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 3659 I->mayHaveSideEffects() || I->mayReadFromMemory()) 3660 continue; 3661 3662 // If the instruction is already in PredBB, check if we can sink its 3663 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 3664 // sinking the scalar instruction I, hence it appears in PredBB; but it 3665 // may have failed to sink I's operands (recursively), which we try 3666 // (again) here. 3667 if (I->getParent() == PredBB) { 3668 Worklist.insert(I->op_begin(), I->op_end()); 3669 continue; 3670 } 3671 3672 // It's legal to sink the instruction if all its uses occur in the 3673 // predicated block. Otherwise, there's nothing to do yet, and we may 3674 // need to reanalyze the instruction. 3675 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 3676 InstsToReanalyze.push_back(I); 3677 continue; 3678 } 3679 3680 // Move the instruction to the beginning of the predicated block, and add 3681 // its operands to the worklist. 3682 I->moveBefore(&*PredBB->getFirstInsertionPt()); 3683 Worklist.insert(I->op_begin(), I->op_end()); 3684 3685 // The sinking may have enabled other instructions to be sunk, so we will 3686 // need to iterate. 3687 Changed = true; 3688 } 3689 } while (Changed); 3690 } 3691 3692 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 3693 VPTransformState &State) { 3694 auto Iter = vp_depth_first_deep(Plan.getEntry()); 3695 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 3696 for (VPRecipeBase &P : VPBB->phis()) { 3697 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 3698 if (!VPPhi) 3699 continue; 3700 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 3701 // Make sure the builder has a valid insert point. 3702 Builder.SetInsertPoint(NewPhi); 3703 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 3704 VPValue *Inc = VPPhi->getIncomingValue(i); 3705 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 3706 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 3707 } 3708 } 3709 } 3710 } 3711 3712 bool InnerLoopVectorizer::useOrderedReductions( 3713 const RecurrenceDescriptor &RdxDesc) { 3714 return Cost->useOrderedReductions(RdxDesc); 3715 } 3716 3717 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 3718 // We should not collect Scalars more than once per VF. Right now, this 3719 // function is called from collectUniformsAndScalars(), which already does 3720 // this check. Collecting Scalars for VF=1 does not make any sense. 3721 assert(VF.isVector() && !Scalars.contains(VF) && 3722 "This function should not be visited twice for the same VF"); 3723 3724 // This avoids any chances of creating a REPLICATE recipe during planning 3725 // since that would result in generation of scalarized code during execution, 3726 // which is not supported for scalable vectors. 3727 if (VF.isScalable()) { 3728 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3729 return; 3730 } 3731 3732 SmallSetVector<Instruction *, 8> Worklist; 3733 3734 // These sets are used to seed the analysis with pointers used by memory 3735 // accesses that will remain scalar. 3736 SmallSetVector<Instruction *, 8> ScalarPtrs; 3737 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 3738 auto *Latch = TheLoop->getLoopLatch(); 3739 3740 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3741 // The pointer operands of loads and stores will be scalar as long as the 3742 // memory access is not a gather or scatter operation. The value operand of a 3743 // store will remain scalar if the store is scalarized. 3744 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 3745 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 3746 assert(WideningDecision != CM_Unknown && 3747 "Widening decision should be ready at this moment"); 3748 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 3749 if (Ptr == Store->getValueOperand()) 3750 return WideningDecision == CM_Scalarize; 3751 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 3752 "Ptr is neither a value nor a pointer operand"); 3753 return WideningDecision != CM_GatherScatter; 3754 }; 3755 3756 // A helper that returns true if the given value is a bitcast or 3757 // getelementptr instruction contained in the loop. 3758 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 3759 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 3760 isa<GetElementPtrInst>(V)) && 3761 !TheLoop->isLoopInvariant(V); 3762 }; 3763 3764 // A helper that evaluates a memory access's use of a pointer. If the use will 3765 // be a scalar use and the pointer is only used by memory accesses, we place 3766 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 3767 // PossibleNonScalarPtrs. 3768 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 3769 // We only care about bitcast and getelementptr instructions contained in 3770 // the loop. 3771 if (!isLoopVaryingBitCastOrGEP(Ptr)) 3772 return; 3773 3774 // If the pointer has already been identified as scalar (e.g., if it was 3775 // also identified as uniform), there's nothing to do. 3776 auto *I = cast<Instruction>(Ptr); 3777 if (Worklist.count(I)) 3778 return; 3779 3780 // If the use of the pointer will be a scalar use, and all users of the 3781 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 3782 // place the pointer in PossibleNonScalarPtrs. 3783 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 3784 return isa<LoadInst>(U) || isa<StoreInst>(U); 3785 })) 3786 ScalarPtrs.insert(I); 3787 else 3788 PossibleNonScalarPtrs.insert(I); 3789 }; 3790 3791 // We seed the scalars analysis with two classes of instructions: (1) 3792 // instructions marked uniform-after-vectorization and (2) bitcast, 3793 // getelementptr and (pointer) phi instructions used by memory accesses 3794 // requiring a scalar use. 3795 // 3796 // (1) Add to the worklist all instructions that have been identified as 3797 // uniform-after-vectorization. 3798 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3799 3800 // (2) Add to the worklist all bitcast and getelementptr instructions used by 3801 // memory accesses requiring a scalar use. The pointer operands of loads and 3802 // stores will be scalar as long as the memory access is not a gather or 3803 // scatter operation. The value operand of a store will remain scalar if the 3804 // store is scalarized.
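// For example (shorthand, an assumed scenario): a loop-varying
// getelementptr whose only users are loads/stores widened as consecutive
// accesses (CM_Widen/CM_Widen_Reverse) or scalarized (CM_Scalarize) ends up
// in ScalarPtrs and stays scalar, whereas one feeding an access that becomes
// a gather/scatter (CM_GatherScatter) lands in PossibleNonScalarPtrs.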
3805 for (auto *BB : TheLoop->blocks()) 3806 for (auto &I : *BB) { 3807 if (auto *Load = dyn_cast<LoadInst>(&I)) { 3808 evaluatePtrUse(Load, Load->getPointerOperand()); 3809 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 3810 evaluatePtrUse(Store, Store->getPointerOperand()); 3811 evaluatePtrUse(Store, Store->getValueOperand()); 3812 } 3813 } 3814 for (auto *I : ScalarPtrs) 3815 if (!PossibleNonScalarPtrs.count(I)) { 3816 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 3817 Worklist.insert(I); 3818 } 3819 3820 // Insert the forced scalars. 3821 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 3822 // induction variable when the PHI user is scalarized. 3823 auto ForcedScalar = ForcedScalars.find(VF); 3824 if (ForcedScalar != ForcedScalars.end()) 3825 for (auto *I : ForcedScalar->second) { 3826 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 3827 Worklist.insert(I); 3828 } 3829 3830 // Expand the worklist by looking through any bitcasts and getelementptr 3831 // instructions we've already identified as scalar. This is similar to the 3832 // expansion step in collectLoopUniforms(); however, here we're only 3833 // expanding to include additional bitcasts and getelementptr instructions. 3834 unsigned Idx = 0; 3835 while (Idx != Worklist.size()) { 3836 Instruction *Dst = Worklist[Idx++]; 3837 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 3838 continue; 3839 auto *Src = cast<Instruction>(Dst->getOperand(0)); 3840 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 3841 auto *J = cast<Instruction>(U); 3842 return !TheLoop->contains(J) || Worklist.count(J) || 3843 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 3844 isScalarUse(J, Src)); 3845 })) { 3846 Worklist.insert(Src); 3847 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 3848 } 3849 } 3850 3851 // An induction variable will remain scalar if all users of the induction 3852 // variable and induction variable update remain scalar. 3853 for (const auto &Induction : Legal->getInductionVars()) { 3854 auto *Ind = Induction.first; 3855 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3856 3857 // If tail-folding is applied, the primary induction variable will be used 3858 // to feed a vector compare. 3859 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 3860 continue; 3861 3862 // Returns true if \p Indvar is a pointer induction that is used directly by 3863 // load/store instruction \p I. 3864 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 3865 Instruction *I) { 3866 return Induction.second.getKind() == 3867 InductionDescriptor::IK_PtrInduction && 3868 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 3869 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 3870 }; 3871 3872 // Determine if all users of the induction variable are scalar after 3873 // vectorization. 3874 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 3875 auto *I = cast<Instruction>(U); 3876 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3877 IsDirectLoadStoreFromPtrIndvar(Ind, I); 3878 }); 3879 if (!ScalarInd) 3880 continue; 3881 3882 // Determine if all users of the induction variable update instruction are 3883 // scalar after vectorization. 
3884 auto ScalarIndUpdate = 3885 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 3886 auto *I = cast<Instruction>(U); 3887 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 3888 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 3889 }); 3890 if (!ScalarIndUpdate) 3891 continue; 3892 3893 // The induction variable and its update instruction will remain scalar. 3894 Worklist.insert(Ind); 3895 Worklist.insert(IndUpdate); 3896 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 3897 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 3898 << "\n"); 3899 } 3900 3901 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 3902 } 3903 3904 bool LoopVectorizationCostModel::isScalarWithPredication( 3905 Instruction *I, ElementCount VF) const { 3906 if (!isPredicatedInst(I)) 3907 return false; 3908 3909 // Do we have a non-scalar lowering for this predicated 3910 // instruction? No - it is scalar with predication. 3911 switch(I->getOpcode()) { 3912 default: 3913 return true; 3914 case Instruction::Call: 3915 if (VF.isScalar()) 3916 return true; 3917 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF)) 3918 .Kind == CM_Scalarize; 3919 case Instruction::Load: 3920 case Instruction::Store: { 3921 auto *Ptr = getLoadStorePointerOperand(I); 3922 auto *Ty = getLoadStoreType(I); 3923 Type *VTy = Ty; 3924 if (VF.isVector()) 3925 VTy = VectorType::get(Ty, VF); 3926 const Align Alignment = getLoadStoreAlignment(I); 3927 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 3928 TTI.isLegalMaskedGather(VTy, Alignment)) 3929 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 3930 TTI.isLegalMaskedScatter(VTy, Alignment)); 3931 } 3932 case Instruction::UDiv: 3933 case Instruction::SDiv: 3934 case Instruction::SRem: 3935 case Instruction::URem: { 3936 // We have the option to use the safe-divisor idiom to avoid predication. 3937 // The cost based decision here will always select safe-divisor for 3938 // scalable vectors as scalarization isn't legal. 3939 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 3940 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); 3941 } 3942 } 3943 } 3944 3945 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { 3946 if (!blockNeedsPredicationForAnyReason(I->getParent())) 3947 return false; 3948 3949 // Can we prove this instruction is safe to unconditionally execute? 3950 // If not, we must use some form of predication. 3951 switch(I->getOpcode()) { 3952 default: 3953 return false; 3954 case Instruction::Load: 3955 case Instruction::Store: { 3956 if (!Legal->isMaskRequired(I)) 3957 return false; 3958 // When we know the load's address is loop invariant and the instruction 3959 // in the original scalar loop was unconditionally executed then we 3960 // don't need to mark it as a predicated instruction. Tail folding may 3961 // introduce additional predication, but we're guaranteed to always have 3962 // at least one active lane. We call Legal->blockNeedsPredication here 3963 // because it doesn't query tail-folding. For stores, we need to prove 3964 // both speculation safety (which follows from the same argument as loads), 3965 // but also must prove the value being stored is correct. The easiest 3966 // form of the later is to require that all values stored are the same. 
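// For example (shorthand, an assumed scenario): under tail folding,
//   %v = load i32, ptr %p   ; %p loop-invariant, block executed
//                           ; unconditionally in the original loop
// need not be treated as predicated, since at least one lane is always
// active; a store to %p is additionally required to store a loop-invariant
// value, so that every active lane writes the same thing.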
3967 if (Legal->isInvariant(getLoadStorePointerOperand(I)) && 3968 (isa<LoadInst>(I) || 3969 (isa<StoreInst>(I) && 3970 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && 3971 !Legal->blockNeedsPredication(I->getParent())) 3972 return false; 3973 return true; 3974 } 3975 case Instruction::UDiv: 3976 case Instruction::SDiv: 3977 case Instruction::SRem: 3978 case Instruction::URem: 3979 // TODO: We can use the loop-preheader as context point here and get 3980 // context sensitive reasoning 3981 return !isSafeToSpeculativelyExecute(I); 3982 case Instruction::Call: 3983 return Legal->isMaskRequired(I); 3984 } 3985 } 3986 3987 std::pair<InstructionCost, InstructionCost> 3988 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 3989 ElementCount VF) const { 3990 assert(I->getOpcode() == Instruction::UDiv || 3991 I->getOpcode() == Instruction::SDiv || 3992 I->getOpcode() == Instruction::SRem || 3993 I->getOpcode() == Instruction::URem); 3994 assert(!isSafeToSpeculativelyExecute(I)); 3995 3996 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3997 3998 // Scalarization isn't legal for scalable vector types 3999 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 4000 if (!VF.isScalable()) { 4001 // Get the scalarization cost and scale this amount by the probability of 4002 // executing the predicated block. If the instruction is not predicated, 4003 // we fall through to the next case. 4004 ScalarizationCost = 0; 4005 4006 // These instructions have a non-void type, so account for the phi nodes 4007 // that we will create. This cost is likely to be zero. The phi node 4008 // cost, if any, should be scaled by the block probability because it 4009 // models a copy at the end of each predicated block. 4010 ScalarizationCost += VF.getKnownMinValue() * 4011 TTI.getCFInstrCost(Instruction::PHI, CostKind); 4012 4013 // The cost of the non-predicated instruction. 4014 ScalarizationCost += VF.getKnownMinValue() * 4015 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 4016 4017 // The cost of insertelement and extractelement instructions needed for 4018 // scalarization. 4019 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); 4020 4021 // Scale the cost by the probability of executing the predicated blocks. 4022 // This assumes the predicated block for each vector lane is equally 4023 // likely. 4024 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 4025 } 4026 InstructionCost SafeDivisorCost = 0; 4027 4028 auto *VecTy = ToVectorTy(I->getType(), VF); 4029 4030 // The cost of the select guard to ensure all lanes are well defined 4031 // after we speculate above any internal control flow. 4032 SafeDivisorCost += TTI.getCmpSelInstrCost( 4033 Instruction::Select, VecTy, 4034 ToVectorTy(Type::getInt1Ty(I->getContext()), VF), 4035 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4036 4037 // Certain instructions can be cheaper to vectorize if they have a constant 4038 // second vector operand. One example of this are shifts on x86. 
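// Illustrative shorthand of the safe-divisor form being costed here (an
// exposition-only sketch with assumed names, not the exact IR emitted), for
// VF = 4:
//   %safe.div = select <4 x i1> %mask, <4 x i32> %d, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %res      = udiv <4 x i32> %x, %safe.div
// Masked-off lanes divide by 1, so the division can execute unconditionally;
// the select and the divide are the two costs summed into SafeDivisorCost,
// and the operand-info query below exploits a constant or uniform second
// operand where available.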
4039 Value *Op2 = I->getOperand(1); 4040 auto Op2Info = TTI.getOperandInfo(Op2); 4041 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 4042 Legal->isInvariant(Op2)) 4043 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 4044 4045 SmallVector<const Value *, 4> Operands(I->operand_values()); 4046 SafeDivisorCost += TTI.getArithmeticInstrCost( 4047 I->getOpcode(), VecTy, CostKind, 4048 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 4049 Op2Info, Operands, I); 4050 return {ScalarizationCost, SafeDivisorCost}; 4051 } 4052 4053 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4054 Instruction *I, ElementCount VF) { 4055 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4056 assert(getWideningDecision(I, VF) == CM_Unknown && 4057 "Decision should not be set yet."); 4058 auto *Group = getInterleavedAccessGroup(I); 4059 assert(Group && "Must have a group."); 4060 4061 // If the instruction's allocated size doesn't equal it's type size, it 4062 // requires padding and will be scalarized. 4063 auto &DL = I->getModule()->getDataLayout(); 4064 auto *ScalarTy = getLoadStoreType(I); 4065 if (hasIrregularType(ScalarTy, DL)) 4066 return false; 4067 4068 // If the group involves a non-integral pointer, we may not be able to 4069 // losslessly cast all values to a common type. 4070 unsigned InterleaveFactor = Group->getFactor(); 4071 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4072 for (unsigned i = 0; i < InterleaveFactor; i++) { 4073 Instruction *Member = Group->getMember(i); 4074 if (!Member) 4075 continue; 4076 auto *MemberTy = getLoadStoreType(Member); 4077 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4078 // Don't coerce non-integral pointers to integers or vice versa. 4079 if (MemberNI != ScalarNI) { 4080 // TODO: Consider adding special nullptr value case here 4081 return false; 4082 } else if (MemberNI && ScalarNI && 4083 ScalarTy->getPointerAddressSpace() != 4084 MemberTy->getPointerAddressSpace()) { 4085 return false; 4086 } 4087 } 4088 4089 // Check if masking is required. 4090 // A Group may need masking for one of two reasons: it resides in a block that 4091 // needs predication, or it was decided to use masking to deal with gaps 4092 // (either a gap at the end of a load-access that may result in a speculative 4093 // load, or any gaps in a store-access). 4094 bool PredicatedAccessRequiresMasking = 4095 blockNeedsPredicationForAnyReason(I->getParent()) && 4096 Legal->isMaskRequired(I); 4097 bool LoadAccessWithGapsRequiresEpilogMasking = 4098 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4099 !isScalarEpilogueAllowed(); 4100 bool StoreAccessWithGapsRequiresMasking = 4101 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4102 if (!PredicatedAccessRequiresMasking && 4103 !LoadAccessWithGapsRequiresEpilogMasking && 4104 !StoreAccessWithGapsRequiresMasking) 4105 return true; 4106 4107 // If masked interleaving is required, we expect that the user/target had 4108 // enabled it, because otherwise it either wouldn't have been created or 4109 // it should have been invalidated by the CostModel. 4110 assert(useMaskedInterleavedAccesses(TTI) && 4111 "Masked interleave-groups for predicated accesses are not enabled."); 4112 4113 if (Group->isReverse()) 4114 return false; 4115 4116 auto *Ty = getLoadStoreType(I); 4117 const Align Alignment = getLoadStoreAlignment(I); 4118 return isa<LoadInst>(I) ? 
TTI.isLegalMaskedLoad(Ty, Alignment)
4119 : TTI.isLegalMaskedStore(Ty, Alignment);
4120 }
4121
4122 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4123 Instruction *I, ElementCount VF) {
4124 // Get and ensure we have a valid memory instruction.
4125 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4126
4127 auto *Ptr = getLoadStorePointerOperand(I);
4128 auto *ScalarTy = getLoadStoreType(I);
4129
4130 // In order to be widened, the pointer should be consecutive, first of all.
4131 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4132 return false;
4133
4134 // If the instruction is a store located in a predicated block, it will be
4135 // scalarized.
4136 if (isScalarWithPredication(I, VF))
4137 return false;
4138
4139 // If the instruction's allocated size doesn't equal its type size, it
4140 // requires padding and will be scalarized.
4141 auto &DL = I->getModule()->getDataLayout();
4142 if (hasIrregularType(ScalarTy, DL))
4143 return false;
4144
4145 return true;
4146 }
4147
4148 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4149 // We should not collect Uniforms more than once per VF. Right now,
4150 // this function is called from collectUniformsAndScalars(), which
4151 // already does this check. Collecting Uniforms for VF=1 does not make any
4152 // sense.
4153
4154 assert(VF.isVector() && !Uniforms.contains(VF) &&
4155 "This function should not be visited twice for the same VF");
4156
4157 // Visit the list of Uniforms. Even if we do not find any uniform value, we
4158 // will not analyze it again: Uniforms.count(VF) will return 1.
4159 Uniforms[VF].clear();
4160
4161 // We now know that the loop is vectorizable!
4162 // Collect instructions inside the loop that will remain uniform after
4163 // vectorization.
4164
4165 // Global values, params and instructions outside of the current loop are out
4166 // of scope.
4167 auto isOutOfScope = [&](Value *V) -> bool {
4168 Instruction *I = dyn_cast<Instruction>(V);
4169 return (!I || !TheLoop->contains(I));
4170 };
4171
4172 // Worklist containing uniform instructions demanding lane 0.
4173 SetVector<Instruction *> Worklist;
4174 BasicBlock *Latch = TheLoop->getLoopLatch();
4175
4176 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4177 // that are scalar with predication must not be considered uniform after
4178 // vectorization, because that would create an erroneous replicating region
4179 // where only a single instance out of VF should be formed.
4180 // TODO: optimize such seldom cases if found important, see PR40816.
4181 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4182 if (isOutOfScope(I)) {
4183 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4184 << *I << "\n");
4185 return;
4186 }
4187 if (isScalarWithPredication(I, VF)) {
4188 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4189 << *I << "\n");
4190 return;
4191 }
4192 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4193 Worklist.insert(I);
4194 };
4195
4196 // Start with the conditional branch. If the branch condition is an
4197 // instruction contained in the loop that is only used by the branch, it is
4198 // uniform.
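// For example, a latch exit condition such as
//   %cmp = icmp eq i64 %iv.next, %n
//   br i1 %cmp, label %exit, label %loop
// only needs the lane-0 (scalar) value of %cmp after vectorization, so the
// compare can remain uniform instead of being widened.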
4199 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4200 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4201 addToWorklistIfAllowed(Cmp);
4202
4203 auto PrevVF = VF.divideCoefficientBy(2);
4204 // Return true if all lanes perform the same memory operation, and we can
4205 // thus choose to execute only one.
4206 auto isUniformMemOpUse = [&](Instruction *I) {
4207 // If the value was already known to not be uniform for the previous
4208 // (smaller VF), it cannot be uniform for the larger VF.
4209 if (PrevVF.isVector()) {
4210 auto Iter = Uniforms.find(PrevVF);
4211 if (Iter != Uniforms.end() && !Iter->second.contains(I))
4212 return false;
4213 }
4214 if (!Legal->isUniformMemOp(*I, VF))
4215 return false;
4216 if (isa<LoadInst>(I))
4217 // Loading the same address always produces the same result - at least
4218 // assuming aliasing and ordering which have already been checked.
4219 return true;
4220 // Storing the same value on every iteration.
4221 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4222 };
4223
4224 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4225 InstWidening WideningDecision = getWideningDecision(I, VF);
4226 assert(WideningDecision != CM_Unknown &&
4227 "Widening decision should be ready at this moment");
4228
4229 if (isUniformMemOpUse(I))
4230 return true;
4231
4232 return (WideningDecision == CM_Widen ||
4233 WideningDecision == CM_Widen_Reverse ||
4234 WideningDecision == CM_Interleave);
4235 };
4236
4237 // Returns true if Ptr is the pointer operand of a memory access instruction
4238 // I, I is known to not require scalarization, and the pointer is not also
4239 // stored.
4240 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4241 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4242 return false;
4243 return getLoadStorePointerOperand(I) == Ptr &&
4244 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4245 };
4246
4247 // Holds a list of values which are known to have at least one uniform use.
4248 // Note that there may be other uses which aren't uniform. A "uniform use"
4249 // here is something which only demands lane 0 of the unrolled iterations;
4250 // it does not imply that all lanes produce the same value (e.g. this is not
4251 // the usual meaning of uniform).
4252 SetVector<Value *> HasUniformUse;
4253
4254 // Scan the loop for instructions which are either a) known to have only
4255 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4256 for (auto *BB : TheLoop->blocks())
4257 for (auto &I : *BB) {
4258 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4259 switch (II->getIntrinsicID()) {
4260 case Intrinsic::sideeffect:
4261 case Intrinsic::experimental_noalias_scope_decl:
4262 case Intrinsic::assume:
4263 case Intrinsic::lifetime_start:
4264 case Intrinsic::lifetime_end:
4265 if (TheLoop->hasLoopInvariantOperands(&I))
4266 addToWorklistIfAllowed(&I);
4267 break;
4268 default:
4269 break;
4270 }
4271 }
4272
4273 // ExtractValue instructions must be uniform, because the operands are
4274 // known to be loop-invariant.
4275 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4276 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4277 "Expected aggregate value to be loop invariant");
4278 addToWorklistIfAllowed(EVI);
4279 continue;
4280 }
4281
4282 // If there's no pointer operand, there's nothing to do.
4283 auto *Ptr = getLoadStorePointerOperand(&I);
4284 if (!Ptr)
4285 continue;
4286
4287 if (isUniformMemOpUse(&I))
4288 addToWorklistIfAllowed(&I);
4289
4290 if (isVectorizedMemAccessUse(&I, Ptr))
4291 HasUniformUse.insert(Ptr);
4292 }
4293
4294 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4295 // demanding) users. Since loops are assumed to be in LCSSA form, this
4296 // disallows uses outside the loop as well.
4297 for (auto *V : HasUniformUse) {
4298 if (isOutOfScope(V))
4299 continue;
4300 auto *I = cast<Instruction>(V);
4301 auto UsersAreMemAccesses =
4302 llvm::all_of(I->users(), [&](User *U) -> bool {
4303 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4304 });
4305 if (UsersAreMemAccesses)
4306 addToWorklistIfAllowed(I);
4307 }
4308
4309 // Expand Worklist in topological order: whenever a new instruction
4310 // is added, its users should already be inside the Worklist. This ensures
4311 // that a uniform instruction will only be used by uniform instructions.
4312 unsigned idx = 0;
4313 while (idx != Worklist.size()) {
4314 Instruction *I = Worklist[idx++];
4315
4316 for (auto *OV : I->operand_values()) {
4317 // isOutOfScope operands cannot be uniform instructions.
4318 if (isOutOfScope(OV))
4319 continue;
4320 // First-order recurrence PHIs should typically be considered
4321 // non-uniform.
4322 auto *OP = dyn_cast<PHINode>(OV);
4323 if (OP && Legal->isFixedOrderRecurrence(OP))
4324 continue;
4325 // If all the users of the operand are uniform, then add the
4326 // operand into the uniform worklist.
4327 auto *OI = cast<Instruction>(OV);
4328 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4329 auto *J = cast<Instruction>(U);
4330 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4331 }))
4332 addToWorklistIfAllowed(OI);
4333 }
4334 }
4335
4336 // For an instruction to be added into Worklist above, all its users inside
4337 // the loop should also be in Worklist. However, this condition cannot be
4338 // true for phi nodes that form a cyclic dependence. We must process phi
4339 // nodes separately. An induction variable will remain uniform if all users
4340 // of the induction variable and induction variable update remain uniform.
4341 // The code below handles both pointer and non-pointer induction variables.
4342 for (const auto &Induction : Legal->getInductionVars()) {
4343 auto *Ind = Induction.first;
4344 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4345
4346 // Determine if all users of the induction variable are uniform after
4347 // vectorization.
4348 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4349 auto *I = cast<Instruction>(U);
4350 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4351 isVectorizedMemAccessUse(I, Ind);
4352 });
4353 if (!UniformInd)
4354 continue;
4355
4356 // Determine if all users of the induction variable update instruction are
4357 // uniform after vectorization.
4358 auto UniformIndUpdate =
4359 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4360 auto *I = cast<Instruction>(U);
4361 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4362 isVectorizedMemAccessUse(I, IndUpdate);
4363 });
4364 if (!UniformIndUpdate)
4365 continue;
4366
4367 // The induction variable and its update instruction will remain uniform.
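// For instance, for a canonical induction
//   %iv      = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
//   %iv.next = add nuw i64 %iv, 1
// where the other users of %iv are address computations that are already in
// the worklist and %iv.next only feeds %iv and the latch compare, both checks
// above succeed and the pair is added below.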
4368 addToWorklistIfAllowed(Ind); 4369 addToWorklistIfAllowed(IndUpdate); 4370 } 4371 4372 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4373 } 4374 4375 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4376 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4377 4378 if (Legal->getRuntimePointerChecking()->Need) { 4379 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4380 "runtime pointer checks needed. Enable vectorization of this " 4381 "loop with '#pragma clang loop vectorize(enable)' when " 4382 "compiling with -Os/-Oz", 4383 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4384 return true; 4385 } 4386 4387 if (!PSE.getPredicate().isAlwaysTrue()) { 4388 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4389 "runtime SCEV checks needed. Enable vectorization of this " 4390 "loop with '#pragma clang loop vectorize(enable)' when " 4391 "compiling with -Os/-Oz", 4392 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4393 return true; 4394 } 4395 4396 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4397 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4398 reportVectorizationFailure("Runtime stride check for small trip count", 4399 "runtime stride == 1 checks needed. Enable vectorization of " 4400 "this loop without such check by compiling with -Os/-Oz", 4401 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4402 return true; 4403 } 4404 4405 return false; 4406 } 4407 4408 ElementCount 4409 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4410 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4411 return ElementCount::getScalable(0); 4412 4413 if (Hints->isScalableVectorizationDisabled()) { 4414 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4415 "ScalableVectorizationDisabled", ORE, TheLoop); 4416 return ElementCount::getScalable(0); 4417 } 4418 4419 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4420 4421 auto MaxScalableVF = ElementCount::getScalable( 4422 std::numeric_limits<ElementCount::ScalarTy>::max()); 4423 4424 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4425 // FIXME: While for scalable vectors this is currently sufficient, this should 4426 // be replaced by a more detailed mechanism that filters out specific VFs, 4427 // instead of invalidating vectorization for a whole set of VFs based on the 4428 // MaxVF. 4429 4430 // Disable scalable vectorization if the loop contains unsupported reductions. 4431 if (!canVectorizeReductions(MaxScalableVF)) { 4432 reportVectorizationInfo( 4433 "Scalable vectorization not supported for the reduction " 4434 "operations found in this loop.", 4435 "ScalableVFUnfeasible", ORE, TheLoop); 4436 return ElementCount::getScalable(0); 4437 } 4438 4439 // Disable scalable vectorization if the loop contains any instructions 4440 // with element types not supported for scalable vectors. 4441 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4442 return !Ty->isVoidTy() && 4443 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4444 })) { 4445 reportVectorizationInfo("Scalable vectorization is not supported " 4446 "for all element types found in this loop.", 4447 "ScalableVFUnfeasible", ORE, TheLoop); 4448 return ElementCount::getScalable(0); 4449 } 4450 4451 if (Legal->isSafeForAnyVectorWidth()) 4452 return MaxScalableVF; 4453 4454 // Limit MaxScalableVF by the maximum safe dependence distance. 
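// As a rough example: with MaxSafeElements == 32 and a maximum vscale of 16,
// the clamp below yields ElementCount::getScalable(2), i.e. <vscale x 2>
// elements, since vscale x 2 can never exceed the 32-element dependence
// limit at runtime.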
4455 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI)) 4456 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); 4457 else 4458 MaxScalableVF = ElementCount::getScalable(0); 4459 4460 if (!MaxScalableVF) 4461 reportVectorizationInfo( 4462 "Max legal vector width too small, scalable vectorization " 4463 "unfeasible.", 4464 "ScalableVFUnfeasible", ORE, TheLoop); 4465 4466 return MaxScalableVF; 4467 } 4468 4469 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4470 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4471 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4472 unsigned SmallestType, WidestType; 4473 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4474 4475 // Get the maximum safe dependence distance in bits computed by LAA. 4476 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4477 // the memory accesses that is most restrictive (involved in the smallest 4478 // dependence distance). 4479 unsigned MaxSafeElements = 4480 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4481 4482 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4483 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4484 4485 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4486 << ".\n"); 4487 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4488 << ".\n"); 4489 4490 // First analyze the UserVF, fall back if the UserVF should be ignored. 4491 if (UserVF) { 4492 auto MaxSafeUserVF = 4493 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4494 4495 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4496 // If `VF=vscale x N` is safe, then so is `VF=N` 4497 if (UserVF.isScalable()) 4498 return FixedScalableVFPair( 4499 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4500 else 4501 return UserVF; 4502 } 4503 4504 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4505 4506 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4507 // is better to ignore the hint and let the compiler choose a suitable VF. 4508 if (!UserVF.isScalable()) { 4509 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4510 << " is unsafe, clamping to max safe VF=" 4511 << MaxSafeFixedVF << ".\n"); 4512 ORE->emit([&]() { 4513 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4514 TheLoop->getStartLoc(), 4515 TheLoop->getHeader()) 4516 << "User-specified vectorization factor " 4517 << ore::NV("UserVectorizationFactor", UserVF) 4518 << " is unsafe, clamping to maximum safe vectorization factor " 4519 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4520 }); 4521 return MaxSafeFixedVF; 4522 } 4523 4524 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4525 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4526 << " is ignored because scalable vectors are not " 4527 "available.\n"); 4528 ORE->emit([&]() { 4529 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4530 TheLoop->getStartLoc(), 4531 TheLoop->getHeader()) 4532 << "User-specified vectorization factor " 4533 << ore::NV("UserVectorizationFactor", UserVF) 4534 << " is ignored because the target does not support scalable " 4535 "vectors. The compiler will pick a more suitable value."; 4536 }); 4537 } else { 4538 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4539 << " is unsafe. 
Ignoring scalable UserVF.\n");
4540 ORE->emit([&]() {
4541 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4542 TheLoop->getStartLoc(),
4543 TheLoop->getHeader())
4544 << "User-specified vectorization factor "
4545 << ore::NV("UserVectorizationFactor", UserVF)
4546 << " is unsafe. Ignoring the hint to let the compiler pick a "
4547 "more suitable value.";
4548 });
4549 }
4550 }
4551
4552 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4553 << " / " << WidestType << " bits.\n");
4554
4555 FixedScalableVFPair Result(ElementCount::getFixed(1),
4556 ElementCount::getScalable(0));
4557 if (auto MaxVF =
4558 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4559 MaxSafeFixedVF, FoldTailByMasking))
4560 Result.FixedVF = MaxVF;
4561
4562 if (auto MaxVF =
4563 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4564 MaxSafeScalableVF, FoldTailByMasking))
4565 if (MaxVF.isScalable()) {
4566 Result.ScalableVF = MaxVF;
4567 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4568 << "\n");
4569 }
4570
4571 return Result;
4572 }
4573
4574 FixedScalableVFPair
4575 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4576 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4577 // TODO: It may be useful to do this, since it's still likely to be
4578 // dynamically uniform if the target can skip.
4579 reportVectorizationFailure(
4580 "Not inserting runtime ptr check for divergent target",
4581 "runtime pointer checks needed. Not enabled for divergent target",
4582 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4583 return FixedScalableVFPair::getNone();
4584 }
4585
4586 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4587 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4588 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4589 if (TC == 1) {
4590 reportVectorizationFailure("Single iteration (non) loop",
4591 "loop trip count is one, irrelevant for vectorization",
4592 "SingleIterationLoop", ORE, TheLoop);
4593 return FixedScalableVFPair::getNone();
4594 }
4595
4596 switch (ScalarEpilogueStatus) {
4597 case CM_ScalarEpilogueAllowed:
4598 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4599 case CM_ScalarEpilogueNotAllowedUsePredicate:
4600 [[fallthrough]];
4601 case CM_ScalarEpilogueNotNeededUsePredicate:
4602 LLVM_DEBUG(
4603 dbgs() << "LV: vector predicate hint/switch found.\n"
4604 << "LV: Not allowing scalar epilogue, creating predicated "
4605 << "vector loop.\n");
4606 break;
4607 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4608 // fallthrough as a special case of OptForSize
4609 case CM_ScalarEpilogueNotAllowedOptSize:
4610 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4611 LLVM_DEBUG(
4612 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4613 else
4614 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4615 << "count.\n");
4616
4617 // Bail if runtime checks are required, which are not good when optimising
4618 // for size.
4619 if (runtimeChecksRequired())
4620 return FixedScalableVFPair::getNone();
4621
4622 break;
4623 }
4624
4625 // The only loops we can vectorize without a scalar epilogue are loops with
4626 // a bottom-test and a single exiting block. We'd have to handle the fact
4627 // that not every instruction executes on the last iteration. This will
4628 // require a lane mask which varies through the vector loop body.
(TODO)
4629 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4630 // If there was a tail-folding hint/switch, but we can't fold the tail by
4631 // masking, fall back to a vectorization with a scalar epilogue.
4632 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4633 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4634 "scalar epilogue instead.\n");
4635 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4636 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4637 }
4638 return FixedScalableVFPair::getNone();
4639 }
4640
4641 // Now try the tail folding.
4642
4643 // Invalidate interleave groups that require an epilogue if we can't mask
4644 // the interleave-group.
4645 if (!useMaskedInterleavedAccesses(TTI)) {
4646 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4647 "No decisions should have been taken at this point");
4648 // Note: There is no need to invalidate any cost modeling decisions here, as
4649 // none were taken so far.
4650 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4651 }
4652
4653 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4654
4655 // Avoid tail folding if the trip count is known to be a multiple of any VF
4656 // we choose.
4657 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4658 MaxFactors.FixedVF.getFixedValue();
4659 if (MaxFactors.ScalableVF) {
4660 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4661 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4662 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4663 *MaxPowerOf2RuntimeVF,
4664 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4665 } else
4666 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4667 }
4668
4669 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4670 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4671 "MaxFixedVF must be a power of 2");
4672 unsigned MaxVFtimesIC =
4673 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4674 ScalarEvolution *SE = PSE.getSE();
4675 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4676 const SCEV *ExitCount = SE->getAddExpr(
4677 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4678 const SCEV *Rem = SE->getURemExpr(
4679 SE->applyLoopGuards(ExitCount, TheLoop),
4680 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4681 if (Rem->isZero()) {
4682 // Accept MaxFixedVF if we do not have a tail.
4683 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4684 return MaxFactors;
4685 }
4686 }
4687
4688 // If we don't know the precise trip count, or if the trip count that we
4689 // found modulo the vectorization factor is not zero, try to fold the tail
4690 // by masking.
4691 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4692 if (Legal->prepareToFoldTailByMasking()) {
4693 CanFoldTailByMasking = true;
4694 return MaxFactors;
4695 }
4696
4697 // If there was a tail-folding hint/switch, but we can't fold the tail by
4698 // masking, fall back to a vectorization with a scalar epilogue.
4699 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4700 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4701 "scalar epilogue instead.\n"); 4702 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4703 return MaxFactors; 4704 } 4705 4706 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 4707 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 4708 return FixedScalableVFPair::getNone(); 4709 } 4710 4711 if (TC == 0) { 4712 reportVectorizationFailure( 4713 "Unable to calculate the loop count due to complex control flow", 4714 "unable to calculate the loop count due to complex control flow", 4715 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4716 return FixedScalableVFPair::getNone(); 4717 } 4718 4719 reportVectorizationFailure( 4720 "Cannot optimize for size and vectorize at the same time.", 4721 "cannot optimize for size and vectorize at the same time. " 4722 "Enable vectorization of this loop with '#pragma clang loop " 4723 "vectorize(enable)' when compiling with -Os/-Oz", 4724 "NoTailLoopWithOptForSize", ORE, TheLoop); 4725 return FixedScalableVFPair::getNone(); 4726 } 4727 4728 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 4729 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, 4730 ElementCount MaxSafeVF, bool FoldTailByMasking) { 4731 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 4732 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 4733 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4734 : TargetTransformInfo::RGK_FixedWidthVector); 4735 4736 // Convenience function to return the minimum of two ElementCounts. 4737 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 4738 assert((LHS.isScalable() == RHS.isScalable()) && 4739 "Scalable flags must match"); 4740 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 4741 }; 4742 4743 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 4744 // Note that both WidestRegister and WidestType may not be a powers of 2. 4745 auto MaxVectorElementCount = ElementCount::get( 4746 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), 4747 ComputeScalableMaxVF); 4748 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 4749 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4750 << (MaxVectorElementCount * WidestType) << " bits.\n"); 4751 4752 if (!MaxVectorElementCount) { 4753 LLVM_DEBUG(dbgs() << "LV: The target has no " 4754 << (ComputeScalableMaxVF ? "scalable" : "fixed") 4755 << " vector registers.\n"); 4756 return ElementCount::getFixed(1); 4757 } 4758 4759 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 4760 if (MaxVectorElementCount.isScalable() && 4761 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 4762 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 4763 auto Min = Attr.getVScaleRangeMin(); 4764 WidestRegisterMinEC *= Min; 4765 } 4766 4767 // When a scalar epilogue is required, at least one iteration of the scalar 4768 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a 4769 // max VF that results in a dead vector loop. 
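// For example, if the maximum trip count is 8 and a scalar epilogue must run
// at least once, only 7 iterations remain for the vector loop; without the
// adjustment below a VF of 8 could be selected and the vector loop would
// never execute.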
4770 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4771 MaxTripCount -= 1;
4772
4773 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4774 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4775 // If the upper bound of the loop trip count (TC) is known at compile time,
4776 // there is no point in choosing a VF greater than TC (as done in the loop
4777 // below). Select the maximum power of two which doesn't exceed TC. If
4778 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
4779 // the TC is less than or equal to the known number of lanes.
4780 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4781 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4782 "exceeding the constant trip count: "
4783 << ClampedUpperTripCount << "\n");
4784 return ElementCount::get(
4785 ClampedUpperTripCount,
4786 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4787 }
4788
4789 TargetTransformInfo::RegisterKind RegKind =
4790 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4791 : TargetTransformInfo::RGK_FixedWidthVector;
4792 ElementCount MaxVF = MaxVectorElementCount;
4793 if (MaximizeBandwidth ||
4794 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4795 (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4796 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4797 auto MaxVectorElementCountMaxBW = ElementCount::get(
4798 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4799 ComputeScalableMaxVF);
4800 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4801
4802 // Collect all viable vectorization factors larger than the default MaxVF
4803 // (i.e. MaxVectorElementCount).
4804 SmallVector<ElementCount, 8> VFs;
4805 for (ElementCount VS = MaxVectorElementCount * 2;
4806 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4807 VFs.push_back(VS);
4808
4809 // For each VF calculate its register usage.
4810 auto RUs = calculateRegisterUsage(VFs);
4811
4812 // Select the largest VF which doesn't require more registers than existing
4813 // ones.
4814 for (int i = RUs.size() - 1; i >= 0; --i) {
4815 bool Selected = true;
4816 for (auto &pair : RUs[i].MaxLocalUsers) {
4817 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4818 if (pair.second > TargetNumRegisters)
4819 Selected = false;
4820 }
4821 if (Selected) {
4822 MaxVF = VFs[i];
4823 break;
4824 }
4825 }
4826 if (ElementCount MinVF =
4827 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4828 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4829 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4830 << ") with target's minimum: " << MinVF << '\n');
4831 MaxVF = MinVF;
4832 }
4833 }
4834
4835 // Invalidate any widening decisions we might have made, in case the loop
4836 // requires predication (decided later), but we have already made some
4837 // load/store widening decisions.
4838 invalidateCostModelingDecisions();
4839 }
4840 return MaxVF;
4841 }
4842
4843 /// Convenience function that returns the value of vscale_range iff
4844 /// vscale_range.min == vscale_range.max, and otherwise returns the value
4845 /// returned by the corresponding TTI method.
4846 static std::optional<unsigned> 4847 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 4848 const Function *Fn = L->getHeader()->getParent(); 4849 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 4850 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 4851 auto Min = Attr.getVScaleRangeMin(); 4852 auto Max = Attr.getVScaleRangeMax(); 4853 if (Max && Min == Max) 4854 return Max; 4855 } 4856 4857 return TTI.getVScaleForTuning(); 4858 } 4859 4860 bool LoopVectorizationPlanner::isMoreProfitable( 4861 const VectorizationFactor &A, const VectorizationFactor &B) const { 4862 InstructionCost CostA = A.Cost; 4863 InstructionCost CostB = B.Cost; 4864 4865 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); 4866 4867 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { 4868 // If the trip count is a known (possibly small) constant, the trip count 4869 // will be rounded up to an integer number of iterations under 4870 // FoldTailByMasking. The total cost in that case will be 4871 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 4872 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 4873 // some extra overheads, but for the purpose of comparing the costs of 4874 // different VFs we can use this to compare the total loop-body cost 4875 // expected after vectorization. 4876 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 4877 InstructionCost VectorCost, 4878 InstructionCost ScalarCost) { 4879 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) 4880 : VectorCost * (MaxTripCount / VF) + 4881 ScalarCost * (MaxTripCount % VF); 4882 }; 4883 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); 4884 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); 4885 4886 return RTCostA < RTCostB; 4887 } 4888 4889 // Improve estimate for the vector width if it is scalable. 4890 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 4891 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 4892 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 4893 if (A.Width.isScalable()) 4894 EstimatedWidthA *= *VScale; 4895 if (B.Width.isScalable()) 4896 EstimatedWidthB *= *VScale; 4897 } 4898 4899 // Assume vscale may be larger than 1 (or the value being tuned for), 4900 // so that scalable vectorization is slightly favorable over fixed-width 4901 // vectorization. 4902 if (A.Width.isScalable() && !B.Width.isScalable()) 4903 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 4904 4905 // To avoid the need for FP division: 4906 // (CostA / A.Width) < (CostB / B.Width) 4907 // <=> (CostA * B.Width) < (CostB * A.Width) 4908 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 4909 } 4910 4911 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, 4912 OptimizationRemarkEmitter *ORE, 4913 Loop *TheLoop) { 4914 if (InvalidCosts.empty()) 4915 return; 4916 4917 // Emit a report of VFs with invalid costs in the loop. 4918 4919 // Group the remarks per instruction, keeping the instruction order from 4920 // InvalidCosts. 4921 std::map<Instruction *, unsigned> Numbering; 4922 unsigned I = 0; 4923 for (auto &Pair : InvalidCosts) 4924 if (!Numbering.count(Pair.first)) 4925 Numbering[Pair.first] = I++; 4926 4927 // Sort the list, first on instruction(number) then on VF. 
4928 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 4929 if (Numbering[A.first] != Numbering[B.first]) 4930 return Numbering[A.first] < Numbering[B.first]; 4931 ElementCountComparator ECC; 4932 return ECC(A.second, B.second); 4933 }); 4934 4935 // For a list of ordered instruction-vf pairs: 4936 // [(load, vf1), (load, vf2), (store, vf1)] 4937 // Group the instructions together to emit separate remarks for: 4938 // load (vf1, vf2) 4939 // store (vf1) 4940 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 4941 auto Subset = ArrayRef<InstructionVFPair>(); 4942 do { 4943 if (Subset.empty()) 4944 Subset = Tail.take_front(1); 4945 4946 Instruction *I = Subset.front().first; 4947 4948 // If the next instruction is different, or if there are no other pairs, 4949 // emit a remark for the collated subset. e.g. 4950 // [(load, vf1), (load, vf2))] 4951 // to emit: 4952 // remark: invalid costs for 'load' at VF=(vf, vf2) 4953 if (Subset == Tail || Tail[Subset.size()].first != I) { 4954 std::string OutString; 4955 raw_string_ostream OS(OutString); 4956 assert(!Subset.empty() && "Unexpected empty range"); 4957 OS << "Instruction with invalid costs prevented vectorization at VF=("; 4958 for (const auto &Pair : Subset) 4959 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; 4960 OS << "):"; 4961 if (auto *CI = dyn_cast<CallInst>(I)) 4962 OS << " call to " << CI->getCalledFunction()->getName(); 4963 else 4964 OS << " " << I->getOpcodeName(); 4965 OS.flush(); 4966 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 4967 Tail = Tail.drop_front(Subset.size()); 4968 Subset = {}; 4969 } else 4970 // Grow the subset by one element 4971 Subset = Tail.take_front(Subset.size() + 1); 4972 } while (!Tail.empty()); 4973 } 4974 4975 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( 4976 const ElementCountSet &VFCandidates) { 4977 InstructionCost ExpectedCost = 4978 CM.expectedCost(ElementCount::getFixed(1)).first; 4979 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 4980 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 4981 assert(VFCandidates.count(ElementCount::getFixed(1)) && 4982 "Expected Scalar VF to be a candidate"); 4983 4984 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 4985 ExpectedCost); 4986 VectorizationFactor ChosenFactor = ScalarCost; 4987 4988 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 4989 if (ForceVectorization && VFCandidates.size() > 1) { 4990 // Ignore scalar width, because the user explicitly wants vectorization. 4991 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 4992 // evaluation. 4993 ChosenFactor.Cost = InstructionCost::getMax(); 4994 } 4995 4996 SmallVector<InstructionVFPair> InvalidCosts; 4997 for (const auto &i : VFCandidates) { 4998 // The cost for scalar VF=1 is already calculated, so ignore it. 4999 if (i.isScalar()) 5000 continue; 5001 5002 LoopVectorizationCostModel::VectorizationCostTy C = 5003 CM.expectedCost(i, &InvalidCosts); 5004 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5005 5006 #ifndef NDEBUG 5007 unsigned AssumedMinimumVscale = 1; 5008 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 5009 AssumedMinimumVscale = *VScale; 5010 unsigned Width = 5011 Candidate.Width.isScalable() 5012 ? 
Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5013 : Candidate.Width.getFixedValue();
5014 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5015 << " costs: " << (Candidate.Cost / Width));
5016 if (i.isScalable())
5017 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5018 << AssumedMinimumVscale << ")");
5019 LLVM_DEBUG(dbgs() << ".\n");
5020 #endif
5021
5022 if (!C.second && !ForceVectorization) {
5023 LLVM_DEBUG(
5024 dbgs() << "LV: Not considering vector loop of width " << i
5025 << " because it will not generate any vector instructions.\n");
5026 continue;
5027 }
5028
5029 // If profitable, add it to the ProfitableVFs list.
5030 if (isMoreProfitable(Candidate, ScalarCost))
5031 ProfitableVFs.push_back(Candidate);
5032
5033 if (isMoreProfitable(Candidate, ChosenFactor))
5034 ChosenFactor = Candidate;
5035 }
5036
5037 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5038
5039 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
5040 reportVectorizationFailure(
5041 "There are conditional stores.",
5042 "store that is conditionally executed prevents vectorization",
5043 "ConditionalStore", ORE, OrigLoop);
5044 ChosenFactor = ScalarCost;
5045 }
5046
5047 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5048 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5049 << "LV: Vectorization seems to be not beneficial, "
5050 << "but was forced by a user.\n");
5051 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5052 return ChosenFactor;
5053 }
5054
5055 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5056 ElementCount VF) const {
5057 // Cross iteration phis such as reductions need special handling and are
5058 // currently unsupported.
5059 if (any_of(OrigLoop->getHeader()->phis(),
5060 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5061 return false;
5062
5063 // Phis with uses outside of the loop require special handling and are
5064 // currently unsupported.
5065 for (const auto &Entry : Legal->getInductionVars()) {
5066 // Look for uses of the value of the induction at the last iteration.
5067 Value *PostInc =
5068 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5069 for (User *U : PostInc->users())
5070 if (!OrigLoop->contains(cast<Instruction>(U)))
5071 return false;
5072 // Look for uses of the penultimate value of the induction.
5073 for (User *U : Entry.first->users())
5074 if (!OrigLoop->contains(cast<Instruction>(U)))
5075 return false;
5076 }
5077
5078 // Epilogue vectorization code has not been audited to ensure it handles
5079 // non-latch exits properly. It may be fine, but it needs to be audited and
5080 // tested.
5081 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5082 return false;
5083
5084 return true;
5085 }
5086
5087 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5088 const ElementCount VF) const {
5089 // FIXME: We need a much better cost-model to take different parameters such
5090 // as register pressure, code size increase and cost of extra branches into
5091 // account. For now we apply a very crude heuristic and only consider loops
5092 // with vectorization factors larger than a certain value.
5093
5094 // Allow the target to opt out entirely.
5095 if (!TTI.preferEpilogueVectorization())
5096 return false;
5097
5098 // We also consider epilogue vectorization unprofitable for targets that don't
5099 // consider interleaving beneficial (e.g. MVE).
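// As a sketch of the check below: for a main loop VF of vscale x 4 with a
// tuning vscale of 2, the estimated width is 8; epilogue vectorization is
// only considered profitable if that estimate reaches the (configurable)
// EpilogueVectorizationMinVF threshold.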
5100 if (TTI.getMaxInterleaveFactor(VF) <= 1) 5101 return false; 5102 5103 unsigned Multiplier = 1; 5104 if (VF.isScalable()) 5105 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); 5106 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) 5107 return true; 5108 return false; 5109 } 5110 5111 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( 5112 const ElementCount MainLoopVF, unsigned IC) { 5113 VectorizationFactor Result = VectorizationFactor::Disabled(); 5114 if (!EnableEpilogueVectorization) { 5115 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); 5116 return Result; 5117 } 5118 5119 if (!CM.isScalarEpilogueAllowed()) { 5120 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " 5121 "epilogue is allowed.\n"); 5122 return Result; 5123 } 5124 5125 // Not really a cost consideration, but check for unsupported cases here to 5126 // simplify the logic. 5127 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 5128 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 5129 "is not a supported candidate.\n"); 5130 return Result; 5131 } 5132 5133 if (EpilogueVectorizationForceVF > 1) { 5134 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 5135 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5136 if (hasPlanWithVF(ForcedEC)) 5137 return {ForcedEC, 0, 0}; 5138 else { 5139 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 5140 "viable.\n"); 5141 return Result; 5142 } 5143 } 5144 5145 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 5146 OrigLoop->getHeader()->getParent()->hasMinSize()) { 5147 LLVM_DEBUG( 5148 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 5149 return Result; 5150 } 5151 5152 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { 5153 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5154 "this loop\n"); 5155 return Result; 5156 } 5157 5158 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5159 // the main loop handles 8 lanes per iteration. We could still benefit from 5160 // vectorizing the epilogue loop with VF=4. 5161 ElementCount EstimatedRuntimeVF = MainLoopVF; 5162 if (MainLoopVF.isScalable()) { 5163 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5164 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 5165 EstimatedRuntimeVF *= *VScale; 5166 } 5167 5168 ScalarEvolution &SE = *PSE.getSE(); 5169 Type *TCType = Legal->getWidestInductionType(); 5170 const SCEV *RemainingIterations = nullptr; 5171 for (auto &NextVF : ProfitableVFs) { 5172 // Skip candidate VFs without a corresponding VPlan. 5173 if (!hasPlanWithVF(NextVF.Width)) 5174 continue; 5175 5176 // Skip candidate VFs with widths >= the estimate runtime VF (scalable 5177 // vectors) or the VF of the main loop (fixed vectors). 5178 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5179 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 5180 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) 5181 continue; 5182 5183 // If NextVF is greater than the number of remaining iterations, the 5184 // epilogue loop would be dead. Skip such factors. 5185 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 5186 // TODO: extend to support scalable VFs. 
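// Rough example: with a trip count of 100, MainLoopVF = 8 and IC = 2, the
// main vector loop covers 96 iterations and leaves 4; a candidate epilogue
// VF of 8 would then be provably dead and is skipped, while VF = 4 is still
// considered.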
5187 if (!RemainingIterations) { 5188 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); 5189 RemainingIterations = SE.getURemExpr( 5190 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 5191 } 5192 if (SE.isKnownPredicate( 5193 CmpInst::ICMP_UGT, 5194 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 5195 RemainingIterations)) 5196 continue; 5197 } 5198 5199 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) 5200 Result = NextVF; 5201 } 5202 5203 if (Result != VectorizationFactor::Disabled()) 5204 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5205 << Result.Width << "\n"); 5206 return Result; 5207 } 5208 5209 std::pair<unsigned, unsigned> 5210 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5211 unsigned MinWidth = -1U; 5212 unsigned MaxWidth = 8; 5213 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5214 // For in-loop reductions, no element types are added to ElementTypesInLoop 5215 // if there are no loads/stores in the loop. In this case, check through the 5216 // reduction variables to determine the maximum width. 5217 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5218 // Reset MaxWidth so that we can find the smallest type used by recurrences 5219 // in the loop. 5220 MaxWidth = -1U; 5221 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 5222 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5223 // When finding the min width used by the recurrence we need to account 5224 // for casts on the input operands of the recurrence. 5225 MaxWidth = std::min<unsigned>( 5226 MaxWidth, std::min<unsigned>( 5227 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5228 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5229 } 5230 } else { 5231 for (Type *T : ElementTypesInLoop) { 5232 MinWidth = std::min<unsigned>( 5233 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5234 MaxWidth = std::max<unsigned>( 5235 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5236 } 5237 } 5238 return {MinWidth, MaxWidth}; 5239 } 5240 5241 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5242 ElementTypesInLoop.clear(); 5243 // For each block. 5244 for (BasicBlock *BB : TheLoop->blocks()) { 5245 // For each instruction in the loop. 5246 for (Instruction &I : BB->instructionsWithoutDebug()) { 5247 Type *T = I.getType(); 5248 5249 // Skip ignored values. 5250 if (ValuesToIgnore.count(&I)) 5251 continue; 5252 5253 // Only examine Loads, Stores and PHINodes. 5254 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5255 continue; 5256 5257 // Examine PHI nodes that are reduction variables. Update the type to 5258 // account for the recurrence type. 5259 if (auto *PN = dyn_cast<PHINode>(&I)) { 5260 if (!Legal->isReductionVariable(PN)) 5261 continue; 5262 const RecurrenceDescriptor &RdxDesc = 5263 Legal->getReductionVars().find(PN)->second; 5264 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5265 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5266 RdxDesc.getRecurrenceType(), 5267 TargetTransformInfo::ReductionFlags())) 5268 continue; 5269 T = RdxDesc.getRecurrenceType(); 5270 } 5271 5272 // Examine the stored values. 
5273 if (auto *ST = dyn_cast<StoreInst>(&I)) 5274 T = ST->getValueOperand()->getType(); 5275 5276 assert(T->isSized() && 5277 "Expected the load/store/recurrence type to be sized"); 5278 5279 ElementTypesInLoop.insert(T); 5280 } 5281 } 5282 } 5283 5284 unsigned 5285 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5286 InstructionCost LoopCost) { 5287 // -- The interleave heuristics -- 5288 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5289 // There are many micro-architectural considerations that we can't predict 5290 // at this level. For example, frontend pressure (on decode or fetch) due to 5291 // code size, or the number and capabilities of the execution ports. 5292 // 5293 // We use the following heuristics to select the interleave count: 5294 // 1. If the code has reductions, then we interleave to break the cross 5295 // iteration dependency. 5296 // 2. If the loop is really small, then we interleave to reduce the loop 5297 // overhead. 5298 // 3. We don't interleave if we think that we will spill registers to memory 5299 // due to the increased register pressure. 5300 5301 if (!isScalarEpilogueAllowed()) 5302 return 1; 5303 5304 // We used the distance for the interleave count. 5305 if (!Legal->isSafeForAnyVectorWidth()) 5306 return 1; 5307 5308 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5309 const bool HasReductions = !Legal->getReductionVars().empty(); 5310 // Do not interleave loops with a relatively small known or estimated trip 5311 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5312 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5313 // because with the above conditions interleaving can expose ILP and break 5314 // cross iteration dependences for reductions. 5315 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5316 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5317 return 1; 5318 5319 // If we did not calculate the cost for VF (because the user selected the VF) 5320 // then we calculate the cost of VF here. 5321 if (LoopCost == 0) { 5322 LoopCost = expectedCost(VF).first; 5323 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 5324 5325 // Loop body is free and there is no need for interleaving. 5326 if (LoopCost == 0) 5327 return 1; 5328 } 5329 5330 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5331 // We divide by these constants so assume that we have at least one 5332 // instruction that uses at least one register. 5333 for (auto& pair : R.MaxLocalUsers) { 5334 pair.second = std::max(pair.second, 1U); 5335 } 5336 5337 // We calculate the interleave count using the following formula. 5338 // Subtract the number of loop invariants from the number of available 5339 // registers. These registers are used by all of the interleaved instances. 5340 // Next, divide the remaining registers by the number of registers that is 5341 // required by the loop, in order to estimate how many parallel instances 5342 // fit without causing spills. All of this is rounded down if necessary to be 5343 // a power of two. We want power of two interleave count to simplify any 5344 // addressing operations or alignment considerations. 5345 // We also want power of two interleave counts to ensure that the induction 5346 // variable of the vector loop wraps to zero, when tail is folded by masking; 5347 // this currently happens when OptForSize, in which case IC is set to 1 above. 
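// A rough worked example of the formula below: with 32 vector registers, 2
// loop-invariant values and a peak of 10 live values in the loop, the basic
// estimate is bit_floor((32 - 2) / 10) = 2 parallel instances; with the
// induction-variable heuristic enabled it is bit_floor((32 - 2 - 1) / 9) = 2.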
5348 unsigned IC = UINT_MAX; 5349 5350 for (auto& pair : R.MaxLocalUsers) { 5351 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5352 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5353 << " registers of " 5354 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5355 if (VF.isScalar()) { 5356 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5357 TargetNumRegisters = ForceTargetNumScalarRegs; 5358 } else { 5359 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5360 TargetNumRegisters = ForceTargetNumVectorRegs; 5361 } 5362 unsigned MaxLocalUsers = pair.second; 5363 unsigned LoopInvariantRegs = 0; 5364 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5365 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5366 5367 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 5368 MaxLocalUsers); 5369 // Don't count the induction variable as interleaved. 5370 if (EnableIndVarRegisterHeur) { 5371 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5372 std::max(1U, (MaxLocalUsers - 1))); 5373 } 5374 5375 IC = std::min(IC, TmpIC); 5376 } 5377 5378 // Clamp the interleave ranges to reasonable counts. 5379 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5380 5381 // Check if the user has overridden the max. 5382 if (VF.isScalar()) { 5383 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5384 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5385 } else { 5386 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5387 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5388 } 5389 5390 unsigned EstimatedVF = VF.getKnownMinValue(); 5391 if (VF.isScalable()) { 5392 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI)) 5393 EstimatedVF *= *VScale; 5394 } 5395 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); 5396 5397 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5398 if (KnownTC) { 5399 // If trip count is known we select between two prospective ICs, where 5400 // 1) the aggressive IC is capped by the trip count divided by VF 5401 // 2) the conservative IC is capped by the trip count divided by (VF * 2) 5402 // The final IC is selected in a way that the epilogue loop trip count is 5403 // minimized while maximizing the IC itself, so that we either run the 5404 // vector loop at least once if it generates a small epilogue loop, or else 5405 // we run the vector loop at least twice. 
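// Illustrative numbers: with a known trip count of 64, an estimated VF of 8
// and a target maximum of 8, the aggressive bound below is
// bit_floor(min(64 / 8, 8)) = 8 and the conservative one is
// bit_floor(min(64 / 16, 8)) = 4; both leave a zero-iteration scalar tail,
// so the larger value is kept.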
5406 5407 unsigned InterleaveCountUB = bit_floor( 5408 std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount))); 5409 unsigned InterleaveCountLB = bit_floor(std::max( 5410 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount))); 5411 MaxInterleaveCount = InterleaveCountLB; 5412 5413 if (InterleaveCountUB != InterleaveCountLB) { 5414 unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB)); 5415 unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB)); 5416 // If both produce same scalar tail, maximize the IC to do the same work 5417 // in fewer vector loop iterations 5418 if (TailTripCountUB == TailTripCountLB) 5419 MaxInterleaveCount = InterleaveCountUB; 5420 } 5421 } else if (BestKnownTC) { 5422 // If trip count is an estimated compile time constant, limit the 5423 // IC to be capped by the trip count divided by VF * 2, such that the vector 5424 // loop runs at least twice to make interleaving seem profitable when there 5425 // is an epilogue loop present. Since exact Trip count is not known we 5426 // choose to be conservative in our IC estimate. 5427 MaxInterleaveCount = bit_floor(std::max( 5428 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount))); 5429 } 5430 5431 assert(MaxInterleaveCount > 0 && 5432 "Maximum interleave count must be greater than 0"); 5433 5434 // Clamp the calculated IC to be between the 1 and the max interleave count 5435 // that the target and trip count allows. 5436 if (IC > MaxInterleaveCount) 5437 IC = MaxInterleaveCount; 5438 else 5439 // Make sure IC is greater than 0. 5440 IC = std::max(1u, IC); 5441 5442 assert(IC > 0 && "Interleave count must be greater than 0."); 5443 5444 // Interleave if we vectorized this loop and there is a reduction that could 5445 // benefit from interleaving. 5446 if (VF.isVector() && HasReductions) { 5447 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5448 return IC; 5449 } 5450 5451 // For any scalar loop that either requires runtime checks or predication we 5452 // are better off leaving this to the unroller. Note that if we've already 5453 // vectorized the loop we will have done the runtime check and so interleaving 5454 // won't require further checks. 5455 bool ScalarInterleavingRequiresPredication = 5456 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5457 return Legal->blockNeedsPredication(BB); 5458 })); 5459 bool ScalarInterleavingRequiresRuntimePointerCheck = 5460 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5461 5462 // We want to interleave small loops in order to reduce the loop overhead and 5463 // potentially expose ILP opportunities. 5464 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5465 << "LV: IC is " << IC << '\n' 5466 << "LV: VF is " << VF << '\n'); 5467 const bool AggressivelyInterleaveReductions = 5468 TTI.enableAggressiveInterleaving(HasReductions); 5469 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5470 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5471 // We assume that the cost overhead is 1 and we use the cost model 5472 // to estimate the cost of the loop and interleave until the cost of the 5473 // loop overhead is about 5% of the cost of the loop. 5474 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5475 SmallLoopCost / *LoopCost.getValue())); 5476 5477 // Interleave until store/load ports (estimated by max interleave count) are 5478 // saturated. 
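// E.g. with IC = 8, two stores and one load in the loop, StoresIC = 4 and
// LoadsIC = 8 below; when load/store runtime interleaving is enabled and
// that maximum exceeds the small-loop IC, the larger of the two is used so
// the memory ports are kept busy.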
5479 unsigned NumStores = Legal->getNumStores();
5480 unsigned NumLoads = Legal->getNumLoads();
5481 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5482 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5483
5484 // There is little point in interleaving for reductions containing selects
5485 // and compares when VF=1 since it may just create more overhead than it's
5486 // worth for loops with small trip counts. This is because we still have to
5487 // do the final reduction after the loop.
5488 bool HasSelectCmpReductions =
5489 HasReductions &&
5490 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5491 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5492 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5493 RdxDesc.getRecurrenceKind());
5494 });
5495 if (HasSelectCmpReductions) {
5496 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5497 return 1;
5498 }
5499
5500 // If we have a scalar reduction (vector reductions are already dealt with
5501 // by this point), we can increase the critical path length if the loop
5502 // we're interleaving is inside another loop. For tree-wise reductions
5503 // set the limit to 2, and for ordered reductions it's best to disable
5504 // interleaving entirely.
5505 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5506 bool HasOrderedReductions =
5507 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5508 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5509 return RdxDesc.isOrdered();
5510 });
5511 if (HasOrderedReductions) {
5512 LLVM_DEBUG(
5513 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5514 return 1;
5515 }
5516
5517 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5518 SmallIC = std::min(SmallIC, F);
5519 StoresIC = std::min(StoresIC, F);
5520 LoadsIC = std::min(LoadsIC, F);
5521 }
5522
5523 if (EnableLoadStoreRuntimeInterleave &&
5524 std::max(StoresIC, LoadsIC) > SmallIC) {
5525 LLVM_DEBUG(
5526 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5527 return std::max(StoresIC, LoadsIC);
5528 }
5529
5530 // If there are scalar reductions and TTI has enabled aggressive
5531 // interleaving for reductions, we will interleave to expose ILP.
5532 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5533 AggressivelyInterleaveReductions) {
5534 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5535 // Interleave no less than SmallIC but not as aggressive as the normal IC
5536 // to satisfy the rare situation when resources are too limited.
5537 return std::max(IC / 2, SmallIC);
5538 } else {
5539 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5540 return SmallIC;
5541 }
5542 }
5543
5544 // Interleave if this is a large loop (small loops are already dealt with by
5545 // this point) that could benefit from interleaving.
5546 if (AggressivelyInterleaveReductions) {
5547 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5548 return IC;
5549 }
5550
5551 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5552 return 1;
5553 }
5554
5555 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5556 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5557 // This function calculates the register usage by measuring the highest number
5558 // of values that are alive at a single location. Obviously, this is a very
5559 // rough estimation. We scan the loop in topological order and
5560 // assign a number to each instruction.
We use RPO to ensure that defs are 5561 // met before their users. We assume that each instruction that has in-loop 5562 // users starts an interval. We record every time that an in-loop value is 5563 // used, so we have a list of the first and last occurrences of each 5564 // instruction. Next, we transpose this data structure into a multi map that 5565 // holds the list of intervals that *end* at a specific location. This multi 5566 // map allows us to perform a linear search. We scan the instructions linearly 5567 // and record each time that a new interval starts, by placing it in a set. 5568 // If we find this value in the multi-map then we remove it from the set. 5569 // The max register usage is the maximum size of the set. 5570 // We also search for instructions that are defined outside the loop, but are 5571 // used inside the loop. We need this number separately from the max-interval 5572 // usage number because when we unroll, loop-invariant values do not take 5573 // more register. 5574 LoopBlocksDFS DFS(TheLoop); 5575 DFS.perform(LI); 5576 5577 RegisterUsage RU; 5578 5579 // Each 'key' in the map opens a new interval. The values 5580 // of the map are the index of the 'last seen' usage of the 5581 // instruction that is the key. 5582 using IntervalMap = DenseMap<Instruction *, unsigned>; 5583 5584 // Maps instruction to its index. 5585 SmallVector<Instruction *, 64> IdxToInstr; 5586 // Marks the end of each interval. 5587 IntervalMap EndPoint; 5588 // Saves the list of instruction indices that are used in the loop. 5589 SmallPtrSet<Instruction *, 8> Ends; 5590 // Saves the list of values that are used in the loop but are defined outside 5591 // the loop (not including non-instruction values such as arguments and 5592 // constants). 5593 SmallSetVector<Instruction *, 8> LoopInvariants; 5594 5595 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5596 for (Instruction &I : BB->instructionsWithoutDebug()) { 5597 IdxToInstr.push_back(&I); 5598 5599 // Save the end location of each USE. 5600 for (Value *U : I.operands()) { 5601 auto *Instr = dyn_cast<Instruction>(U); 5602 5603 // Ignore non-instruction values such as arguments, constants, etc. 5604 // FIXME: Might need some motivation why these values are ignored. If 5605 // for example an argument is used inside the loop it will increase the 5606 // register pressure (so shouldn't we add it to LoopInvariants). 5607 if (!Instr) 5608 continue; 5609 5610 // If this instruction is outside the loop then record it and continue. 5611 if (!TheLoop->contains(Instr)) { 5612 LoopInvariants.insert(Instr); 5613 continue; 5614 } 5615 5616 // Overwrite previous end points. 5617 EndPoint[Instr] = IdxToInstr.size(); 5618 Ends.insert(Instr); 5619 } 5620 } 5621 } 5622 5623 // Saves the list of intervals that end with the index in 'key'. 5624 using InstrList = SmallVector<Instruction *, 2>; 5625 DenseMap<unsigned, InstrList> TransposeEnds; 5626 5627 // Transpose the EndPoints to a list of values that end at each index. 
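// As a purely illustrative example (hypothetical values, not taken from any
// real loop): if instruction %a has its last in-loop use recorded at
// position 4 and %b at position 3, TransposeEnds maps 3 -> {%b} and
// 4 -> {%a}, so the forward scan below closes %b's interval when it reaches
// position 3 and %a's when it reaches position 4. The peak size of
// OpenIntervals is what drives the per-class maximums recorded in MaxUsages.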
5628 for (auto &Interval : EndPoint) 5629 TransposeEnds[Interval.second].push_back(Interval.first); 5630 5631 SmallPtrSet<Instruction *, 8> OpenIntervals; 5632 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5633 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5634 5635 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5636 5637 const auto &TTICapture = TTI; 5638 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5639 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5640 return 0; 5641 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5642 }; 5643 5644 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5645 Instruction *I = IdxToInstr[i]; 5646 5647 // Remove all of the instructions that end at this location. 5648 InstrList &List = TransposeEnds[i]; 5649 for (Instruction *ToRemove : List) 5650 OpenIntervals.erase(ToRemove); 5651 5652 // Ignore instructions that are never used within the loop. 5653 if (!Ends.count(I)) 5654 continue; 5655 5656 // Skip ignored values. 5657 if (ValuesToIgnore.count(I)) 5658 continue; 5659 5660 collectInLoopReductions(); 5661 5662 // For each VF find the maximum usage of registers. 5663 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5664 // Count the number of registers used, per register class, given all open 5665 // intervals. 5666 // Note that elements in this SmallMapVector will be default constructed 5667 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 5668 // there is no previous entry for ClassID. 5669 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5670 5671 if (VFs[j].isScalar()) { 5672 for (auto *Inst : OpenIntervals) { 5673 unsigned ClassID = 5674 TTI.getRegisterClassForType(false, Inst->getType()); 5675 // FIXME: The target might use more than one register for the type 5676 // even in the scalar case. 5677 RegUsage[ClassID] += 1; 5678 } 5679 } else { 5680 collectUniformsAndScalars(VFs[j]); 5681 for (auto *Inst : OpenIntervals) { 5682 // Skip ignored values for VF > 1. 5683 if (VecValuesToIgnore.count(Inst)) 5684 continue; 5685 if (isScalarAfterVectorization(Inst, VFs[j])) { 5686 unsigned ClassID = 5687 TTI.getRegisterClassForType(false, Inst->getType()); 5688 // FIXME: The target might use more than one register for the type 5689 // even in the scalar case. 5690 RegUsage[ClassID] += 1; 5691 } else { 5692 unsigned ClassID = 5693 TTI.getRegisterClassForType(true, Inst->getType()); 5694 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5695 } 5696 } 5697 } 5698 5699 for (auto& pair : RegUsage) { 5700 auto &Entry = MaxUsages[j][pair.first]; 5701 Entry = std::max(Entry, pair.second); 5702 } 5703 } 5704 5705 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5706 << OpenIntervals.size() << '\n'); 5707 5708 // Add the current instruction to the list of open intervals. 5709 OpenIntervals.insert(I); 5710 } 5711 5712 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5713 // Note that elements in this SmallMapVector will be default constructed 5714 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 5715 // there is no previous entry for ClassID. 5716 SmallMapVector<unsigned, unsigned, 4> Invariant; 5717 5718 for (auto *Inst : LoopInvariants) { 5719 // FIXME: The target might use more than one register for the type 5720 // even in the scalar case. 
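// For illustration (hypothetical case): a loop-invariant i32 whose only
// in-loop users stay scalar after vectorization is costed below at VF = 1 in
// the scalar register class, whereas one feeding a widened instruction is
// costed at the candidate VF in the vector register class via GetRegUsage.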
5721 bool IsScalar = all_of(Inst->users(), [&](User *U) { 5722 auto *I = cast<Instruction>(U); 5723 return TheLoop != LI->getLoopFor(I->getParent()) || 5724 isScalarAfterVectorization(I, VFs[i]); 5725 }); 5726 5727 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; 5728 unsigned ClassID = 5729 TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); 5730 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); 5731 } 5732 5733 LLVM_DEBUG({ 5734 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5735 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5736 << " item\n"; 5737 for (const auto &pair : MaxUsages[i]) { 5738 dbgs() << "LV(REG): RegisterClass: " 5739 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5740 << " registers\n"; 5741 } 5742 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5743 << " item\n"; 5744 for (const auto &pair : Invariant) { 5745 dbgs() << "LV(REG): RegisterClass: " 5746 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5747 << " registers\n"; 5748 } 5749 }); 5750 5751 RU.LoopInvariantRegs = Invariant; 5752 RU.MaxLocalUsers = MaxUsages[i]; 5753 RUs[i] = RU; 5754 } 5755 5756 return RUs; 5757 } 5758 5759 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 5760 ElementCount VF) { 5761 // TODO: Cost model for emulated masked load/store is completely 5762 // broken. This hack guides the cost model to use an artificially 5763 // high enough value to practically disable vectorization with such 5764 // operations, except where previously deployed legality hack allowed 5765 // using very low cost values. This is to avoid regressions coming simply 5766 // from moving "masked load/store" check from legality to cost model. 5767 // Masked Load/Gather emulation was previously never allowed. 5768 // Limited number of Masked Store/Scatter emulation was allowed. 5769 assert((isPredicatedInst(I)) && 5770 "Expecting a scalar emulated instruction"); 5771 return isa<LoadInst>(I) || 5772 (isa<StoreInst>(I) && 5773 NumPredStores > NumberOfStoresToPredicate); 5774 } 5775 5776 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5777 // If we aren't vectorizing the loop, or if we've already collected the 5778 // instructions to scalarize, there's nothing to do. Collection may already 5779 // have occurred if we have a user-selected VF and are now computing the 5780 // expected cost for interleaving. 5781 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) 5782 return; 5783 5784 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5785 // not profitable to scalarize any instructions, the presence of VF in the 5786 // map will indicate that we've analyzed it already. 5787 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5788 5789 PredicatedBBsAfterVectorization[VF].clear(); 5790 5791 // Find all the instructions that are scalar with predication in the loop and 5792 // determine if it would be better to not if-convert the blocks they are in. 5793 // If so, we also record the instructions to scalarize. 5794 for (BasicBlock *BB : TheLoop->blocks()) { 5795 if (!blockNeedsPredicationForAnyReason(BB)) 5796 continue; 5797 for (Instruction &I : *BB) 5798 if (isScalarWithPredication(&I, VF)) { 5799 ScalarCostsTy ScalarCosts; 5800 // Do not apply discount if scalable, because that would lead to 5801 // invalid scalarization costs. 5802 // Do not apply discount logic if hacked cost is needed 5803 // for emulated masked memrefs. 
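// Otherwise, a non-negative discount means the if-converted (vector) form of
// the chain is at least as expensive as keeping it scalar and predicated. For
// example (illustrative numbers only), a vector cost of 10 against a
// probability-scaled scalar cost of 7 yields a discount of +3, so the chain's
// per-instruction scalar costs are recorded in ScalarCostsVF below.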
5804 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 5805 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5806 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5807 // Remember that BB will remain after vectorization. 5808 PredicatedBBsAfterVectorization[VF].insert(BB); 5809 } 5810 } 5811 } 5812 5813 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 5814 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 5815 assert(!isUniformAfterVectorization(PredInst, VF) && 5816 "Instruction marked uniform-after-vectorization will be predicated"); 5817 5818 // Initialize the discount to zero, meaning that the scalar version and the 5819 // vector version cost the same. 5820 InstructionCost Discount = 0; 5821 5822 // Holds instructions to analyze. The instructions we visit are mapped in 5823 // ScalarCosts. Those instructions are the ones that would be scalarized if 5824 // we find that the scalar version costs less. 5825 SmallVector<Instruction *, 8> Worklist; 5826 5827 // Returns true if the given instruction can be scalarized. 5828 auto canBeScalarized = [&](Instruction *I) -> bool { 5829 // We only attempt to scalarize instructions forming a single-use chain 5830 // from the original predicated block that would otherwise be vectorized. 5831 // Although not strictly necessary, we give up on instructions we know will 5832 // already be scalar to avoid traversing chains that are unlikely to be 5833 // beneficial. 5834 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5835 isScalarAfterVectorization(I, VF)) 5836 return false; 5837 5838 // If the instruction is scalar with predication, it will be analyzed 5839 // separately. We ignore it within the context of PredInst. 5840 if (isScalarWithPredication(I, VF)) 5841 return false; 5842 5843 // If any of the instruction's operands are uniform after vectorization, 5844 // the instruction cannot be scalarized. This prevents, for example, a 5845 // masked load from being scalarized. 5846 // 5847 // We assume we will only emit a value for lane zero of an instruction 5848 // marked uniform after vectorization, rather than VF identical values. 5849 // Thus, if we scalarize an instruction that uses a uniform, we would 5850 // create uses of values corresponding to the lanes we aren't emitting code 5851 // for. This behavior can be changed by allowing getScalarValue to clone 5852 // the lane zero values for uniforms rather than asserting. 5853 for (Use &U : I->operands()) 5854 if (auto *J = dyn_cast<Instruction>(U.get())) 5855 if (isUniformAfterVectorization(J, VF)) 5856 return false; 5857 5858 // Otherwise, we can scalarize the instruction. 5859 return true; 5860 }; 5861 5862 // Compute the expected cost discount from scalarizing the entire expression 5863 // feeding the predicated instruction. We currently only consider expressions 5864 // that are single-use instruction chains. 5865 Worklist.push_back(PredInst); 5866 while (!Worklist.empty()) { 5867 Instruction *I = Worklist.pop_back_val(); 5868 5869 // If we've already analyzed the instruction, there's nothing to do. 5870 if (ScalarCosts.contains(I)) 5871 continue; 5872 5873 // Compute the cost of the vector instruction. Note that this cost already 5874 // includes the scalarization overhead of the predicated instruction. 5875 InstructionCost VectorCost = getInstructionCost(I, VF).first; 5876 5877 // Compute the cost of the scalarized instruction. 
This cost is the cost of 5878 // the instruction as if it wasn't if-converted and instead remained in the 5879 // predicated block. We will scale this cost by block probability after 5880 // computing the scalarization overhead. 5881 InstructionCost ScalarCost = 5882 VF.getFixedValue() * 5883 getInstructionCost(I, ElementCount::getFixed(1)).first; 5884 5885 // Compute the scalarization overhead of needed insertelement instructions 5886 // and phi nodes. 5887 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5888 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 5889 ScalarCost += TTI.getScalarizationOverhead( 5890 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5891 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 5892 /*Extract*/ false, CostKind); 5893 ScalarCost += 5894 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 5895 } 5896 5897 // Compute the scalarization overhead of needed extractelement 5898 // instructions. For each of the instruction's operands, if the operand can 5899 // be scalarized, add it to the worklist; otherwise, account for the 5900 // overhead. 5901 for (Use &U : I->operands()) 5902 if (auto *J = dyn_cast<Instruction>(U.get())) { 5903 assert(VectorType::isValidElementType(J->getType()) && 5904 "Instruction has non-scalar type"); 5905 if (canBeScalarized(J)) 5906 Worklist.push_back(J); 5907 else if (needsExtract(J, VF)) { 5908 ScalarCost += TTI.getScalarizationOverhead( 5909 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5910 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 5911 /*Extract*/ true, CostKind); 5912 } 5913 } 5914 5915 // Scale the total scalar cost by block probability. 5916 ScalarCost /= getReciprocalPredBlockProb(); 5917 5918 // Compute the discount. A non-negative discount means the vector version 5919 // of the instruction costs more, and scalarizing would be beneficial. 5920 Discount += VectorCost - ScalarCost; 5921 ScalarCosts[I] = ScalarCost; 5922 } 5923 5924 return Discount; 5925 } 5926 5927 LoopVectorizationCostModel::VectorizationCostTy 5928 LoopVectorizationCostModel::expectedCost( 5929 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 5930 VectorizationCostTy Cost; 5931 5932 // For each block. 5933 for (BasicBlock *BB : TheLoop->blocks()) { 5934 VectorizationCostTy BlockCost; 5935 5936 // For each instruction in the old loop. 5937 for (Instruction &I : BB->instructionsWithoutDebug()) { 5938 // Skip ignored values. 5939 if (ValuesToIgnore.count(&I) || 5940 (VF.isVector() && VecValuesToIgnore.count(&I))) 5941 continue; 5942 5943 VectorizationCostTy C = getInstructionCost(&I, VF); 5944 5945 // Check if we should override the cost. 5946 if (C.first.isValid() && 5947 ForceTargetInstructionCost.getNumOccurrences() > 0) 5948 C.first = InstructionCost(ForceTargetInstructionCost); 5949 5950 // Keep a list of instructions with invalid costs. 5951 if (Invalid && !C.first.isValid()) 5952 Invalid->emplace_back(&I, VF); 5953 5954 BlockCost.first += C.first; 5955 BlockCost.second |= C.second; 5956 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5957 << " for VF " << VF << " For instruction: " << I 5958 << '\n'); 5959 } 5960 5961 // If we are vectorizing a predicated block, it will have been 5962 // if-converted. This means that the block's instructions (aside from 5963 // stores and instructions that may divide by zero) will now be 5964 // unconditionally executed. For the scalar case, we may not always execute 5965 // the predicated block, if it is an if-else block. 
Thus, scale the block's 5966 // cost by the probability of executing it. blockNeedsPredication from 5967 // Legal is used so as to not include all blocks in tail folded loops. 5968 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 5969 BlockCost.first /= getReciprocalPredBlockProb(); 5970 5971 Cost.first += BlockCost.first; 5972 Cost.second |= BlockCost.second; 5973 } 5974 5975 return Cost; 5976 } 5977 5978 /// Gets Address Access SCEV after verifying that the access pattern 5979 /// is loop invariant except the induction variable dependence. 5980 /// 5981 /// This SCEV can be sent to the Target in order to estimate the address 5982 /// calculation cost. 5983 static const SCEV *getAddressAccessSCEV( 5984 Value *Ptr, 5985 LoopVectorizationLegality *Legal, 5986 PredicatedScalarEvolution &PSE, 5987 const Loop *TheLoop) { 5988 5989 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5990 if (!Gep) 5991 return nullptr; 5992 5993 // We are looking for a gep with all loop invariant indices except for one 5994 // which should be an induction variable. 5995 auto SE = PSE.getSE(); 5996 unsigned NumOperands = Gep->getNumOperands(); 5997 for (unsigned i = 1; i < NumOperands; ++i) { 5998 Value *Opd = Gep->getOperand(i); 5999 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6000 !Legal->isInductionVariable(Opd)) 6001 return nullptr; 6002 } 6003 6004 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6005 return PSE.getSCEV(Ptr); 6006 } 6007 6008 InstructionCost 6009 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6010 ElementCount VF) { 6011 assert(VF.isVector() && 6012 "Scalarization cost of instruction implies vectorization."); 6013 if (VF.isScalable()) 6014 return InstructionCost::getInvalid(); 6015 6016 Type *ValTy = getLoadStoreType(I); 6017 auto SE = PSE.getSE(); 6018 6019 unsigned AS = getLoadStoreAddressSpace(I); 6020 Value *Ptr = getLoadStorePointerOperand(I); 6021 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6022 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6023 // that it is being called from this specific place. 6024 6025 // Figure out whether the access is strided and get the stride value 6026 // if it's known in compile time 6027 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6028 6029 // Get the cost of the scalar memory instruction and address computation. 6030 InstructionCost Cost = 6031 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6032 6033 // Don't pass *I here, since it is scalar but will actually be part of a 6034 // vectorized loop where the user of it is a vectorized instruction. 6035 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6036 const Align Alignment = getLoadStoreAlignment(I); 6037 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 6038 ValTy->getScalarType(), 6039 Alignment, AS, CostKind); 6040 6041 // Get the overhead of the extractelement and insertelement instructions 6042 // we might create due to scalarization. 6043 Cost += getScalarizationOverhead(I, VF, CostKind); 6044 6045 // If we have a predicated load/store, it will need extra i1 extracts and 6046 // conditional branches, but may not be executed for each vector lane. Scale 6047 // the cost by the probability of executing the predicated block. 
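// For illustration, assuming the usual 50% block-probability model (i.e. a
// reciprocal of 2; the other numbers are hypothetical): a scalarized
// predicated load whose per-lane address, memory-op and insert/extract costs
// sum to 12 is halved to 6 here, while the i1 extracts and branches added
// below are charged at full rate, since the compare and branch are emitted
// for every lane whether or not the predicated block executes.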
6048 if (isPredicatedInst(I)) { 6049 Cost /= getReciprocalPredBlockProb(); 6050 6051 // Add the cost of an i1 extract and a branch 6052 auto *Vec_i1Ty = 6053 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6054 Cost += TTI.getScalarizationOverhead( 6055 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6056 /*Insert=*/false, /*Extract=*/true, CostKind); 6057 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 6058 6059 if (useEmulatedMaskMemRefHack(I, VF)) 6060 // Artificially setting to a high enough value to practically disable 6061 // vectorization with such operations. 6062 Cost = 3000000; 6063 } 6064 6065 return Cost; 6066 } 6067 6068 InstructionCost 6069 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6070 ElementCount VF) { 6071 Type *ValTy = getLoadStoreType(I); 6072 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6073 Value *Ptr = getLoadStorePointerOperand(I); 6074 unsigned AS = getLoadStoreAddressSpace(I); 6075 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6076 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6077 6078 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6079 "Stride should be 1 or -1 for consecutive memory access"); 6080 const Align Alignment = getLoadStoreAlignment(I); 6081 InstructionCost Cost = 0; 6082 if (Legal->isMaskRequired(I)) { 6083 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6084 CostKind); 6085 } else { 6086 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6087 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6088 CostKind, OpInfo, I); 6089 } 6090 6091 bool Reverse = ConsecutiveStride < 0; 6092 if (Reverse) 6093 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6094 std::nullopt, CostKind, 0); 6095 return Cost; 6096 } 6097 6098 InstructionCost 6099 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6100 ElementCount VF) { 6101 assert(Legal->isUniformMemOp(*I, VF)); 6102 6103 Type *ValTy = getLoadStoreType(I); 6104 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6105 const Align Alignment = getLoadStoreAlignment(I); 6106 unsigned AS = getLoadStoreAddressSpace(I); 6107 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6108 if (isa<LoadInst>(I)) { 6109 return TTI.getAddressComputationCost(ValTy) + 6110 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6111 CostKind) + 6112 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6113 } 6114 StoreInst *SI = cast<StoreInst>(I); 6115 6116 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 6117 return TTI.getAddressComputationCost(ValTy) + 6118 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6119 CostKind) + 6120 (isLoopInvariantStoreValue 6121 ? 
0 6122 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6123 CostKind, VF.getKnownMinValue() - 1)); 6124 } 6125 6126 InstructionCost 6127 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6128 ElementCount VF) { 6129 Type *ValTy = getLoadStoreType(I); 6130 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6131 const Align Alignment = getLoadStoreAlignment(I); 6132 const Value *Ptr = getLoadStorePointerOperand(I); 6133 6134 return TTI.getAddressComputationCost(VectorTy) + 6135 TTI.getGatherScatterOpCost( 6136 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6137 TargetTransformInfo::TCK_RecipThroughput, I); 6138 } 6139 6140 InstructionCost 6141 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6142 ElementCount VF) { 6143 Type *ValTy = getLoadStoreType(I); 6144 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6145 unsigned AS = getLoadStoreAddressSpace(I); 6146 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6147 6148 auto Group = getInterleavedAccessGroup(I); 6149 assert(Group && "Fail to get an interleaved access group."); 6150 6151 unsigned InterleaveFactor = Group->getFactor(); 6152 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6153 6154 // Holds the indices of existing members in the interleaved group. 6155 SmallVector<unsigned, 4> Indices; 6156 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6157 if (Group->getMember(IF)) 6158 Indices.push_back(IF); 6159 6160 // Calculate the cost of the whole interleaved group. 6161 bool UseMaskForGaps = 6162 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6163 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6164 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6165 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6166 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); 6167 6168 if (Group->isReverse()) { 6169 // TODO: Add support for reversed masked interleaved access. 6170 assert(!Legal->isMaskRequired(I) && 6171 "Reverse masked interleaved access not supported."); 6172 Cost += Group->getNumMembers() * 6173 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6174 std::nullopt, CostKind, 0); 6175 } 6176 return Cost; 6177 } 6178 6179 std::optional<InstructionCost> 6180 LoopVectorizationCostModel::getReductionPatternCost( 6181 Instruction *I, ElementCount VF, Type *Ty, 6182 TTI::TargetCostKind CostKind) const { 6183 using namespace llvm::PatternMatch; 6184 // Early exit for no inloop reductions 6185 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6186 return std::nullopt; 6187 auto *VectorTy = cast<VectorType>(Ty); 6188 6189 // We are looking for a pattern of, and finding the minimal acceptable cost: 6190 // reduce(mul(ext(A), ext(B))) or 6191 // reduce(mul(A, B)) or 6192 // reduce(ext(A)) or 6193 // reduce(A). 6194 // The basic idea is that we walk down the tree to do that, finding the root 6195 // reduction instruction in InLoopReductionImmediateChains. From there we find 6196 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6197 // of the components. If the reduction cost is lower then we return it for the 6198 // reduction instruction and 0 for the other instructions in the pattern. If 6199 // it is not we return an invalid cost specifying the orignal cost method 6200 // should be used. 
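// As a concrete (purely illustrative) shape of one such pattern, an i32 add
// reduction over sign-extended i8 inputs:
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul   = mul i32 %a.ext, %b.ext
//   %sum   = add i32 %acc, %mul
// Following the use chain from either extend reaches %sum; if the target's
// mul-accumulate reduction cost beats the summed costs of the extends, the
// mul and the plain reduction, the whole chain is priced as that single
// reduction and the inner instructions report a cost of 0.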
6201 Instruction *RetI = I; 6202 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6203 if (!RetI->hasOneUser()) 6204 return std::nullopt; 6205 RetI = RetI->user_back(); 6206 } 6207 6208 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 6209 RetI->user_back()->getOpcode() == Instruction::Add) { 6210 RetI = RetI->user_back(); 6211 } 6212 6213 // Test if the found instruction is a reduction, and if not return an invalid 6214 // cost specifying the parent to use the original cost modelling. 6215 if (!InLoopReductionImmediateChains.count(RetI)) 6216 return std::nullopt; 6217 6218 // Find the reduction this chain is a part of and calculate the basic cost of 6219 // the reduction on its own. 6220 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); 6221 Instruction *ReductionPhi = LastChain; 6222 while (!isa<PHINode>(ReductionPhi)) 6223 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); 6224 6225 const RecurrenceDescriptor &RdxDesc = 6226 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6227 6228 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6229 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6230 6231 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6232 // normal fmul instruction to the cost of the fadd reduction. 6233 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6234 BaseCost += 6235 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6236 6237 // If we're using ordered reductions then we can just return the base cost 6238 // here, since getArithmeticReductionCost calculates the full ordered 6239 // reduction cost when FP reassociation is not allowed. 6240 if (useOrderedReductions(RdxDesc)) 6241 return BaseCost; 6242 6243 // Get the operand that was not the reduction chain and match it to one of the 6244 // patterns, returning the better cost if it is found. 6245 Instruction *RedOp = RetI->getOperand(1) == LastChain 6246 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6247 : dyn_cast<Instruction>(RetI->getOperand(1)); 6248 6249 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6250 6251 Instruction *Op0, *Op1; 6252 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6253 match(RedOp, 6254 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6255 match(Op0, m_ZExtOrSExt(m_Value())) && 6256 Op0->getOpcode() == Op1->getOpcode() && 6257 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6258 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6259 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6260 6261 // Matched reduce.add(ext(mul(ext(A), ext(B))) 6262 // Note that the extend opcodes need to all match, or if A==B they will have 6263 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6264 // which is equally fine. 
6265 bool IsUnsigned = isa<ZExtInst>(Op0); 6266 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6267 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6268 6269 InstructionCost ExtCost = 6270 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6271 TTI::CastContextHint::None, CostKind, Op0); 6272 InstructionCost MulCost = 6273 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6274 InstructionCost Ext2Cost = 6275 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6276 TTI::CastContextHint::None, CostKind, RedOp); 6277 6278 InstructionCost RedCost = TTI.getMulAccReductionCost( 6279 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6280 6281 if (RedCost.isValid() && 6282 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6283 return I == RetI ? RedCost : 0; 6284 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6285 !TheLoop->isLoopInvariant(RedOp)) { 6286 // Matched reduce(ext(A)) 6287 bool IsUnsigned = isa<ZExtInst>(RedOp); 6288 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6289 InstructionCost RedCost = TTI.getExtendedReductionCost( 6290 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6291 RdxDesc.getFastMathFlags(), CostKind); 6292 6293 InstructionCost ExtCost = 6294 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6295 TTI::CastContextHint::None, CostKind, RedOp); 6296 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6297 return I == RetI ? RedCost : 0; 6298 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6299 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6300 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6301 Op0->getOpcode() == Op1->getOpcode() && 6302 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6303 bool IsUnsigned = isa<ZExtInst>(Op0); 6304 Type *Op0Ty = Op0->getOperand(0)->getType(); 6305 Type *Op1Ty = Op1->getOperand(0)->getType(); 6306 Type *LargestOpTy = 6307 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6308 : Op0Ty; 6309 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6310 6311 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 6312 // different sizes. We take the largest type as the ext to reduce, and add 6313 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6314 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6315 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6316 TTI::CastContextHint::None, CostKind, Op0); 6317 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6318 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6319 TTI::CastContextHint::None, CostKind, Op1); 6320 InstructionCost MulCost = 6321 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6322 6323 InstructionCost RedCost = TTI.getMulAccReductionCost( 6324 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6325 InstructionCost ExtraExtCost = 0; 6326 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6327 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6328 ExtraExtCost = TTI.getCastInstrCost( 6329 ExtraExtOp->getOpcode(), ExtType, 6330 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6331 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6332 } 6333 6334 if (RedCost.isValid() && 6335 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6336 return I == RetI ? 
RedCost : 0; 6337 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6338 // Matched reduce.add(mul()) 6339 InstructionCost MulCost = 6340 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6341 6342 InstructionCost RedCost = TTI.getMulAccReductionCost( 6343 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 6344 6345 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6346 return I == RetI ? RedCost : 0; 6347 } 6348 } 6349 6350 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 6351 } 6352 6353 InstructionCost 6354 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6355 ElementCount VF) { 6356 // Calculate scalar cost only. Vectorization cost should be ready at this 6357 // moment. 6358 if (VF.isScalar()) { 6359 Type *ValTy = getLoadStoreType(I); 6360 const Align Alignment = getLoadStoreAlignment(I); 6361 unsigned AS = getLoadStoreAddressSpace(I); 6362 6363 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6364 return TTI.getAddressComputationCost(ValTy) + 6365 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6366 TTI::TCK_RecipThroughput, OpInfo, I); 6367 } 6368 return getWideningCost(I, VF); 6369 } 6370 6371 LoopVectorizationCostModel::VectorizationCostTy 6372 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6373 ElementCount VF) { 6374 // If we know that this instruction will remain uniform, check the cost of 6375 // the scalar version. 6376 if (isUniformAfterVectorization(I, VF)) 6377 VF = ElementCount::getFixed(1); 6378 6379 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6380 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6381 6382 // Forced scalars do not have any scalarization overhead. 6383 auto ForcedScalar = ForcedScalars.find(VF); 6384 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6385 auto InstSet = ForcedScalar->second; 6386 if (InstSet.count(I)) 6387 return VectorizationCostTy( 6388 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6389 VF.getKnownMinValue()), 6390 false); 6391 } 6392 6393 Type *VectorTy; 6394 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6395 6396 bool TypeNotScalarized = false; 6397 if (VF.isVector() && VectorTy->isVectorTy()) { 6398 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6399 if (VF.isScalable()) 6400 // <vscale x 1 x iN> is assumed to be profitable over iN because 6401 // scalable registers are a distinct register class from scalar ones. 6402 // If we ever find a target which wants to lower scalable vectors 6403 // back to scalars, we'll need to update this code to explicitly 6404 // ask TTI about the register class uses for each part. 6405 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6406 else 6407 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6408 } else 6409 C = InstructionCost::getInvalid(); 6410 } 6411 return VectorizationCostTy(C, TypeNotScalarized); 6412 } 6413 6414 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 6415 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 6416 6417 // There is no mechanism yet to create a scalable scalarization loop, 6418 // so this is currently Invalid. 
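// For a fixed VF the overhead is concrete; e.g. (illustrative) at VF = 4 a
// scalarized instruction with a non-void result is charged four
// insertelements to rebuild its vector result, plus roughly four
// extractelements per vector operand that still needs extraction, subject to
// the target hooks checked below (scalar addressing preference and efficient
// element load/store support).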
6419 if (VF.isScalable()) 6420 return InstructionCost::getInvalid(); 6421 6422 if (VF.isScalar()) 6423 return 0; 6424 6425 InstructionCost Cost = 0; 6426 Type *RetTy = ToVectorTy(I->getType(), VF); 6427 if (!RetTy->isVoidTy() && 6428 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6429 Cost += TTI.getScalarizationOverhead( 6430 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), 6431 /*Insert*/ true, 6432 /*Extract*/ false, CostKind); 6433 6434 // Some targets keep addresses scalar. 6435 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6436 return Cost; 6437 6438 // Some targets support efficient element stores. 6439 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6440 return Cost; 6441 6442 // Collect operands to consider. 6443 CallInst *CI = dyn_cast<CallInst>(I); 6444 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6445 6446 // Skip operands that do not require extraction/scalarization and do not incur 6447 // any overhead. 6448 SmallVector<Type *> Tys; 6449 for (auto *V : filterExtractingOperands(Ops, VF)) 6450 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6451 return Cost + TTI.getOperandsScalarizationOverhead( 6452 filterExtractingOperands(Ops, VF), Tys, CostKind); 6453 } 6454 6455 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6456 if (VF.isScalar()) 6457 return; 6458 NumPredStores = 0; 6459 for (BasicBlock *BB : TheLoop->blocks()) { 6460 // For each instruction in the old loop. 6461 for (Instruction &I : *BB) { 6462 Value *Ptr = getLoadStorePointerOperand(&I); 6463 if (!Ptr) 6464 continue; 6465 6466 // TODO: We should generate better code and update the cost model for 6467 // predicated uniform stores. Today they are treated as any other 6468 // predicated store (see added test cases in 6469 // invariant-store-vectorization.ll). 6470 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6471 NumPredStores++; 6472 6473 if (Legal->isUniformMemOp(I, VF)) { 6474 auto isLegalToScalarize = [&]() { 6475 if (!VF.isScalable()) 6476 // Scalarization of fixed length vectors "just works". 6477 return true; 6478 6479 // We have dedicated lowering for unpredicated uniform loads and 6480 // stores. Note that even with tail folding we know that at least 6481 // one lane is active (i.e. generalized predication is not possible 6482 // here), and the logic below depends on this fact. 6483 if (!foldTailByMasking()) 6484 return true; 6485 6486 // For scalable vectors, a uniform memop load is always 6487 // uniform-by-parts and we know how to scalarize that. 6488 if (isa<LoadInst>(I)) 6489 return true; 6490 6491 // A uniform store isn't neccessarily uniform-by-part 6492 // and we can't assume scalarization. 6493 auto &SI = cast<StoreInst>(I); 6494 return TheLoop->isLoopInvariant(SI.getValueOperand()); 6495 }; 6496 6497 const InstructionCost GatherScatterCost = 6498 isLegalGatherOrScatter(&I, VF) ? 6499 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); 6500 6501 // Load: Scalar load + broadcast 6502 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6503 // FIXME: This cost is a significant under-estimate for tail folded 6504 // memory ops. 6505 const InstructionCost ScalarizationCost = isLegalToScalarize() ? 6506 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid(); 6507 6508 // Choose better solution for the current VF, Note that Invalid 6509 // costs compare as maximumal large. 
If both are invalid, we get 6510 // scalable invalid which signals a failure and a vectorization abort. 6511 if (GatherScatterCost < ScalarizationCost) 6512 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 6513 else 6514 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 6515 continue; 6516 } 6517 6518 // We assume that widening is the best solution when possible. 6519 if (memoryInstructionCanBeWidened(&I, VF)) { 6520 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6521 int ConsecutiveStride = Legal->isConsecutivePtr( 6522 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6523 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6524 "Expected consecutive stride."); 6525 InstWidening Decision = 6526 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6527 setWideningDecision(&I, VF, Decision, Cost); 6528 continue; 6529 } 6530 6531 // Choose between Interleaving, Gather/Scatter or Scalarization. 6532 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6533 unsigned NumAccesses = 1; 6534 if (isAccessInterleaved(&I)) { 6535 auto Group = getInterleavedAccessGroup(&I); 6536 assert(Group && "Fail to get an interleaved access group."); 6537 6538 // Make one decision for the whole group. 6539 if (getWideningDecision(&I, VF) != CM_Unknown) 6540 continue; 6541 6542 NumAccesses = Group->getNumMembers(); 6543 if (interleavedAccessCanBeWidened(&I, VF)) 6544 InterleaveCost = getInterleaveGroupCost(&I, VF); 6545 } 6546 6547 InstructionCost GatherScatterCost = 6548 isLegalGatherOrScatter(&I, VF) 6549 ? getGatherScatterCost(&I, VF) * NumAccesses 6550 : InstructionCost::getInvalid(); 6551 6552 InstructionCost ScalarizationCost = 6553 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6554 6555 // Choose better solution for the current VF, 6556 // write down this decision and use it during vectorization. 6557 InstructionCost Cost; 6558 InstWidening Decision; 6559 if (InterleaveCost <= GatherScatterCost && 6560 InterleaveCost < ScalarizationCost) { 6561 Decision = CM_Interleave; 6562 Cost = InterleaveCost; 6563 } else if (GatherScatterCost < ScalarizationCost) { 6564 Decision = CM_GatherScatter; 6565 Cost = GatherScatterCost; 6566 } else { 6567 Decision = CM_Scalarize; 6568 Cost = ScalarizationCost; 6569 } 6570 // If the instructions belongs to an interleave group, the whole group 6571 // receives the same decision. The whole group receives the cost, but 6572 // the cost will actually be assigned to one instruction. 6573 if (auto Group = getInterleavedAccessGroup(&I)) 6574 setWideningDecision(Group, VF, Decision, Cost); 6575 else 6576 setWideningDecision(&I, VF, Decision, Cost); 6577 } 6578 } 6579 6580 // Make sure that any load of address and any other address computation 6581 // remains scalar unless there is gather/scatter support. This avoids 6582 // inevitable extracts into address registers, and also has the benefit of 6583 // activating LSR more, since that pass can't optimize vectorized 6584 // addresses. 6585 if (TTI.prefersVectorizedAddressing()) 6586 return; 6587 6588 // Start with all scalar pointer uses. 6589 SmallPtrSet<Instruction *, 8> AddrDefs; 6590 for (BasicBlock *BB : TheLoop->blocks()) 6591 for (Instruction &I : *BB) { 6592 Instruction *PtrDef = 6593 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6594 if (PtrDef && TheLoop->contains(PtrDef) && 6595 getWideningDecision(&I, VF) != CM_GatherScatter) 6596 AddrDefs.insert(PtrDef); 6597 } 6598 6599 // Add all instructions used to generate the addresses. 
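// Illustrative example (hypothetical IR): for a pointer that is itself
// loaded,
//   %p   = load ptr, ptr %slot
//   %val = load i32, ptr %p
// %p is an address definition, so when the target prefers scalar addressing
// the load of %p is re-costed below as VF independent scalar loads
// (CM_Scalarize) instead of one wide load, and any same-block, non-PHI
// arithmetic feeding an address is added to ForcedScalars so it is costed
// without scalarization overhead.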
6600 SmallVector<Instruction *, 4> Worklist; 6601 append_range(Worklist, AddrDefs); 6602 while (!Worklist.empty()) { 6603 Instruction *I = Worklist.pop_back_val(); 6604 for (auto &Op : I->operands()) 6605 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6606 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6607 AddrDefs.insert(InstOp).second) 6608 Worklist.push_back(InstOp); 6609 } 6610 6611 for (auto *I : AddrDefs) { 6612 if (isa<LoadInst>(I)) { 6613 // Setting the desired widening decision should ideally be handled in 6614 // by cost functions, but since this involves the task of finding out 6615 // if the loaded register is involved in an address computation, it is 6616 // instead changed here when we know this is the case. 6617 InstWidening Decision = getWideningDecision(I, VF); 6618 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6619 // Scalarize a widened load of address. 6620 setWideningDecision( 6621 I, VF, CM_Scalarize, 6622 (VF.getKnownMinValue() * 6623 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6624 else if (auto Group = getInterleavedAccessGroup(I)) { 6625 // Scalarize an interleave group of address loads. 6626 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6627 if (Instruction *Member = Group->getMember(I)) 6628 setWideningDecision( 6629 Member, VF, CM_Scalarize, 6630 (VF.getKnownMinValue() * 6631 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6632 } 6633 } 6634 } else 6635 // Make sure I gets scalarized and a cost estimate without 6636 // scalarization overhead. 6637 ForcedScalars[VF].insert(I); 6638 } 6639 } 6640 6641 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { 6642 assert(!VF.isScalar() && 6643 "Trying to set a vectorization decision for a scalar VF"); 6644 6645 for (BasicBlock *BB : TheLoop->blocks()) { 6646 // For each instruction in the old loop. 6647 for (Instruction &I : *BB) { 6648 CallInst *CI = dyn_cast<CallInst>(&I); 6649 6650 if (!CI) 6651 continue; 6652 6653 InstructionCost ScalarCost = InstructionCost::getInvalid(); 6654 InstructionCost VectorCost = InstructionCost::getInvalid(); 6655 InstructionCost IntrinsicCost = InstructionCost::getInvalid(); 6656 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6657 6658 Function *ScalarFunc = CI->getCalledFunction(); 6659 Type *ScalarRetTy = CI->getType(); 6660 SmallVector<Type *, 4> Tys, ScalarTys; 6661 bool MaskRequired = Legal->isMaskRequired(CI); 6662 for (auto &ArgOp : CI->args()) 6663 ScalarTys.push_back(ArgOp->getType()); 6664 6665 // Compute corresponding vector type for return value and arguments. 6666 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 6667 for (Type *ScalarTy : ScalarTys) 6668 Tys.push_back(ToVectorTy(ScalarTy, VF)); 6669 6670 // An in-loop reduction using an fmuladd intrinsic is a special case; 6671 // we don't want the normal cost for that intrinsic. 6672 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 6673 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) { 6674 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, 6675 getVectorIntrinsicIDForCall(CI, TLI), 6676 std::nullopt, *RedCost); 6677 continue; 6678 } 6679 6680 // Estimate cost of scalarized vector call. The source operands are 6681 // assumed to be vectors, so we need to extract individual elements from 6682 // there, execute VF scalar calls, and then gather the result into the 6683 // vector return value. 
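// As a sketch with hypothetical numbers: at VF = 4, a scalar call costing 10
// with a combined extract/insert (scalarization) overhead of 6 gives
// ScalarCost = 4 * 10 + 6 = 46, which is then weighed against the vector
// variant and intrinsic costs computed further down.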
6684 InstructionCost ScalarCallCost = 6685 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); 6686 6687 // Compute costs of unpacking argument values for the scalar calls and 6688 // packing the return values to a vector. 6689 InstructionCost ScalarizationCost = 6690 getScalarizationOverhead(CI, VF, CostKind); 6691 6692 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 6693 6694 // Find the cost of vectorizing the call, if we can find a suitable 6695 // vector variant of the function. 6696 bool UsesMask = false; 6697 VFInfo FuncInfo; 6698 Function *VecFunc = nullptr; 6699 // Search through any available variants for one we can use at this VF. 6700 for (VFInfo &Info : VFDatabase::getMappings(*CI)) { 6701 // Must match requested VF. 6702 if (Info.Shape.VF != VF) 6703 continue; 6704 6705 // Must take a mask argument if one is required 6706 if (MaskRequired && !Info.isMasked()) 6707 continue; 6708 6709 // Check that all parameter kinds are supported 6710 bool ParamsOk = true; 6711 for (VFParameter Param : Info.Shape.Parameters) { 6712 switch (Param.ParamKind) { 6713 case VFParamKind::Vector: 6714 break; 6715 case VFParamKind::OMP_Uniform: { 6716 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6717 // Make sure the scalar parameter in the loop is invariant. 6718 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), 6719 TheLoop)) 6720 ParamsOk = false; 6721 break; 6722 } 6723 case VFParamKind::OMP_Linear: { 6724 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6725 // Find the stride for the scalar parameter in this loop and see if 6726 // it matches the stride for the variant. 6727 // TODO: do we need to figure out the cost of an extract to get the 6728 // first lane? Or do we hope that it will be folded away? 6729 ScalarEvolution *SE = PSE.getSE(); 6730 const auto *SAR = 6731 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); 6732 6733 if (!SAR || SAR->getLoop() != TheLoop) { 6734 ParamsOk = false; 6735 break; 6736 } 6737 6738 const SCEVConstant *Step = 6739 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); 6740 6741 if (!Step || 6742 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) 6743 ParamsOk = false; 6744 6745 break; 6746 } 6747 case VFParamKind::GlobalPredicate: 6748 UsesMask = true; 6749 break; 6750 default: 6751 ParamsOk = false; 6752 break; 6753 } 6754 } 6755 6756 if (!ParamsOk) 6757 continue; 6758 6759 // Found a suitable candidate, stop here. 6760 VecFunc = CI->getModule()->getFunction(Info.VectorName); 6761 FuncInfo = Info; 6762 break; 6763 } 6764 6765 // Add in the cost of synthesizing a mask if one wasn't required. 6766 InstructionCost MaskCost = 0; 6767 if (VecFunc && UsesMask && !MaskRequired) 6768 MaskCost = TTI.getShuffleCost( 6769 TargetTransformInfo::SK_Broadcast, 6770 VectorType::get(IntegerType::getInt1Ty( 6771 VecFunc->getFunctionType()->getContext()), 6772 VF)); 6773 6774 if (TLI && VecFunc && !CI->isNoBuiltin()) 6775 VectorCost = 6776 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 6777 6778 // Find the cost of an intrinsic; some targets may have instructions that 6779 // perform the operation without needing an actual call. 
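// With the hypothetical costs ScalarCost = 46, VectorCost = 20 and
// IntrinsicCost = 12, the comparisons below settle on CM_IntrinsicCall at
// cost 12; because both comparisons use <=, ties favour the vector call over
// scalarization and the intrinsic over the vector call.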
6780 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); 6781 if (IID != Intrinsic::not_intrinsic) 6782 IntrinsicCost = getVectorIntrinsicCost(CI, VF); 6783 6784 InstructionCost Cost = ScalarCost; 6785 InstWidening Decision = CM_Scalarize; 6786 6787 if (VectorCost <= Cost) { 6788 Cost = VectorCost; 6789 Decision = CM_VectorCall; 6790 } 6791 6792 if (IntrinsicCost <= Cost) { 6793 Cost = IntrinsicCost; 6794 Decision = CM_IntrinsicCall; 6795 } 6796 6797 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, 6798 FuncInfo.getParamIndexForOptionalMask(), Cost); 6799 } 6800 } 6801 } 6802 6803 InstructionCost 6804 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 6805 Type *&VectorTy) { 6806 Type *RetTy = I->getType(); 6807 if (canTruncateToMinimalBitwidth(I, VF)) 6808 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6809 auto SE = PSE.getSE(); 6810 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6811 6812 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 6813 ElementCount VF) -> bool { 6814 if (VF.isScalar()) 6815 return true; 6816 6817 auto Scalarized = InstsToScalarize.find(VF); 6818 assert(Scalarized != InstsToScalarize.end() && 6819 "VF not yet analyzed for scalarization profitability"); 6820 return !Scalarized->second.count(I) && 6821 llvm::all_of(I->users(), [&](User *U) { 6822 auto *UI = cast<Instruction>(U); 6823 return !Scalarized->second.count(UI); 6824 }); 6825 }; 6826 (void) hasSingleCopyAfterVectorization; 6827 6828 if (isScalarAfterVectorization(I, VF)) { 6829 // With the exception of GEPs and PHIs, after scalarization there should 6830 // only be one copy of the instruction generated in the loop. This is 6831 // because the VF is either 1, or any instructions that need scalarizing 6832 // have already been dealt with by the time we get here. As a result, 6833 // it means we don't have to multiply the instruction cost by VF. 6834 assert(I->getOpcode() == Instruction::GetElementPtr || 6835 I->getOpcode() == Instruction::PHI || 6836 (I->getOpcode() == Instruction::BitCast && 6837 I->getType()->isPointerTy()) || 6838 hasSingleCopyAfterVectorization(I, VF)); 6839 VectorTy = RetTy; 6840 } else 6841 VectorTy = ToVectorTy(RetTy, VF); 6842 6843 // TODO: We need to estimate the cost of intrinsic calls. 6844 switch (I->getOpcode()) { 6845 case Instruction::GetElementPtr: 6846 // We mark this instruction as zero-cost because the cost of GEPs in 6847 // vectorized code depends on whether the corresponding memory instruction 6848 // is scalarized or not. Therefore, we handle GEPs with the memory 6849 // instruction cost. 6850 return 0; 6851 case Instruction::Br: { 6852 // In cases of scalarized and predicated instructions, there will be VF 6853 // predicated blocks in the vectorized loop. Each branch around these 6854 // blocks requires also an extract of its vector compare i1 element. 6855 bool ScalarPredicatedBB = false; 6856 BranchInst *BI = cast<BranchInst>(I); 6857 if (VF.isVector() && BI->isConditional() && 6858 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 6859 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) 6860 ScalarPredicatedBB = true; 6861 6862 if (ScalarPredicatedBB) { 6863 // Not possible to scalarize scalable vector with predicated instructions. 6864 if (VF.isScalable()) 6865 return InstructionCost::getInvalid(); 6866 // Return cost for branches around scalarized and predicated blocks. 
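// Concretely (illustrative): for a fixed VF of 4 this charges four extracts
// from the <4 x i1> mask plus four scalar conditional branches, one per
// predicated lane.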
6867 auto *Vec_i1Ty = 6868 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6869 return ( 6870 TTI.getScalarizationOverhead( 6871 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), 6872 /*Insert*/ false, /*Extract*/ true, CostKind) + 6873 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6874 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6875 // The back-edge branch will remain, as will all scalar branches. 6876 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6877 else 6878 // This branch will be eliminated by if-conversion. 6879 return 0; 6880 // Note: We currently assume zero cost for an unconditional branch inside 6881 // a predicated block since it will become a fall-through, although we 6882 // may decide in the future to call TTI for all branches. 6883 } 6884 case Instruction::PHI: { 6885 auto *Phi = cast<PHINode>(I); 6886 6887 // First-order recurrences are replaced by vector shuffles inside the loop. 6888 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 6889 SmallVector<int> Mask(VF.getKnownMinValue()); 6890 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 6891 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 6892 cast<VectorType>(VectorTy), Mask, CostKind, 6893 VF.getKnownMinValue() - 1); 6894 } 6895 6896 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6897 // converted into select instructions. We require N - 1 selects per phi 6898 // node, where N is the number of incoming values. 6899 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6900 return (Phi->getNumIncomingValues() - 1) * 6901 TTI.getCmpSelInstrCost( 6902 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6903 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6904 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6905 6906 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6907 } 6908 case Instruction::UDiv: 6909 case Instruction::SDiv: 6910 case Instruction::URem: 6911 case Instruction::SRem: 6912 if (VF.isVector() && isPredicatedInst(I)) { 6913 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 6914 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 6915 ScalarCost : SafeDivisorCost; 6916 } 6917 // We've proven all lanes safe to speculate, fall through. 6918 [[fallthrough]]; 6919 case Instruction::Add: 6920 case Instruction::FAdd: 6921 case Instruction::Sub: 6922 case Instruction::FSub: 6923 case Instruction::Mul: 6924 case Instruction::FMul: 6925 case Instruction::FDiv: 6926 case Instruction::FRem: 6927 case Instruction::Shl: 6928 case Instruction::LShr: 6929 case Instruction::AShr: 6930 case Instruction::And: 6931 case Instruction::Or: 6932 case Instruction::Xor: { 6933 // If we're speculating on the stride being 1, the multiplication may 6934 // fold away. We can generalize this for all operations using the notion 6935 // of neutral elements. (TODO) 6936 if (I->getOpcode() == Instruction::Mul && 6937 (PSE.getSCEV(I->getOperand(0))->isOne() || 6938 PSE.getSCEV(I->getOperand(1))->isOne())) 6939 return 0; 6940 6941 // Detect reduction patterns 6942 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 6943 return *RedCost; 6944 6945 // Certain instructions can be cheaper to vectorize if they have a constant 6946 // second vector operand. One example of this are shifts on x86. 
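// For instance (illustrative): given
//   %sh = shl i32 %x, %amt
// with %amt defined outside the loop, the operand-info query reports
// OK_AnyValue, but since %amt is loop invariant it is upgraded below to
// OK_UniformValue, so the widened shift is priced as a shift by a splatted
// (uniform) amount, which targets such as x86 handle more cheaply than a
// fully variable vector shift.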
6947 Value *Op2 = I->getOperand(1); 6948 auto Op2Info = TTI.getOperandInfo(Op2); 6949 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 6950 Legal->isInvariant(Op2)) 6951 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 6952 6953 SmallVector<const Value *, 4> Operands(I->operand_values()); 6954 return TTI.getArithmeticInstrCost( 6955 I->getOpcode(), VectorTy, CostKind, 6956 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6957 Op2Info, Operands, I); 6958 } 6959 case Instruction::FNeg: { 6960 return TTI.getArithmeticInstrCost( 6961 I->getOpcode(), VectorTy, CostKind, 6962 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6963 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6964 I->getOperand(0), I); 6965 } 6966 case Instruction::Select: { 6967 SelectInst *SI = cast<SelectInst>(I); 6968 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6969 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6970 6971 const Value *Op0, *Op1; 6972 using namespace llvm::PatternMatch; 6973 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 6974 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 6975 // select x, y, false --> x & y 6976 // select x, true, y --> x | y 6977 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 6978 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 6979 assert(Op0->getType()->getScalarSizeInBits() == 1 && 6980 Op1->getType()->getScalarSizeInBits() == 1); 6981 6982 SmallVector<const Value *, 2> Operands{Op0, Op1}; 6983 return TTI.getArithmeticInstrCost( 6984 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy, 6985 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 6986 } 6987 6988 Type *CondTy = SI->getCondition()->getType(); 6989 if (!ScalarCond) 6990 CondTy = VectorType::get(CondTy, VF); 6991 6992 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 6993 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 6994 Pred = Cmp->getPredicate(); 6995 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 6996 CostKind, I); 6997 } 6998 case Instruction::ICmp: 6999 case Instruction::FCmp: { 7000 Type *ValTy = I->getOperand(0)->getType(); 7001 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7002 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7003 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7004 VectorTy = ToVectorTy(ValTy, VF); 7005 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7006 cast<CmpInst>(I)->getPredicate(), CostKind, 7007 I); 7008 } 7009 case Instruction::Store: 7010 case Instruction::Load: { 7011 ElementCount Width = VF; 7012 if (Width.isVector()) { 7013 InstWidening Decision = getWideningDecision(I, Width); 7014 assert(Decision != CM_Unknown && 7015 "CM decision should be taken at this point"); 7016 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 7017 return InstructionCost::getInvalid(); 7018 if (Decision == CM_Scalarize) 7019 Width = ElementCount::getFixed(1); 7020 } 7021 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7022 return getMemoryInstructionCost(I, VF); 7023 } 7024 case Instruction::BitCast: 7025 if (I->getType()->isPointerTy()) 7026 return 0; 7027 [[fallthrough]]; 7028 case Instruction::ZExt: 7029 case Instruction::SExt: 7030 case Instruction::FPToUI: 7031 case Instruction::FPToSI: 7032 case Instruction::FPExt: 7033 case Instruction::PtrToInt: 7034 case Instruction::IntToPtr: 7035 case Instruction::SIToFP: 7036 case 
Instruction::UIToFP: 7037 case Instruction::Trunc: 7038 case Instruction::FPTrunc: { 7039 // Computes the CastContextHint from a Load/Store instruction. 7040 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7041 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7042 "Expected a load or a store!"); 7043 7044 if (VF.isScalar() || !TheLoop->contains(I)) 7045 return TTI::CastContextHint::Normal; 7046 7047 switch (getWideningDecision(I, VF)) { 7048 case LoopVectorizationCostModel::CM_GatherScatter: 7049 return TTI::CastContextHint::GatherScatter; 7050 case LoopVectorizationCostModel::CM_Interleave: 7051 return TTI::CastContextHint::Interleave; 7052 case LoopVectorizationCostModel::CM_Scalarize: 7053 case LoopVectorizationCostModel::CM_Widen: 7054 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7055 : TTI::CastContextHint::Normal; 7056 case LoopVectorizationCostModel::CM_Widen_Reverse: 7057 return TTI::CastContextHint::Reversed; 7058 case LoopVectorizationCostModel::CM_Unknown: 7059 llvm_unreachable("Instr did not go through cost modelling?"); 7060 case LoopVectorizationCostModel::CM_VectorCall: 7061 case LoopVectorizationCostModel::CM_IntrinsicCall: 7062 llvm_unreachable_internal("Instr has invalid widening decision"); 7063 } 7064 7065 llvm_unreachable("Unhandled case!"); 7066 }; 7067 7068 unsigned Opcode = I->getOpcode(); 7069 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7070 // For Trunc, the context is the only user, which must be a StoreInst. 7071 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7072 if (I->hasOneUse()) 7073 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7074 CCH = ComputeCCH(Store); 7075 } 7076 // For Z/Sext, the context is the operand, which must be a LoadInst. 7077 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7078 Opcode == Instruction::FPExt) { 7079 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7080 CCH = ComputeCCH(Load); 7081 } 7082 7083 // We optimize the truncation of induction variables having constant 7084 // integer steps. The cost of these truncations is the same as the scalar 7085 // operation. 7086 if (isOptimizableIVTruncate(I, VF)) { 7087 auto *Trunc = cast<TruncInst>(I); 7088 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7089 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7090 } 7091 7092 // Detect reduction patterns 7093 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7094 return *RedCost; 7095 7096 Type *SrcScalarTy = I->getOperand(0)->getType(); 7097 Type *SrcVecTy = 7098 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7099 if (canTruncateToMinimalBitwidth(I, VF)) { 7100 // This cast is going to be shrunk. This may remove the cast or it might 7101 // turn it into slightly different cast. For example, if MinBW == 16, 7102 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7103 // 7104 // Calculate the modified src and dest types. 7105 Type *MinVecTy = VectorTy; 7106 if (Opcode == Instruction::Trunc) { 7107 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7108 VectorTy = 7109 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7110 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7111 // Leave SrcVecTy unchanged - we only shrink the destination element 7112 // type. 
7113 VectorTy = 7114 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7115 } 7116 } 7117 7118 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7119 } 7120 case Instruction::Call: 7121 return getVectorCallCost(cast<CallInst>(I), VF); 7122 case Instruction::ExtractValue: 7123 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7124 case Instruction::Alloca: 7125 // We cannot easily widen alloca to a scalable alloca, as 7126 // the result would need to be a vector of pointers. 7127 if (VF.isScalable()) 7128 return InstructionCost::getInvalid(); 7129 [[fallthrough]]; 7130 default: 7131 // This opcode is unknown. Assume that it is the same as 'mul'. 7132 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7133 } // end of switch. 7134 } 7135 7136 void LoopVectorizationCostModel::collectValuesToIgnore() { 7137 // Ignore ephemeral values. 7138 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7139 7140 // Find all stores to invariant variables. Since they are going to sink 7141 // outside the loop we do not need calculate cost for them. 7142 for (BasicBlock *BB : TheLoop->blocks()) 7143 for (Instruction &I : *BB) { 7144 StoreInst *SI; 7145 if ((SI = dyn_cast<StoreInst>(&I)) && 7146 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7147 ValuesToIgnore.insert(&I); 7148 } 7149 7150 // Ignore type-promoting instructions we identified during reduction 7151 // detection. 7152 for (const auto &Reduction : Legal->getReductionVars()) { 7153 const RecurrenceDescriptor &RedDes = Reduction.second; 7154 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7155 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7156 } 7157 // Ignore type-casting instructions we identified during induction 7158 // detection. 7159 for (const auto &Induction : Legal->getInductionVars()) { 7160 const InductionDescriptor &IndDes = Induction.second; 7161 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7162 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7163 } 7164 } 7165 7166 void LoopVectorizationCostModel::collectInLoopReductions() { 7167 for (const auto &Reduction : Legal->getReductionVars()) { 7168 PHINode *Phi = Reduction.first; 7169 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7170 7171 // We don't collect reductions that are type promoted (yet). 7172 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7173 continue; 7174 7175 // If the target would prefer this reduction to happen "in-loop", then we 7176 // want to record it as such. 7177 unsigned Opcode = RdxDesc.getOpcode(); 7178 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7179 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7180 TargetTransformInfo::ReductionFlags())) 7181 continue; 7182 7183 // Check that we can correctly put the reductions into the loop, by 7184 // finding the chain of operations that leads from the phi to the loop 7185 // exit value. 7186 SmallVector<Instruction *, 4> ReductionOperations = 7187 RdxDesc.getReductionOpChain(Phi, TheLoop); 7188 bool InLoop = !ReductionOperations.empty(); 7189 7190 if (InLoop) { 7191 InLoopReductions.insert(Phi); 7192 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7193 Instruction *LastChain = Phi; 7194 for (auto *I : ReductionOperations) { 7195 InLoopReductionImmediateChains[I] = LastChain; 7196 LastChain = I; 7197 } 7198 } 7199 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? 
"inloop" : "out of loop") 7200 << " reduction for phi: " << *Phi << "\n"); 7201 } 7202 } 7203 7204 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, 7205 DebugLoc DL, const Twine &Name) { 7206 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && 7207 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); 7208 return tryInsertInstruction( 7209 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name)); 7210 } 7211 7212 // This function will select a scalable VF if the target supports scalable 7213 // vectors and a fixed one otherwise. 7214 // TODO: we could return a pair of values that specify the max VF and 7215 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7216 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7217 // doesn't have a cost model that can choose which plan to execute if 7218 // more than one is generated. 7219 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, 7220 LoopVectorizationCostModel &CM) { 7221 unsigned WidestType; 7222 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7223 7224 TargetTransformInfo::RegisterKind RegKind = 7225 TTI.enableScalableVectorization() 7226 ? TargetTransformInfo::RGK_ScalableVector 7227 : TargetTransformInfo::RGK_FixedWidthVector; 7228 7229 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); 7230 unsigned N = RegSize.getKnownMinValue() / WidestType; 7231 return ElementCount::get(N, RegSize.isScalable()); 7232 } 7233 7234 VectorizationFactor 7235 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7236 ElementCount VF = UserVF; 7237 // Outer loop handling: They may require CFG and instruction level 7238 // transformations before even evaluating whether vectorization is profitable. 7239 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7240 // the vectorization pipeline. 7241 if (!OrigLoop->isInnermost()) { 7242 // If the user doesn't provide a vectorization factor, determine a 7243 // reasonable one. 7244 if (UserVF.isZero()) { 7245 VF = determineVPlanVF(TTI, CM); 7246 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7247 7248 // Make sure we have a VF > 1 for stress testing. 7249 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7250 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7251 << "overriding computed VF.\n"); 7252 VF = ElementCount::getFixed(4); 7253 } 7254 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && 7255 !ForceTargetSupportsScalableVectors) { 7256 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " 7257 << "not supported by the target.\n"); 7258 reportVectorizationFailure( 7259 "Scalable vectorization requested but not supported by the target", 7260 "the scalable user-specified vectorization width for outer-loop " 7261 "vectorization cannot be used because the target does not support " 7262 "scalable vectors.", 7263 "ScalableVFUnfeasible", ORE, OrigLoop); 7264 return VectorizationFactor::Disabled(); 7265 } 7266 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7267 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7268 "VF needs to be a power of two"); 7269 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7270 << "VF " << VF << " to build VPlans.\n"); 7271 buildVPlans(VF, VF); 7272 7273 // For VPlan build stress testing, we bail out after VPlan construction. 
7274 if (VPlanBuildStressTest) 7275 return VectorizationFactor::Disabled(); 7276 7277 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7278 } 7279 7280 LLVM_DEBUG( 7281 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7282 "VPlan-native path.\n"); 7283 return VectorizationFactor::Disabled(); 7284 } 7285 7286 std::optional<VectorizationFactor> 7287 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7288 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7289 CM.collectValuesToIgnore(); 7290 CM.collectElementTypesForWidening(); 7291 7292 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7293 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7294 return std::nullopt; 7295 7296 // Invalidate interleave groups if all blocks of loop will be predicated. 7297 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7298 !useMaskedInterleavedAccesses(TTI)) { 7299 LLVM_DEBUG( 7300 dbgs() 7301 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7302 "which requires masked-interleaved support.\n"); 7303 if (CM.InterleaveInfo.invalidateGroups()) 7304 // Invalidating interleave groups also requires invalidating all decisions 7305 // based on them, which includes widening decisions and uniform and scalar 7306 // values. 7307 CM.invalidateCostModelingDecisions(); 7308 } 7309 7310 ElementCount MaxUserVF = 7311 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7312 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7313 if (!UserVF.isZero() && UserVFIsLegal) { 7314 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7315 "VF needs to be a power of two"); 7316 // Collect the instructions (and their associated costs) that will be more 7317 // profitable to scalarize. 7318 CM.collectInLoopReductions(); 7319 if (CM.selectUserVectorizationFactor(UserVF)) { 7320 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7321 buildVPlansWithVPRecipes(UserVF, UserVF); 7322 if (!hasPlanWithVF(UserVF)) { 7323 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF 7324 << ".\n"); 7325 return std::nullopt; 7326 } 7327 7328 LLVM_DEBUG(printPlans(dbgs())); 7329 return {{UserVF, 0, 0}}; 7330 } else 7331 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7332 "InvalidCost", ORE, OrigLoop); 7333 } 7334 7335 // Populate the set of Vectorization Factor Candidates. 7336 ElementCountSet VFCandidates; 7337 for (auto VF = ElementCount::getFixed(1); 7338 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7339 VFCandidates.insert(VF); 7340 for (auto VF = ElementCount::getScalable(1); 7341 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7342 VFCandidates.insert(VF); 7343 7344 CM.collectInLoopReductions(); 7345 for (const auto &VF : VFCandidates) { 7346 // Collect Uniform and Scalar instructions after vectorization with VF. 7347 CM.collectUniformsAndScalars(VF); 7348 7349 // Collect the instructions (and their associated costs) that will be more 7350 // profitable to scalarize. 7351 if (VF.isVector()) 7352 CM.collectInstsToScalarize(VF); 7353 } 7354 7355 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7356 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7357 7358 LLVM_DEBUG(printPlans(dbgs())); 7359 if (!MaxFactors.hasVector()) 7360 return VectorizationFactor::Disabled(); 7361 7362 // Select the optimal vectorization factor. 
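// For a concrete (illustrative) picture of what is being selected from: with
// MaxFactors.FixedVF = 16 and MaxFactors.ScalableVF = vscale x 4, the
// candidate set built above is {1, 2, 4, 8, 16} plus
// {vscale x 1, vscale x 2, vscale x 4}, and for each vector candidate the
// uniform/scalar sets and the instructions worth scalarizing have already
// been collected.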
7363 VectorizationFactor VF = selectVectorizationFactor(VFCandidates); 7364 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7365 if (!hasPlanWithVF(VF.Width)) { 7366 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width 7367 << ".\n"); 7368 return std::nullopt; 7369 } 7370 return VF; 7371 } 7372 7373 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7374 assert(count_if(VPlans, 7375 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7376 1 && 7377 "Best VF has not a single VPlan."); 7378 7379 for (const VPlanPtr &Plan : VPlans) { 7380 if (Plan->hasVF(VF)) 7381 return *Plan.get(); 7382 } 7383 llvm_unreachable("No plan found!"); 7384 } 7385 7386 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7387 SmallVector<Metadata *, 4> MDs; 7388 // Reserve first location for self reference to the LoopID metadata node. 7389 MDs.push_back(nullptr); 7390 bool IsUnrollMetadata = false; 7391 MDNode *LoopID = L->getLoopID(); 7392 if (LoopID) { 7393 // First find existing loop unrolling disable metadata. 7394 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7395 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7396 if (MD) { 7397 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7398 IsUnrollMetadata = 7399 S && S->getString().starts_with("llvm.loop.unroll.disable"); 7400 } 7401 MDs.push_back(LoopID->getOperand(i)); 7402 } 7403 } 7404 7405 if (!IsUnrollMetadata) { 7406 // Add runtime unroll disable metadata. 7407 LLVMContext &Context = L->getHeader()->getContext(); 7408 SmallVector<Metadata *, 1> DisableOperands; 7409 DisableOperands.push_back( 7410 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7411 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7412 MDs.push_back(DisableNode); 7413 MDNode *NewLoopID = MDNode::get(Context, MDs); 7414 // Set operand 0 to refer to the loop id itself. 7415 NewLoopID->replaceOperandWith(0, NewLoopID); 7416 L->setLoopID(NewLoopID); 7417 } 7418 } 7419 7420 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is 7421 // create a merge phi node for it and add it to \p ReductionResumeValues. 7422 static void createAndCollectMergePhiForReduction( 7423 VPInstruction *RedResult, 7424 DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues, 7425 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) { 7426 if (!RedResult || 7427 RedResult->getOpcode() != VPInstruction::ComputeReductionResult) 7428 return; 7429 7430 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0)); 7431 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 7432 7433 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 7434 Value *FinalValue = 7435 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); 7436 auto *ResumePhi = 7437 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 7438 7439 // TODO: bc.merge.rdx should not be created here, instead it should be 7440 // modeled in VPlan. 7441 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); 7442 // Create a phi node that merges control-flow from the backedge-taken check 7443 // block and the middle block. 
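// For an integer add reduction this merge phi looks roughly like (value and
// block names are illustrative):
//   %bc.merge.rdx = phi i32 [ %rdx.result, %middle.block ],
//                           [ %rdx.start, <bypass block> ], ...
// so the scalar remainder loop resumes from the vector loop's partial result
// when the vector body executed, and from the original start value when a
// bypass branch skipped it.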
7444 auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", 7445 LoopScalarPreHeader->getTerminator()); 7446 7447 // If we are fixing reductions in the epilogue loop then we should already 7448 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 7449 // we carry over the incoming values correctly. 7450 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 7451 if (Incoming == LoopMiddleBlock) 7452 BCBlockPhi->addIncoming(FinalValue, Incoming); 7453 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) 7454 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 7455 Incoming); 7456 else 7457 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 7458 } 7459 7460 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 7461 // TODO: This fixup should instead be modeled in VPlan. 7462 // Fix the scalar loop reduction variable with the incoming reduction sum 7463 // from the vector body and from the backedge value. 7464 int IncomingEdgeBlockIdx = 7465 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 7466 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 7467 // Pick the other block. 7468 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 7469 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 7470 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 7471 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 7472 7473 ReductionResumeValues[&RdxDesc] = BCBlockPhi; 7474 } 7475 7476 std::pair<DenseMap<const SCEV *, Value *>, 7477 DenseMap<const RecurrenceDescriptor *, Value *>> 7478 LoopVectorizationPlanner::executePlan( 7479 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7480 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, 7481 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7482 assert(BestVPlan.hasVF(BestVF) && 7483 "Trying to execute plan with unsupported VF"); 7484 assert(BestVPlan.hasUF(BestUF) && 7485 "Trying to execute plan with unsupported UF"); 7486 assert( 7487 (IsEpilogueVectorization || !ExpandedSCEVs) && 7488 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7489 7490 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7491 << '\n'); 7492 7493 if (!IsEpilogueVectorization) 7494 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7495 7496 // Perform the actual loop transformation. 7497 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, 7498 OrigLoop->getHeader()->getContext()); 7499 7500 // 0. Generate SCEV-dependent code into the preheader, including TripCount, 7501 // before making any changes to the CFG. 7502 if (!BestVPlan.getPreheader()->empty()) { 7503 State.CFG.PrevBB = OrigLoop->getLoopPreheader(); 7504 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); 7505 BestVPlan.getPreheader()->execute(&State); 7506 } 7507 if (!ILV.getTripCount()) 7508 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); 7509 else 7510 assert(IsEpilogueVectorization && "should only re-use the existing trip " 7511 "count during epilogue vectorization"); 7512 7513 // 1. Set up the skeleton for vectorization, including vector pre-header and 7514 // middle block. The vector loop is created during VPlan execution. 7515 Value *CanonicalIVStartValue; 7516 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7517 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? 
*ExpandedSCEVs 7518 : State.ExpandedSCEVs); 7519 7520 // Only use noalias metadata when using memory checks guaranteeing no overlap 7521 // across all iterations. 7522 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7523 std::unique_ptr<LoopVersioning> LVer = nullptr; 7524 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7525 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7526 7527 // We currently don't use LoopVersioning for the actual loop cloning but we 7528 // still use it to add the noalias metadata. 7529 // TODO: Find a better way to re-use LoopVersioning functionality to add 7530 // metadata. 7531 LVer = std::make_unique<LoopVersioning>( 7532 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7533 PSE.getSE()); 7534 State.LVer = &*LVer; 7535 State.LVer->prepareNoAliasMetadata(); 7536 } 7537 7538 ILV.collectPoisonGeneratingRecipes(State); 7539 7540 ILV.printDebugTracesAtStart(); 7541 7542 //===------------------------------------------------===// 7543 // 7544 // Notice: any optimization or new instruction that go 7545 // into the code below should also be implemented in 7546 // the cost-model. 7547 // 7548 //===------------------------------------------------===// 7549 7550 // 2. Copy and widen instructions from the old loop into the new loop. 7551 BestVPlan.prepareToExecute(ILV.getTripCount(), 7552 ILV.getOrCreateVectorTripCount(nullptr), 7553 CanonicalIVStartValue, State); 7554 7555 BestVPlan.execute(&State); 7556 7557 // 2.5 Collect reduction resume values. 7558 DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues; 7559 auto *ExitVPBB = 7560 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); 7561 for (VPRecipeBase &R : *ExitVPBB) { 7562 createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R), 7563 ReductionResumeValues, State, OrigLoop, 7564 State.CFG.VPBB2IRBB[ExitVPBB]); 7565 } 7566 7567 // 2.6. Maintain Loop Hints 7568 // Keep all loop hints from the original loop on the vector loop (we'll 7569 // replace the vectorizer-specific hints below). 7570 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7571 7572 std::optional<MDNode *> VectorizedLoopID = 7573 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7574 LLVMLoopVectorizeFollowupVectorized}); 7575 7576 VPBasicBlock *HeaderVPBB = 7577 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7578 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7579 if (VectorizedLoopID) 7580 L->setLoopID(*VectorizedLoopID); 7581 else { 7582 // Keep all loop hints from the original loop on the vector loop (we'll 7583 // replace the vectorizer-specific hints below). 7584 if (MDNode *LID = OrigLoop->getLoopID()) 7585 L->setLoopID(LID); 7586 7587 LoopVectorizeHints Hints(L, true, *ORE); 7588 Hints.setAlreadyVectorized(); 7589 } 7590 TargetTransformInfo::UnrollingPreferences UP; 7591 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7592 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) 7593 AddRuntimeUnrollDisableMetaData(L); 7594 7595 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7596 // predication, updating analyses. 
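// (For reference, the runtime-unroll-disable metadata attached above ends up
// printed roughly as:
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// i.e. a self-referencing loop ID whose operand tells the unroller to skip
// runtime unrolling of the already-vectorized loop.)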
7597 ILV.fixVectorizedLoop(State, BestVPlan); 7598 7599 ILV.printDebugTracesAtEnd(); 7600 7601 return {State.ExpandedSCEVs, ReductionResumeValues}; 7602 } 7603 7604 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7605 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7606 for (const auto &Plan : VPlans) 7607 if (PrintVPlansInDotFormat) 7608 Plan->printDOT(O); 7609 else 7610 Plan->print(O); 7611 } 7612 #endif 7613 7614 //===--------------------------------------------------------------------===// 7615 // EpilogueVectorizerMainLoop 7616 //===--------------------------------------------------------------------===// 7617 7618 /// This function is partially responsible for generating the control flow 7619 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7620 std::pair<BasicBlock *, Value *> 7621 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7622 const SCEV2ValueTy &ExpandedSCEVs) { 7623 createVectorLoopSkeleton(""); 7624 7625 // Generate the code to check the minimum iteration count of the vector 7626 // epilogue (see below). 7627 EPI.EpilogueIterationCountCheck = 7628 emitIterationCountCheck(LoopScalarPreHeader, true); 7629 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7630 7631 // Generate the code to check any assumptions that we've made for SCEV 7632 // expressions. 7633 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7634 7635 // Generate the code that checks at runtime if arrays overlap. We put the 7636 // checks into a separate block to make the more common case of few elements 7637 // faster. 7638 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7639 7640 // Generate the iteration count check for the main loop, *after* the check 7641 // for the epilogue loop, so that the path-length is shorter for the case 7642 // that goes directly through the vector epilogue. The longer-path length for 7643 // the main loop is compensated for, by the gain from vectorizing the larger 7644 // trip count. Note: the branch will get updated later on when we vectorize 7645 // the epilogue. 7646 EPI.MainLoopIterationCountCheck = 7647 emitIterationCountCheck(LoopScalarPreHeader, false); 7648 7649 // Generate the induction variable. 7650 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7651 7652 // Skip induction resume value creation here because they will be created in 7653 // the second pass for the scalar loop. The induction resume values for the 7654 // inductions in the epilogue loop are created before executing the plan for 7655 // the epilogue loop. 7656 7657 return {completeLoopSkeleton(), nullptr}; 7658 } 7659 7660 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7661 LLVM_DEBUG({ 7662 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7663 << "Main Loop VF:" << EPI.MainLoopVF 7664 << ", Main Loop UF:" << EPI.MainLoopUF 7665 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7666 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7667 }); 7668 } 7669 7670 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7671 DEBUG_WITH_TYPE(VerboseDebug, { 7672 dbgs() << "intermediate fn:\n" 7673 << *OrigLoop->getHeader()->getParent() << "\n"; 7674 }); 7675 } 7676 7677 BasicBlock * 7678 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7679 bool ForEpilogue) { 7680 assert(Bypass && "Expected valid bypass basic block."); 7681 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7682 unsigned UFactor = ForEpilogue ? 
EPI.EpilogueUF : UF; 7683 Value *Count = getTripCount(); 7684 // Reuse existing vector loop preheader for TC checks. 7685 // Note that new preheader block is generated for vector loop. 7686 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7687 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7688 7689 // Generate code to check if the loop's trip count is less than VF * UF of the 7690 // main vector loop. 7691 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7692 : VF.isVector()) 7693 ? ICmpInst::ICMP_ULE 7694 : ICmpInst::ICMP_ULT; 7695 7696 Value *CheckMinIters = Builder.CreateICmp( 7697 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7698 "min.iters.check"); 7699 7700 if (!ForEpilogue) 7701 TCCheckBlock->setName("vector.main.loop.iter.check"); 7702 7703 // Create new preheader for vector loop. 7704 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7705 DT, LI, nullptr, "vector.ph"); 7706 7707 if (ForEpilogue) { 7708 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7709 DT->getNode(Bypass)->getIDom()) && 7710 "TC check is expected to dominate Bypass"); 7711 7712 // Update dominator for Bypass & LoopExit. 7713 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7714 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7715 // For loops with multiple exits, there's no edge from the middle block 7716 // to exit blocks (as the epilogue must run) and thus no need to update 7717 // the immediate dominator of the exit blocks. 7718 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7719 7720 LoopBypassBlocks.push_back(TCCheckBlock); 7721 7722 // Save the trip count so we don't have to regenerate it in the 7723 // vec.epilog.iter.check. This is safe to do because the trip count 7724 // generated here dominates the vector epilog iter check. 7725 EPI.TripCount = Count; 7726 } 7727 7728 BranchInst &BI = 7729 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7730 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 7731 setBranchWeights(BI, MinItersBypassWeights); 7732 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 7733 7734 return TCCheckBlock; 7735 } 7736 7737 //===--------------------------------------------------------------------===// 7738 // EpilogueVectorizerEpilogueLoop 7739 //===--------------------------------------------------------------------===// 7740 7741 /// This function is partially responsible for generating the control flow 7742 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7743 std::pair<BasicBlock *, Value *> 7744 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( 7745 const SCEV2ValueTy &ExpandedSCEVs) { 7746 createVectorLoopSkeleton("vec.epilog."); 7747 7748 // Now, compare the remaining count and if there aren't enough iterations to 7749 // execute the vectorized epilogue skip to the scalar part. 7750 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7751 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7752 LoopVectorPreHeader = 7753 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7754 LI, nullptr, "vec.epilog.ph"); 7755 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7756 VecEpilogueIterationCountCheck); 7757 7758 // Adjust the control flow taking the state info from the main loop 7759 // vectorization into account. 
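// The overall shape being stitched together here is, roughly (see the diagram
// in the Vectorizers documentation linked above): iter.check decides between
// the scalar loop and any vector code at all; vector.main.loop.iter.check
// decides whether the main vector loop runs or control goes straight to the
// vector epilogue; after the main vector loop, vec.epilog.iter.check decides
// whether the remaining iterations run through the epilogue vector loop
// (entered via vec.epilog.ph) or fall back to the scalar loop.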
7760 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7761 "expected this to be saved from the previous pass."); 7762 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7763 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7764 7765 DT->changeImmediateDominator(LoopVectorPreHeader, 7766 EPI.MainLoopIterationCountCheck); 7767 7768 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7769 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7770 7771 if (EPI.SCEVSafetyCheck) 7772 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7773 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7774 if (EPI.MemSafetyCheck) 7775 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7776 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7777 7778 DT->changeImmediateDominator( 7779 VecEpilogueIterationCountCheck, 7780 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7781 7782 DT->changeImmediateDominator(LoopScalarPreHeader, 7783 EPI.EpilogueIterationCountCheck); 7784 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7785 // If there is an epilogue which must run, there's no edge from the 7786 // middle block to exit blocks and thus no need to update the immediate 7787 // dominator of the exit blocks. 7788 DT->changeImmediateDominator(LoopExitBlock, 7789 EPI.EpilogueIterationCountCheck); 7790 7791 // Keep track of bypass blocks, as they feed start values to the induction and 7792 // reduction phis in the scalar loop preheader. 7793 if (EPI.SCEVSafetyCheck) 7794 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7795 if (EPI.MemSafetyCheck) 7796 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7797 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7798 7799 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 7800 // reductions which merge control-flow from the latch block and the middle 7801 // block. Update the incoming values here and move the Phi into the preheader. 7802 SmallVector<PHINode *, 4> PhisInBlock; 7803 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7804 PhisInBlock.push_back(&Phi); 7805 7806 for (PHINode *Phi : PhisInBlock) { 7807 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7808 Phi->replaceIncomingBlockWith( 7809 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7810 VecEpilogueIterationCountCheck); 7811 7812 // If the phi doesn't have an incoming value from the 7813 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 7814 // value and also those from other check blocks. This is needed for 7815 // reduction phis only. 
7816 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { 7817 return EPI.EpilogueIterationCountCheck == IncB; 7818 })) 7819 continue; 7820 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7821 if (EPI.SCEVSafetyCheck) 7822 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7823 if (EPI.MemSafetyCheck) 7824 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7825 } 7826 7827 // Generate a resume induction for the vector epilogue and put it in the 7828 // vector epilogue preheader. 7829 Type *IdxTy = Legal->getWidestInductionType(); 7830 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val"); 7831 EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt()); 7832 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7833 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7834 EPI.MainLoopIterationCountCheck); 7835 7836 // Generate induction resume values. These variables save the new starting 7837 // indexes for the scalar loop. They are used to test if there are any tail 7838 // iterations left once the vector loop has completed. 7839 // Note that when the vectorized epilogue is skipped due to iteration count 7840 // check, then the resume value for the induction variable comes from 7841 // the trip count of the main vector loop, hence passing the AdditionalBypass 7842 // argument. 7843 createInductionResumeValues(ExpandedSCEVs, 7844 {VecEpilogueIterationCountCheck, 7845 EPI.VectorTripCount} /* AdditionalBypass */); 7846 7847 return {completeLoopSkeleton(), EPResumeVal}; 7848 } 7849 7850 BasicBlock * 7851 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7852 BasicBlock *Bypass, BasicBlock *Insert) { 7853 7854 assert(EPI.TripCount && 7855 "Expected trip count to have been saved in the first pass."); 7856 assert( 7857 (!isa<Instruction>(EPI.TripCount) || 7858 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7859 "saved trip count does not dominate insertion point."); 7860 Value *TC = EPI.TripCount; 7861 IRBuilder<> Builder(Insert->getTerminator()); 7862 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7863 7864 // Generate code to check if the loop's trip count is less than VF * UF of the 7865 // vector epilogue loop. 7866 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) 7867 ?
ICmpInst::ICMP_ULE 7868 : ICmpInst::ICMP_ULT; 7869 7870 Value *CheckMinIters = 7871 Builder.CreateICmp(P, Count, 7872 createStepForVF(Builder, Count->getType(), 7873 EPI.EpilogueVF, EPI.EpilogueUF), 7874 "min.epilog.iters.check"); 7875 7876 BranchInst &BI = 7877 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7878 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 7879 unsigned MainLoopStep = UF * VF.getKnownMinValue(); 7880 unsigned EpilogueLoopStep = 7881 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); 7882 // We assume the remaining `Count` is equally distributed in 7883 // [0, MainLoopStep) 7884 // So the probability for `Count < EpilogueLoopStep` should be 7885 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep 7886 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); 7887 const uint32_t Weights[] = {EstimatedSkipCount, 7888 MainLoopStep - EstimatedSkipCount}; 7889 setBranchWeights(BI, Weights); 7890 } 7891 ReplaceInstWithInst(Insert->getTerminator(), &BI); 7892 7893 LoopBypassBlocks.push_back(Insert); 7894 return Insert; 7895 } 7896 7897 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7898 LLVM_DEBUG({ 7899 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7900 << "Epilogue Loop VF:" << EPI.EpilogueVF 7901 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7902 }); 7903 } 7904 7905 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7906 DEBUG_WITH_TYPE(VerboseDebug, { 7907 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7908 }); 7909 } 7910 7911 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7912 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7913 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7914 bool PredicateAtRangeStart = Predicate(Range.Start); 7915 7916 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) 7917 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7918 Range.End = TmpVF; 7919 break; 7920 } 7921 7922 return PredicateAtRangeStart; 7923 } 7924 7925 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7926 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7927 /// of VF's starting at a given VF and extending it as much as possible. Each 7928 /// vectorization decision can potentially shorten this sub-range during 7929 /// buildVPlan(). 7930 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7931 ElementCount MaxVF) { 7932 auto MaxVFTimes2 = MaxVF * 2; 7933 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 7934 VFRange SubRange = {VF, MaxVFTimes2}; 7935 VPlans.push_back(buildVPlan(SubRange)); 7936 VF = SubRange.End; 7937 } 7938 } 7939 7940 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7941 VPlan &Plan) { 7942 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7943 7944 // Look for cached value. 7945 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7946 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7947 if (ECEntryIt != EdgeMaskCache.end()) 7948 return ECEntryIt->second; 7949 7950 VPValue *SrcMask = getBlockInMask(Src); 7951 7952 // The terminator has to be a branch inst! 
7953 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7954 assert(BI && "Unexpected terminator found"); 7955 7956 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7957 return EdgeMaskCache[Edge] = SrcMask; 7958 7959 // If source is an exiting block, we know the exit edge is dynamically dead 7960 // in the vector loop, and thus we don't need to restrict the mask. Avoid 7961 // adding uses of an otherwise potentially dead instruction. 7962 if (OrigLoop->isLoopExiting(Src)) 7963 return EdgeMaskCache[Edge] = SrcMask; 7964 7965 VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); 7966 assert(EdgeMask && "No Edge Mask found for condition"); 7967 7968 if (BI->getSuccessor(0) != Dst) 7969 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 7970 7971 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 7972 // The condition is 'SrcMask && EdgeMask', which is equivalent to 7973 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 7974 // The select version does not introduce new UB if SrcMask is false and 7975 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 7976 VPValue *False = Plan.getVPValueOrAddLiveIn( 7977 ConstantInt::getFalse(BI->getCondition()->getType())); 7978 EdgeMask = 7979 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 7980 } 7981 7982 return EdgeMaskCache[Edge] = EdgeMask; 7983 } 7984 7985 void VPRecipeBuilder::createHeaderMask(VPlan &Plan) { 7986 BasicBlock *Header = OrigLoop->getHeader(); 7987 7988 // When not folding the tail, use nullptr to model all-true mask. 7989 if (!CM.foldTailByMasking()) { 7990 BlockMaskCache[Header] = nullptr; 7991 return; 7992 } 7993 7994 // Introduce the early-exit compare IV <= BTC to form header block mask. 7995 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 7996 // constructing the desired canonical IV in the header block as its first 7997 // non-phi instructions. 7998 7999 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); 8000 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8001 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); 8002 HeaderVPBB->insert(IV, NewInsertionPoint); 8003 8004 VPBuilder::InsertPointGuard Guard(Builder); 8005 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8006 VPValue *BlockMask = nullptr; 8007 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 8008 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); 8009 BlockMaskCache[Header] = BlockMask; 8010 } 8011 8012 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { 8013 // Return the cached value. 8014 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); 8015 assert(BCEntryIt != BlockMaskCache.end() && 8016 "Trying to access mask for block without one."); 8017 return BCEntryIt->second; 8018 } 8019 8020 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { 8021 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8022 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); 8023 assert(OrigLoop->getHeader() != BB && 8024 "Loop header must have cached block mask"); 8025 8026 // All-one mask is modelled as no-mask following the convention for masked 8027 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8028 VPValue *BlockMask = nullptr; 8029 // This is the block mask. We OR all incoming edges. 
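// For example (illustrative): if BB joins an if.then/if.else diamond whose
// header has mask HM and branch condition C, the two edge masks computed
// above are
//   select(HM, C, false)  and  select(HM, !C, false)
// and OR-ing them makes BB's mask equivalent to HM itself. If any predecessor
// contributes a null (all-one) edge mask, BB executes whenever the loop body
// does, so a null mask is cached for it as well.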
8030 for (auto *Predecessor : predecessors(BB)) { 8031 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8032 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. 8033 BlockMaskCache[BB] = EdgeMask; return; 8034 } 8035 8036 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8037 BlockMask = EdgeMask; 8038 continue; 8039 } 8040 8041 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8042 } 8043 8044 BlockMaskCache[BB] = BlockMask; 8045 } 8046 8047 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8048 ArrayRef<VPValue *> Operands, 8049 VFRange &Range, 8050 VPlanPtr &Plan) { 8051 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8052 "Must be called with either a load or store"); 8053 8054 auto willWiden = [&](ElementCount VF) -> bool { 8055 LoopVectorizationCostModel::InstWidening Decision = 8056 CM.getWideningDecision(I, VF); 8057 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8058 "CM decision should be taken at this point."); 8059 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8060 return true; 8061 if (CM.isScalarAfterVectorization(I, VF) || 8062 CM.isProfitableToScalarize(I, VF)) 8063 return false; 8064 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8065 }; 8066 8067 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8068 return nullptr; 8069 8070 VPValue *Mask = nullptr; 8071 if (Legal->isMaskRequired(I)) 8072 Mask = getBlockInMask(I->getParent()); 8073 8074 // Determine if the pointer operand of the access is either consecutive or 8075 // reverse consecutive. 8076 LoopVectorizationCostModel::InstWidening Decision = 8077 CM.getWideningDecision(I, Range.Start); 8078 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8079 bool Consecutive = 8080 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8081 8082 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; 8083 if (Consecutive) { 8084 auto *GEP = dyn_cast<GetElementPtrInst>( 8085 Ptr->getUnderlyingValue()->stripPointerCasts()); 8086 auto *VectorPtr = new VPVectorPointerRecipe( 8087 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false, 8088 I->getDebugLoc()); 8089 Builder.getInsertBlock()->appendRecipe(VectorPtr); 8090 Ptr = VectorPtr; 8091 } 8092 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8093 return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive, 8094 Reverse); 8095 8096 StoreInst *Store = cast<StoreInst>(I); 8097 return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask, 8098 Consecutive, Reverse); 8099 } 8100 8101 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also 8102 /// insert a recipe to expand the step for the induction recipe.
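// As a rough illustration of what the recipe expands to: for an i32 induction
// starting at 0 with step 1 and VF = 4 (UF = 1), the vector IV is
// <0, 1, 2, 3> on the first vector iteration and is advanced by the splat
// <4, 4, 4, 4> (VF * step) on each subsequent one; when the step is not a
// constant it is first materialized through the VPValue created from its
// SCEV below.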
8103 static VPWidenIntOrFpInductionRecipe * 8104 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, 8105 VPValue *Start, const InductionDescriptor &IndDesc, 8106 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, 8107 VFRange &Range) { 8108 assert(IndDesc.getStartValue() == 8109 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8110 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8111 "step must be loop invariant"); 8112 8113 VPValue *Step = 8114 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8115 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8116 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); 8117 } 8118 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8119 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); 8120 } 8121 8122 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8123 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8124 8125 // Check if this is an integer or fp induction. If so, build the recipe that 8126 // produces its scalar and vector values. 8127 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8128 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, 8129 *PSE.getSE(), *OrigLoop, Range); 8130 8131 // Check if this is pointer induction. If so, build the recipe for it. 8132 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8133 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8134 *PSE.getSE()); 8135 return new VPWidenPointerInductionRecipe( 8136 Phi, Operands[0], Step, *II, 8137 LoopVectorizationPlanner::getDecisionAndClampRange( 8138 [&](ElementCount VF) { 8139 return CM.isScalarAfterVectorization(Phi, VF); 8140 }, 8141 Range)); 8142 } 8143 return nullptr; 8144 } 8145 8146 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8147 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8148 // Optimize the special case where the source is a constant integer 8149 // induction variable. Notice that we can only optimize the 'trunc' case 8150 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8151 // (c) other casts depend on pointer size. 8152 8153 // Determine whether \p K is a truncation based on an induction variable that 8154 // can be optimized. 8155 auto isOptimizableIVTruncate = 8156 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8157 return [=](ElementCount VF) -> bool { 8158 return CM.isOptimizableIVTruncate(K, VF); 8159 }; 8160 }; 8161 8162 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8163 isOptimizableIVTruncate(I), Range)) { 8164 8165 auto *Phi = cast<PHINode>(I->getOperand(0)); 8166 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8167 VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue()); 8168 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), 8169 *OrigLoop, Range); 8170 } 8171 return nullptr; 8172 } 8173 8174 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8175 ArrayRef<VPValue *> Operands, 8176 VPlanPtr &Plan) { 8177 // If all incoming values are equal, the incoming VPValue can be used directly 8178 // instead of creating a new VPBlendRecipe. 8179 if (llvm::all_equal(Operands)) 8180 return Operands[0]; 8181 8182 unsigned NumIncoming = Phi->getNumIncomingValues(); 8183 // For in-loop reductions, we do not need to create an additional select. 
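// For example (names illustrative): a phi merging %a from if.then and %b from
// if.else is emitted as a single select on if.then's edge mask,
//   %blend = select <edge mask of if.then -> bb>, %a, %b
// i.e. N - 1 selects for N incoming values. When one incoming value is an
// in-loop reduction phi, the other incoming value is used directly instead,
// as done below.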
8184 VPValue *InLoopVal = nullptr; 8185 for (unsigned In = 0; In < NumIncoming; In++) { 8186 PHINode *PhiOp = 8187 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8188 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8189 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8190 InLoopVal = Operands[In]; 8191 } 8192 } 8193 8194 assert((!InLoopVal || NumIncoming == 2) && 8195 "Found an in-loop reduction for PHI with unexpected number of " 8196 "incoming values"); 8197 if (InLoopVal) 8198 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8199 8200 // We know that all PHIs in non-header blocks are converted into selects, so 8201 // we don't have to worry about the insertion order and we can just use the 8202 // builder. At this point we generate the predication tree. There may be 8203 // duplications since this is a simple recursive scan, but future 8204 // optimizations will clean it up. 8205 SmallVector<VPValue *, 2> OperandsWithMask; 8206 8207 for (unsigned In = 0; In < NumIncoming; In++) { 8208 VPValue *EdgeMask = 8209 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); 8210 assert((EdgeMask || NumIncoming == 1) && 8211 "Multiple predecessors with one having a full mask"); 8212 OperandsWithMask.push_back(Operands[In]); 8213 if (EdgeMask) 8214 OperandsWithMask.push_back(EdgeMask); 8215 } 8216 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8217 } 8218 8219 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8220 ArrayRef<VPValue *> Operands, 8221 VFRange &Range, 8222 VPlanPtr &Plan) { 8223 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8224 [this, CI](ElementCount VF) { 8225 return CM.isScalarWithPredication(CI, VF); 8226 }, 8227 Range); 8228 8229 if (IsPredicated) 8230 return nullptr; 8231 8232 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8233 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8234 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8235 ID == Intrinsic::pseudoprobe || 8236 ID == Intrinsic::experimental_noalias_scope_decl)) 8237 return nullptr; 8238 8239 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); 8240 8241 // Is it beneficial to perform intrinsic call compared to lib call? 8242 bool ShouldUseVectorIntrinsic = 8243 ID && LoopVectorizationPlanner::getDecisionAndClampRange( 8244 [&](ElementCount VF) -> bool { 8245 return CM.getCallWideningDecision(CI, VF).Kind == 8246 LoopVectorizationCostModel::CM_IntrinsicCall; 8247 }, 8248 Range); 8249 if (ShouldUseVectorIntrinsic) 8250 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); 8251 8252 Function *Variant = nullptr; 8253 std::optional<unsigned> MaskPos; 8254 // Is better to call a vectorized version of the function than to to scalarize 8255 // the call? 8256 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( 8257 [&](ElementCount VF) -> bool { 8258 // The following case may be scalarized depending on the VF. 8259 // The flag shows whether we can use a usual Call for vectorized 8260 // version of the instruction. 8261 8262 // If we've found a variant at a previous VF, then stop looking. A 8263 // vectorized variant of a function expects input in a certain shape 8264 // -- basically the number of input registers, the number of lanes 8265 // per register, and whether there's a mask required. 
8266 // We store a pointer to the variant in the VPWidenCallRecipe, so 8267 // once we have an appropriate variant it's only valid for that VF. 8268 // This will force a different vplan to be generated for each VF that 8269 // finds a valid variant. 8270 if (Variant) 8271 return false; 8272 LoopVectorizationCostModel::CallWideningDecision Decision = 8273 CM.getCallWideningDecision(CI, VF); 8274 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) { 8275 Variant = Decision.Variant; 8276 MaskPos = Decision.MaskPos; 8277 return true; 8278 } 8279 8280 return false; 8281 }, 8282 Range); 8283 if (ShouldUseVectorCall) { 8284 if (MaskPos.has_value()) { 8285 // We have 2 cases that would require a mask: 8286 // 1) The block needs to be predicated, either due to a conditional 8287 // in the scalar loop or use of an active lane mask with 8288 // tail-folding, and we use the appropriate mask for the block. 8289 // 2) No mask is required for the block, but the only available 8290 // vector variant at this VF requires a mask, so we synthesize an 8291 // all-true mask. 8292 VPValue *Mask = nullptr; 8293 if (Legal->isMaskRequired(CI)) 8294 Mask = getBlockInMask(CI->getParent()); 8295 else 8296 Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( 8297 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); 8298 8299 Ops.insert(Ops.begin() + *MaskPos, Mask); 8300 } 8301 8302 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), 8303 Intrinsic::not_intrinsic, Variant); 8304 } 8305 8306 return nullptr; 8307 } 8308 8309 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8310 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8311 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8312 // Instruction should be widened, unless it is scalar after vectorization, 8313 // scalarization is profitable or it is predicated. 8314 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8315 return CM.isScalarAfterVectorization(I, VF) || 8316 CM.isProfitableToScalarize(I, VF) || 8317 CM.isScalarWithPredication(I, VF); 8318 }; 8319 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8320 Range); 8321 } 8322 8323 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, 8324 ArrayRef<VPValue *> Operands, 8325 VPBasicBlock *VPBB, VPlanPtr &Plan) { 8326 switch (I->getOpcode()) { 8327 default: 8328 return nullptr; 8329 case Instruction::SDiv: 8330 case Instruction::UDiv: 8331 case Instruction::SRem: 8332 case Instruction::URem: { 8333 // If not provably safe, use a select to form a safe divisor before widening the 8334 // div/rem operation itself. Otherwise fall through to general handling below. 
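// Sketch of the safe-divisor form (names illustrative): for a predicated
//   %q = udiv i32 %x, %y
// the widened code becomes
//   %safe.y = select <block mask>, %y.vec, <splat of 1>
//   %q.vec  = udiv <VF x i32> %x.vec, %safe.y
// so masked-off lanes divide by 1, which cannot trap, and their results are
// never consumed.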
8335 if (CM.isPredicatedInst(I)) { 8336 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); 8337 VPValue *Mask = getBlockInMask(I->getParent()); 8338 VPValue *One = Plan->getVPValueOrAddLiveIn( 8339 ConstantInt::get(I->getType(), 1u, false)); 8340 auto *SafeRHS = 8341 new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, 8342 I->getDebugLoc()); 8343 VPBB->appendRecipe(SafeRHS); 8344 Ops[1] = SafeRHS; 8345 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); 8346 } 8347 [[fallthrough]]; 8348 } 8349 case Instruction::Add: 8350 case Instruction::And: 8351 case Instruction::AShr: 8352 case Instruction::FAdd: 8353 case Instruction::FCmp: 8354 case Instruction::FDiv: 8355 case Instruction::FMul: 8356 case Instruction::FNeg: 8357 case Instruction::FRem: 8358 case Instruction::FSub: 8359 case Instruction::ICmp: 8360 case Instruction::LShr: 8361 case Instruction::Mul: 8362 case Instruction::Or: 8363 case Instruction::Select: 8364 case Instruction::Shl: 8365 case Instruction::Sub: 8366 case Instruction::Xor: 8367 case Instruction::Freeze: 8368 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8369 }; 8370 } 8371 8372 void VPRecipeBuilder::fixHeaderPhis() { 8373 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8374 for (VPHeaderPHIRecipe *R : PhisToFix) { 8375 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8376 VPRecipeBase *IncR = 8377 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8378 R->addOperand(IncR->getVPSingleValue()); 8379 } 8380 } 8381 8382 VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, 8383 VFRange &Range, 8384 VPlan &Plan) { 8385 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8386 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8387 Range); 8388 8389 bool IsPredicated = CM.isPredicatedInst(I); 8390 8391 // Even if the instruction is not marked as uniform, there are certain 8392 // intrinsic calls that can be effectively treated as such, so we check for 8393 // them here. Conservatively, we only do this for scalable vectors, since 8394 // for fixed-width VFs we can always fall back on full scalarization. 8395 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8396 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8397 case Intrinsic::assume: 8398 case Intrinsic::lifetime_start: 8399 case Intrinsic::lifetime_end: 8400 // For scalable vectors if one of the operands is variant then we still 8401 // want to mark as uniform, which will generate one instruction for just 8402 // the first lane of the vector. We can't scalarize the call in the same 8403 // way as for fixed-width vectors because we don't know how many lanes 8404 // there are. 8405 // 8406 // The reasons for doing it this way for scalable vectors are: 8407 // 1. For the assume intrinsic generating the instruction for the first 8408 // lane is still be better than not generating any at all. For 8409 // example, the input may be a splat across all lanes. 8410 // 2. For the lifetime start/end intrinsics the pointer operand only 8411 // does anything useful when the input comes from a stack object, 8412 // which suggests it should always be uniform. For non-stack objects 8413 // the effect is to poison the object, which still allows us to 8414 // remove the call. 
8415 IsUniform = true; 8416 break; 8417 default: 8418 break; 8419 } 8420 } 8421 VPValue *BlockInMask = nullptr; 8422 if (!IsPredicated) { 8423 // Finalize the recipe for Instr, first if it is not predicated. 8424 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8425 } else { 8426 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8427 // Instructions marked for predication are replicated and a mask operand is 8428 // added initially. Masked replicate recipes will later be placed under an 8429 // if-then construct to prevent side-effects. Generate recipes to compute 8430 // the block mask for this region. 8431 BlockInMask = getBlockInMask(I->getParent()); 8432 } 8433 8434 auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), 8435 IsUniform, BlockInMask); 8436 return toVPRecipeResult(Recipe); 8437 } 8438 8439 VPRecipeOrVPValueTy 8440 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8441 ArrayRef<VPValue *> Operands, 8442 VFRange &Range, VPBasicBlock *VPBB, 8443 VPlanPtr &Plan) { 8444 // First, check for specific widening recipes that deal with inductions, Phi 8445 // nodes, calls and memory operations. 8446 VPRecipeBase *Recipe; 8447 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8448 if (Phi->getParent() != OrigLoop->getHeader()) 8449 return tryToBlend(Phi, Operands, Plan); 8450 8451 // Always record recipes for header phis. Later first-order recurrence phis 8452 // can have earlier phis as incoming values. 8453 recordRecipeOf(Phi); 8454 8455 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8456 return toVPRecipeResult(Recipe); 8457 8458 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8459 assert((Legal->isReductionVariable(Phi) || 8460 Legal->isFixedOrderRecurrence(Phi)) && 8461 "can only widen reductions and fixed-order recurrences here"); 8462 VPValue *StartV = Operands[0]; 8463 if (Legal->isReductionVariable(Phi)) { 8464 const RecurrenceDescriptor &RdxDesc = 8465 Legal->getReductionVars().find(Phi)->second; 8466 assert(RdxDesc.getRecurrenceStartValue() == 8467 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8468 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8469 CM.isInLoopReduction(Phi), 8470 CM.useOrderedReductions(RdxDesc)); 8471 } else { 8472 // TODO: Currently fixed-order recurrences are modeled as chains of 8473 // first-order recurrences. If there are no users of the intermediate 8474 // recurrences in the chain, the fixed order recurrence should be modeled 8475 // directly, enabling more efficient codegen. 8476 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8477 } 8478 8479 // Record the incoming value from the backedge, so we can add the incoming 8480 // value from the backedge after all recipes have been created. 8481 auto *Inc = cast<Instruction>( 8482 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 8483 auto RecipeIter = Ingredient2Recipe.find(Inc); 8484 if (RecipeIter == Ingredient2Recipe.end()) 8485 recordRecipeOf(Inc); 8486 8487 PhisToFix.push_back(PhiRecipe); 8488 return toVPRecipeResult(PhiRecipe); 8489 } 8490 8491 if (isa<TruncInst>(Instr) && 8492 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8493 Range, *Plan))) 8494 return toVPRecipeResult(Recipe); 8495 8496 // All widen recipes below deal only with VF > 1. 
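// getDecisionAndClampRange evaluates the predicate at Range.Start and clamps
// Range.End down to the first VF whose decision differs, so all VFs left in
// Range share the same decision; the clamped-away VFs are handled by a
// separate VPlan built in a later iteration of buildVPlansWithVPRecipes.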
8497 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8498 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8499 return nullptr; 8500 8501 if (auto *CI = dyn_cast<CallInst>(Instr)) 8502 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); 8503 8504 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8505 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8506 8507 if (!shouldWiden(Instr, Range)) 8508 return nullptr; 8509 8510 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8511 return toVPRecipeResult(new VPWidenGEPRecipe( 8512 GEP, make_range(Operands.begin(), Operands.end()))); 8513 8514 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8515 return toVPRecipeResult(new VPWidenSelectRecipe( 8516 *SI, make_range(Operands.begin(), Operands.end()))); 8517 } 8518 8519 if (auto *CI = dyn_cast<CastInst>(Instr)) { 8520 return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0], 8521 CI->getType(), *CI)); 8522 } 8523 8524 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); 8525 } 8526 8527 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8528 ElementCount MaxVF) { 8529 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8530 8531 auto MaxVFTimes2 = MaxVF * 2; 8532 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8533 VFRange SubRange = {VF, MaxVFTimes2}; 8534 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { 8535 // Now optimize the initial VPlan. 8536 if (!Plan->hasVF(ElementCount::getFixed(1))) 8537 VPlanTransforms::truncateToMinimalBitwidths( 8538 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); 8539 VPlanTransforms::optimize(*Plan, *PSE.getSE()); 8540 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 8541 VPlans.push_back(std::move(Plan)); 8542 } 8543 VF = SubRange.End; 8544 } 8545 } 8546 8547 // Add the necessary canonical IV and branch recipes required to control the 8548 // loop. 8549 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, 8550 DebugLoc DL) { 8551 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8552 auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); 8553 8554 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8555 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8556 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8557 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8558 Header->insert(CanonicalIVPHI, Header->begin()); 8559 8560 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar 8561 // IV by VF * UF. 8562 auto *CanonicalIVIncrement = 8563 new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, 8564 {HasNUW, false}, DL, "index.next"); 8565 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8566 8567 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8568 EB->appendRecipe(CanonicalIVIncrement); 8569 8570 // Add the BranchOnCount VPInstruction to the latch. 8571 VPInstruction *BranchBack = 8572 new VPInstruction(VPInstruction::BranchOnCount, 8573 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8574 EB->appendRecipe(BranchBack); 8575 } 8576 8577 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8578 // original exit block. 
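// Loops without a unique exit block, whose exit block has more than one
// predecessor, or without a single exiting block are skipped; no live-outs
// are recorded for them here.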
8579 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, 8580 VPlan &Plan) { 8581 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8582 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8583 // Only handle single-exit loops with unique exit blocks for now. 8584 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8585 return; 8586 8587 // Introduce VPUsers modeling the exit values. 8588 for (PHINode &ExitPhi : ExitBB->phis()) { 8589 Value *IncomingValue = 8590 ExitPhi.getIncomingValueForBlock(ExitingBB); 8591 VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); 8592 Plan.addLiveOut(&ExitPhi, V); 8593 } 8594 } 8595 8596 VPlanPtr 8597 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { 8598 8599 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8600 8601 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8602 8603 // --------------------------------------------------------------------------- 8604 // Pre-construction: record ingredients whose recipes we'll need to further 8605 // process after constructing the initial VPlan. 8606 // --------------------------------------------------------------------------- 8607 8608 // For each interleave group which is relevant for this (possibly trimmed) 8609 // Range, add it to the set of groups to be later applied to the VPlan and add 8610 // placeholders for its members' Recipes which we'll be replacing with a 8611 // single VPInterleaveRecipe. 8612 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8613 auto applyIG = [IG, this](ElementCount VF) -> bool { 8614 bool Result = (VF.isVector() && // Query is illegal for VF == 1 8615 CM.getWideningDecision(IG->getInsertPos(), VF) == 8616 LoopVectorizationCostModel::CM_Interleave); 8617 // For scalable vectors, the only interleave factor currently supported 8618 // is 2 since we require the (de)interleave2 intrinsics instead of 8619 // shufflevectors. 8620 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && 8621 "Unsupported interleave factor for scalable vectors"); 8622 return Result; 8623 }; 8624 if (!getDecisionAndClampRange(applyIG, Range)) 8625 continue; 8626 InterleaveGroups.insert(IG); 8627 for (unsigned i = 0; i < IG->getFactor(); i++) 8628 if (Instruction *Member = IG->getMember(i)) 8629 RecipeBuilder.recordRecipeOf(Member); 8630 }; 8631 8632 // --------------------------------------------------------------------------- 8633 // Build initial VPlan: Scan the body of the loop in a topological order to 8634 // visit each basic block after having visited its predecessor basic blocks. 8635 // --------------------------------------------------------------------------- 8636 8637 // Create initial VPlan skeleton, having a basic block for the pre-header 8638 // which contains SCEV expansions that need to happen before the CFG is 8639 // modified; a basic block for the vector pre-header, followed by a region for 8640 // the vector loop, followed by the middle basic block. The skeleton vector 8641 // loop region contains a header and latch basic blocks. 
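// Roughly: preheader (SCEV expansion) -> vector preheader ->
// { vector.body ... vector.latch } -> middle block, where the braces denote
// the vector-loop VPRegionBlock configured just below.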
8642 VPlanPtr Plan = VPlan::createInitialVPlan( 8643 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8644 *PSE.getSE()); 8645 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8646 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8647 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8648 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB); 8649 Plan->getVectorLoopRegion()->setExiting(LatchVPBB); 8650 8651 // Don't use getDecisionAndClampRange here, because we don't know the UF 8652 // so this function is better to be conservative, rather than to split 8653 // it up into different VPlans. 8654 // TODO: Consider using getDecisionAndClampRange here to split up VPlans. 8655 bool IVUpdateMayOverflow = false; 8656 for (ElementCount VF : Range) 8657 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); 8658 8659 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8660 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); 8661 // When not folding the tail, we know that the induction increment will not 8662 // overflow. 8663 bool HasNUW = Style == TailFoldingStyle::None; 8664 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); 8665 8666 // Scan the body of the loop in a topological order to visit each basic block 8667 // after having visited its predecessor basic blocks. 8668 LoopBlocksDFS DFS(OrigLoop); 8669 DFS.perform(LI); 8670 8671 VPBasicBlock *VPBB = HeaderVPBB; 8672 bool NeedsMasks = CM.foldTailByMasking() || 8673 any_of(OrigLoop->blocks(), [this](BasicBlock *BB) { 8674 return Legal->blockNeedsPredication(BB); 8675 }); 8676 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8677 // Relevant instructions from basic block BB will be grouped into VPRecipe 8678 // ingredients and fill a new VPBasicBlock. 8679 if (VPBB != HeaderVPBB) 8680 VPBB->setName(BB->getName()); 8681 Builder.setInsertPoint(VPBB); 8682 8683 if (VPBB == HeaderVPBB) 8684 RecipeBuilder.createHeaderMask(*Plan); 8685 else if (NeedsMasks) 8686 RecipeBuilder.createBlockInMask(BB, *Plan); 8687 8688 // Introduce each ingredient into VPlan. 8689 // TODO: Model and preserve debug intrinsics in VPlan. 8690 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { 8691 Instruction *Instr = &I; 8692 SmallVector<VPValue *, 4> Operands; 8693 auto *Phi = dyn_cast<PHINode>(Instr); 8694 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8695 Operands.push_back(Plan->getVPValueOrAddLiveIn( 8696 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8697 } else { 8698 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8699 Operands = {OpRange.begin(), OpRange.end()}; 8700 } 8701 8702 // Invariant stores inside loop will be deleted and a single store 8703 // with the final reduction value will be added to the exit block 8704 StoreInst *SI; 8705 if ((SI = dyn_cast<StoreInst>(&I)) && 8706 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8707 continue; 8708 8709 auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8710 Instr, Operands, Range, VPBB, Plan); 8711 if (!RecipeOrValue) 8712 RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); 8713 // If Instr can be simplified to an existing VPValue, use it. 
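// RecipeOrValue is a pointer union: either an existing VPValue that Instr
// folds to, or a newly created recipe that still has to be registered and
// placed into the plan below.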
8714 if (isa<VPValue *>(RecipeOrValue)) { 8715 auto *VPV = cast<VPValue *>(RecipeOrValue); 8716 Plan->addVPValue(Instr, VPV); 8717 // If the re-used value is a recipe, register the recipe for the 8718 // instruction, in case the recipe for Instr needs to be recorded. 8719 if (VPRecipeBase *R = VPV->getDefiningRecipe()) 8720 RecipeBuilder.setRecipe(Instr, R); 8721 continue; 8722 } 8723 // Otherwise, add the new recipe. 8724 VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue); 8725 for (auto *Def : Recipe->definedValues()) { 8726 auto *UV = Def->getUnderlyingValue(); 8727 Plan->addVPValue(UV, Def); 8728 } 8729 8730 RecipeBuilder.setRecipe(Instr, Recipe); 8731 if (isa<VPHeaderPHIRecipe>(Recipe)) { 8732 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In 8733 // the following cases, VPHeaderPHIRecipes may be created after non-phi 8734 // recipes and need to be moved to the phi section of HeaderVPBB: 8735 // * tail-folding (non-phi recipes computing the header mask are 8736 // introduced earlier than regular header phi recipes, and should appear 8737 // after them) 8738 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 8739 8740 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || 8741 CM.foldTailByMasking() || isa<TruncInst>(Instr)) && 8742 "unexpected recipe needs moving"); 8743 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8744 } else 8745 VPBB->appendRecipe(Recipe); 8746 } 8747 8748 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8749 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8750 } 8751 8752 // After here, VPBB should not be used. 8753 VPBB = nullptr; 8754 8755 if (CM.requiresScalarEpilogue(Range)) { 8756 // No edge from the middle block to the unique exit block has been inserted 8757 // and there is nothing to fix from vector loop; phis should have incoming 8758 // from scalar loop only. 8759 } else 8760 addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan); 8761 8762 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8763 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8764 "entry block must be set to a VPRegionBlock having a non-empty entry " 8765 "VPBasicBlock"); 8766 RecipeBuilder.fixHeaderPhis(); 8767 8768 // --------------------------------------------------------------------------- 8769 // Transform initial VPlan: Apply previously taken decisions, in order, to 8770 // bring the VPlan to its final state. 8771 // --------------------------------------------------------------------------- 8772 8773 // Adjust the recipes for any inloop reductions. 8774 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start); 8775 8776 // Interleave memory: for each Interleave Group we marked earlier as relevant 8777 // for this VPlan, replace the Recipes widening its memory instructions with a 8778 // single VPInterleaveRecipe at its insertion point. 
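// For example (illustrative), for a factor-2 group loading A[2*i] and
// A[2*i+1], only the member at the insert position keeps a recipe, now a
// VPInterleaveRecipe doing one wide load plus de-interleaving; the other
// member's recipe is erased and its users are rewired to the new recipe.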
8779 for (const auto *IG : InterleaveGroups) { 8780 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8781 RecipeBuilder.getRecipe(IG->getInsertPos())); 8782 SmallVector<VPValue *, 4> StoredValues; 8783 for (unsigned i = 0; i < IG->getFactor(); ++i) 8784 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8785 auto *StoreR = 8786 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 8787 StoredValues.push_back(StoreR->getStoredValue()); 8788 } 8789 8790 bool NeedsMaskForGaps = 8791 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); 8792 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8793 Recipe->getMask(), NeedsMaskForGaps); 8794 VPIG->insertBefore(Recipe); 8795 unsigned J = 0; 8796 for (unsigned i = 0; i < IG->getFactor(); ++i) 8797 if (Instruction *Member = IG->getMember(i)) { 8798 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); 8799 if (!Member->getType()->isVoidTy()) { 8800 VPValue *OriginalV = MemberR->getVPSingleValue(); 8801 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8802 J++; 8803 } 8804 MemberR->eraseFromParent(); 8805 } 8806 } 8807 8808 for (ElementCount VF : Range) 8809 Plan->addVF(VF); 8810 Plan->setName("Initial VPlan"); 8811 8812 // Replace VPValues for known constant strides guaranteed by predicate scalar 8813 // evolution. 8814 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 8815 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 8816 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 8817 // Only handle constant strides for now. 8818 if (!ScevStride) 8819 continue; 8820 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); 8821 8822 auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); 8823 // The versioned value may not be used in the loop directly, so just add a 8824 // new live-in in those cases. 8825 Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); 8826 } 8827 8828 // From this point onwards, VPlan-to-VPlan transformations may change the plan 8829 // in ways that accessing values using original IR values is incorrect. 8830 Plan->disableValue2VPValue(); 8831 8832 // Sink users of fixed-order recurrence past the recipe defining the previous 8833 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 8834 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) 8835 return nullptr; 8836 8837 if (useActiveLaneMask(Style)) { 8838 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once 8839 // TailFoldingStyle is visible there. 8840 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); 8841 bool WithoutRuntimeCheck = 8842 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 8843 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, 8844 WithoutRuntimeCheck); 8845 } 8846 return Plan; 8847 } 8848 8849 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8850 // Outer loop handling: They may require CFG and instruction level 8851 // transformations before even evaluating whether vectorization is profitable. 8852 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8853 // the vectorization pipeline. 
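// Unlike the inner-loop path above, recipes here are created by first
// mirroring the IR CFG into the VPlan (via the HCFG builder) and then
// converting the resulting VPInstructions into widen recipes; no cost-model
// input is used at this stage.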
8854 assert(!OrigLoop->isInnermost()); 8855 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8856 8857 // Create new empty VPlan 8858 auto Plan = VPlan::createInitialVPlan( 8859 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8860 *PSE.getSE()); 8861 8862 // Build hierarchical CFG 8863 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8864 HCFGBuilder.buildHierarchicalCFG(); 8865 8866 for (ElementCount VF : Range) 8867 Plan->addVF(VF); 8868 8869 VPlanTransforms::VPInstructionsToVPRecipes( 8870 Plan, 8871 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 8872 *PSE.getSE(), *TLI); 8873 8874 // Remove the existing terminator of the exiting block of the top-most region. 8875 // A BranchOnCount will be added instead when adding the canonical IV recipes. 8876 auto *Term = 8877 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); 8878 Term->eraseFromParent(); 8879 8880 // Tail folding is not supported for outer loops, so the induction increment 8881 // is guaranteed to not wrap. 8882 bool HasNUW = true; 8883 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, 8884 DebugLoc()); 8885 return Plan; 8886 } 8887 8888 // Adjust the recipes for reductions. For in-loop reductions the chain of 8889 // instructions leading from the loop exit instr to the phi need to be converted 8890 // to reductions, with one operand being vector and the other being the scalar 8891 // reduction chain. For other reductions, a select is introduced between the phi 8892 // and live-out recipes when folding the tail. 8893 // 8894 // A ComputeReductionResult recipe is added to the middle block, also for 8895 // in-loop reductions which compute their result in-loop, because generating 8896 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 8897 void LoopVectorizationPlanner::adjustRecipesForReductions( 8898 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 8899 ElementCount MinVF) { 8900 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); 8901 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); 8902 // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores 8903 // sank outside of the loop would keep the same order as they had in the 8904 // original loop. 8905 SmallVector<VPReductionPHIRecipe *> ReductionPHIList; 8906 for (VPRecipeBase &R : Header->phis()) { 8907 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 8908 ReductionPHIList.emplace_back(ReductionPhi); 8909 } 8910 bool HasIntermediateStore = false; 8911 stable_sort(ReductionPHIList, 8912 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, 8913 const VPReductionPHIRecipe *R2) { 8914 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; 8915 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; 8916 HasIntermediateStore |= IS1 || IS2; 8917 8918 // If neither of the recipes has an intermediate store, keep the 8919 // order the same. 8920 if (!IS1 && !IS2) 8921 return false; 8922 8923 // If only one of the recipes has an intermediate store, then 8924 // move it towards the beginning of the list. 8925 if (IS1 && !IS2) 8926 return true; 8927 8928 if (!IS1 && IS2) 8929 return false; 8930 8931 // If both recipes have an intermediate store, then the recipe 8932 // with the later store should be processed earlier. So it 8933 // should go to the beginning of the list. 
8934 return DT->dominates(IS2, IS1); 8935 }); 8936 8937 if (HasIntermediateStore && ReductionPHIList.size() > 1) 8938 for (VPRecipeBase *R : ReductionPHIList) 8939 R->moveBefore(*Header, Header->getFirstNonPhi()); 8940 8941 for (VPRecipeBase &R : Header->phis()) { 8942 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 8943 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) 8944 continue; 8945 8946 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 8947 RecurKind Kind = RdxDesc.getRecurrenceKind(); 8948 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 8949 "AnyOf reductions are not allowed for in-loop reductions"); 8950 8951 // Collect the chain of "link" recipes for the reduction starting at PhiR. 8952 SetVector<VPRecipeBase *> Worklist; 8953 Worklist.insert(PhiR); 8954 for (unsigned I = 0; I != Worklist.size(); ++I) { 8955 VPRecipeBase *Cur = Worklist[I]; 8956 for (VPUser *U : Cur->getVPSingleValue()->users()) { 8957 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 8958 if (!UserRecipe) 8959 continue; 8960 assert(UserRecipe->getNumDefinedValues() == 1 && 8961 "recipes must define exactly one result value"); 8962 Worklist.insert(UserRecipe); 8963 } 8964 } 8965 8966 // Visit operation "Links" along the reduction chain top-down starting from 8967 // the phi until LoopExitValue. We keep track of the previous item 8968 // (PreviousLink) to tell which of the two operands of a Link will remain 8969 // scalar and which will be reduced. For minmax by select(cmp), Link will be 8970 // the select instructions. 8971 VPRecipeBase *PreviousLink = PhiR; // Aka Worklist[0]. 8972 for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) { 8973 VPValue *PreviousLinkV = PreviousLink->getVPSingleValue(); 8974 8975 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); 8976 8977 // Index of the first operand which holds a non-mask vector operand. 8978 unsigned IndexOfFirstOperand; 8979 // Recognize a call to the llvm.fmuladd intrinsic. 8980 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 8981 VPValue *VecOp; 8982 VPBasicBlock *LinkVPBB = CurrentLink->getParent(); 8983 if (IsFMulAdd) { 8984 assert( 8985 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && 8986 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 8987 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || 8988 isa<VPWidenCallRecipe>(CurrentLink)) && 8989 CurrentLink->getOperand(2) == PreviousLinkV && 8990 "expected a call where the previous link is the added operand"); 8991 8992 // If the instruction is a call to the llvm.fmuladd intrinsic then we 8993 // need to create an fmul recipe (multiplying the first two operands of 8994 // the fmuladd together) to use as the vector operand for the fadd 8995 // reduction. 
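// For illustration: a link %sum.next = call @llvm.fmuladd(%a, %b, %sum)
// becomes
//   %mul      = fmul %a, %b           ; VPInstruction created below
//   %sum.next = reduction(%sum, %mul) ; VPReductionRecipe created further down
// so only the multiply stays wide while the accumulation forms the reduction.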
8996 VPInstruction *FMulRecipe = new VPInstruction( 8997 Instruction::FMul, 8998 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, 8999 CurrentLinkI->getFastMathFlags()); 9000 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); 9001 VecOp = FMulRecipe; 9002 } else { 9003 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9004 if (isa<VPWidenRecipe>(CurrentLink)) { 9005 assert(isa<CmpInst>(CurrentLinkI) && 9006 "need to have the compare of the select"); 9007 continue; 9008 } 9009 assert(isa<VPWidenSelectRecipe>(CurrentLink) && 9010 "must be a select recipe"); 9011 IndexOfFirstOperand = 1; 9012 } else { 9013 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && 9014 "Expected to replace a VPWidenSC"); 9015 IndexOfFirstOperand = 0; 9016 } 9017 // Note that for non-commutable operands (cmp-selects), the semantics of 9018 // the cmp-select are captured in the recurrence kind. 9019 unsigned VecOpId = 9020 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLinkV 9021 ? IndexOfFirstOperand + 1 9022 : IndexOfFirstOperand; 9023 VecOp = CurrentLink->getOperand(VecOpId); 9024 assert(VecOp != PreviousLinkV && 9025 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - 9026 (VecOpId - IndexOfFirstOperand)) == 9027 PreviousLinkV && 9028 "PreviousLinkV must be the operand other than VecOp"); 9029 } 9030 9031 BasicBlock *BB = CurrentLinkI->getParent(); 9032 VPValue *CondOp = nullptr; 9033 if (CM.blockNeedsPredicationForAnyReason(BB)) { 9034 VPBuilder::InsertPointGuard Guard(Builder); 9035 Builder.setInsertPoint(CurrentLink); 9036 CondOp = RecipeBuilder.getBlockInMask(BB); 9037 } 9038 9039 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9040 RdxDesc, CurrentLinkI, PreviousLinkV, VecOp, CondOp); 9041 // Append the recipe to the end of the VPBasicBlock because we need to 9042 // ensure that it comes after all of its inputs, including CondOp. 9043 // Note that this transformation may leave behind dead recipes (including 9044 // CurrentLink), which will be cleaned up by a later VPlan transform. 9045 LinkVPBB->appendRecipe(RedRecipe); 9046 CurrentLink->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9047 PreviousLink = RedRecipe; 9048 } 9049 } 9050 Builder.setInsertPoint(&*LatchVPBB->begin()); 9051 for (VPRecipeBase &R : 9052 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9053 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9054 if (!PhiR) 9055 continue; 9056 9057 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9058 // If the tail is folded by masking, introduce selects between the phi 9059 // and the live-out instruction of each reduction, at the beginning of the 9060 // dedicated latch block. 9061 auto *OrigExitingVPV = PhiR->getBackedgeValue(); 9062 auto *NewExitingVPV = PhiR->getBackedgeValue(); 9063 if (!PhiR->isInLoop() && CM.foldTailByMasking()) { 9064 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader()); 9065 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && 9066 "reduction recipe must be defined before latch"); 9067 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); 9068 std::optional<FastMathFlags> FMFs = 9069 PhiTy->isFloatingPointTy() 9070 ?
std::make_optional(RdxDesc.getFastMathFlags()) 9071 : std::nullopt; 9072 NewExitingVPV = 9073 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); 9074 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { 9075 return isa<VPInstruction>(&U) && 9076 cast<VPInstruction>(&U)->getOpcode() == 9077 VPInstruction::ComputeReductionResult; 9078 }); 9079 if (PreferPredicatedReductionSelect || 9080 TTI.preferPredicatedReductionSelect( 9081 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, 9082 TargetTransformInfo::ReductionFlags())) 9083 PhiR->setOperand(1, NewExitingVPV); 9084 } 9085 9086 // If the vector reduction can be performed in a smaller type, we truncate 9087 // then extend the loop exit value to enable InstCombine to evaluate the 9088 // entire expression in the smaller type. 9089 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); 9090 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 9091 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 9092 Type *RdxTy = RdxDesc.getRecurrenceType(); 9093 auto *Trunc = 9094 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); 9095 auto *Extnd = 9096 RdxDesc.isSigned() 9097 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) 9098 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); 9099 9100 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); 9101 Extnd->insertAfter(Trunc); 9102 if (PhiR->getOperand(1) == NewExitingVPV) 9103 PhiR->setOperand(1, Extnd->getVPSingleValue()); 9104 NewExitingVPV = Extnd; 9105 } 9106 9107 // We want code in the middle block to appear to execute on the location of 9108 // the scalar loop's latch terminator because: (a) it is all compiler 9109 // generated, (b) these instructions are always executed after evaluating 9110 // the latch conditional branch, and (c) other passes may add new 9111 // predecessors which terminate on this line. This is the easiest way to 9112 // ensure we don't accidentally cause an extra step back into the loop while 9113 // debugging. 9114 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); 9115 9116 // TODO: At the moment ComputeReductionResult also drives creation of the 9117 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here 9118 // even for in-loop reductions, until the reduction resume value handling is 9119 // also modeled in VPlan. 
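// The recipe is appended to the middle block (the region's single successor),
// so it executes once after the vector loop and feeds both the live-outs and
// the resume value of the scalar loop's reduction phi.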
9120 auto *FinalReductionResult = new VPInstruction( 9121 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); 9122 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()) 9123 ->appendRecipe(FinalReductionResult); 9124 OrigExitingVPV->replaceUsesWithIf( 9125 FinalReductionResult, 9126 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); }); 9127 } 9128 9129 VPlanTransforms::clearReductionWrapFlags(*Plan); 9130 } 9131 9132 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9133 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9134 VPSlotTracker &SlotTracker) const { 9135 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9136 IG->getInsertPos()->printAsOperand(O, false); 9137 O << ", "; 9138 getAddr()->printAsOperand(O, SlotTracker); 9139 VPValue *Mask = getMask(); 9140 if (Mask) { 9141 O << ", "; 9142 Mask->printAsOperand(O, SlotTracker); 9143 } 9144 9145 unsigned OpIdx = 0; 9146 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9147 if (!IG->getMember(i)) 9148 continue; 9149 if (getNumStoreOperands() > 0) { 9150 O << "\n" << Indent << " store "; 9151 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9152 O << " to index " << i; 9153 } else { 9154 O << "\n" << Indent << " "; 9155 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9156 O << " = load from index " << i; 9157 } 9158 ++OpIdx; 9159 } 9160 } 9161 #endif 9162 9163 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9164 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9165 "Not a pointer induction according to InductionDescriptor!"); 9166 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9167 "Unexpected type."); 9168 9169 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9170 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9171 9172 if (onlyScalarsGenerated(State.VF)) { 9173 // This is the normalized GEP that starts counting at zero. 9174 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9175 CanonicalIV, IndDesc.getStep()->getType()); 9176 // Determine the number of scalars we need to generate for each unroll 9177 // iteration. If the instruction is uniform, we only need to generate the 9178 // first lane. Otherwise, we generate all VF values. 9179 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9180 assert((IsUniform || !State.VF.isScalable()) && 9181 "Cannot scalarize a scalable VF"); 9182 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 9183 9184 for (unsigned Part = 0; Part < State.UF; ++Part) { 9185 Value *PartStart = 9186 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9187 9188 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9189 Value *Idx = State.Builder.CreateAdd( 9190 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9191 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9192 9193 Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); 9194 Value *SclrGep = emitTransformedIndex( 9195 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, 9196 IndDesc.getKind(), IndDesc.getInductionBinOp()); 9197 SclrGep->setName("next.gep"); 9198 State.set(this, SclrGep, VPIteration(Part, Lane)); 9199 } 9200 } 9201 return; 9202 } 9203 9204 Type *PhiType = IndDesc.getStep()->getType(); 9205 9206 // Build a pointer phi 9207 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9208 Type *ScStValueType = ScalarStartValue->getType(); 9209 PHINode *NewPointerPhi = 9210 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9211 9212 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9213 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9214 9215 // A pointer induction, performed by using a gep 9216 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9217 9218 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 9219 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9220 Value *NumUnrolledElems = 9221 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9222 Value *InductionGEP = GetElementPtrInst::Create( 9223 State.Builder.getInt8Ty(), NewPointerPhi, 9224 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9225 InductionLoc); 9226 // Add induction update using an incorrect block temporarily. The phi node 9227 // will be fixed after VPlan execution. Note that at this point the latch 9228 // block cannot be used, as it does not exist yet. 9229 // TODO: Model increment value in VPlan, by turning the recipe into a 9230 // multi-def and a subclass of VPHeaderPHIRecipe. 9231 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9232 9233 // Create UF many actual address geps that use the pointer 9234 // phi as base and a vectorized version of the step value 9235 // (<step*0, ..., step*N>) as offset. 9236 for (unsigned Part = 0; Part < State.UF; ++Part) { 9237 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9238 Value *StartOffsetScalar = 9239 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9240 Value *StartOffset = 9241 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9242 // Create a vector of consecutive numbers from zero to VF. 9243 StartOffset = State.Builder.CreateAdd( 9244 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9245 9246 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && 9247 "scalar step must be the same across all parts"); 9248 Value *GEP = State.Builder.CreateGEP( 9249 State.Builder.getInt8Ty(), NewPointerPhi, 9250 State.Builder.CreateMul( 9251 StartOffset, 9252 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9253 "vector.gep")); 9254 State.set(this, GEP, Part); 9255 } 9256 } 9257 9258 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9259 assert(!State.Instance && "VPDerivedIVRecipe being replicated."); 9260 9261 // Fast-math-flags propagate from the original induction instruction. 
9262 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9263 if (FPBinOp) 9264 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); 9265 9266 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9267 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9268 Value *DerivedIV = emitTransformedIndex( 9269 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, 9270 Kind, cast_if_present<BinaryOperator>(FPBinOp)); 9271 DerivedIV->setName("offset.idx"); 9272 if (TruncResultTy) { 9273 assert(TruncResultTy != DerivedIV->getType() && 9274 Step->getType()->isIntegerTy() && 9275 "Truncation requires an integer step"); 9276 DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy); 9277 } 9278 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); 9279 9280 State.set(this, DerivedIV, VPIteration(0, 0)); 9281 } 9282 9283 void VPInterleaveRecipe::execute(VPTransformState &State) { 9284 assert(!State.Instance && "Interleave group being replicated."); 9285 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9286 getStoredValues(), getMask(), 9287 NeedsMaskForGaps); 9288 } 9289 9290 void VPReductionRecipe::execute(VPTransformState &State) { 9291 assert(!State.Instance && "Reduction being replicated."); 9292 Value *PrevInChain = State.get(getChainOp(), 0); 9293 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9294 bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc); 9295 // Propagate the fast-math flags carried by the underlying instruction. 9296 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9297 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 9298 for (unsigned Part = 0; Part < State.UF; ++Part) { 9299 Value *NewVecOp = State.get(getVecOp(), Part); 9300 if (VPValue *Cond = getCondOp()) { 9301 Value *NewCond = State.VF.isVector() ? State.get(Cond, Part) 9302 : State.get(Cond, {Part, 0}); 9303 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 9304 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); 9305 Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, 9306 RdxDesc.getFastMathFlags()); 9307 if (State.VF.isVector()) { 9308 Iden = 9309 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9310 } 9311 9312 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden); 9313 NewVecOp = Select; 9314 } 9315 Value *NewRed; 9316 Value *NextInChain; 9317 if (IsOrdered) { 9318 if (State.VF.isVector()) 9319 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, 9320 PrevInChain); 9321 else 9322 NewRed = State.Builder.CreateBinOp( 9323 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, 9324 NewVecOp); 9325 PrevInChain = NewRed; 9326 } else { 9327 PrevInChain = State.get(getChainOp(), Part); 9328 NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); 9329 } 9330 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9331 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 9332 NewRed, PrevInChain); 9333 } else if (IsOrdered) 9334 NextInChain = NewRed; 9335 else 9336 NextInChain = State.Builder.CreateBinOp( 9337 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); 9338 State.set(this, NextInChain, Part); 9339 } 9340 } 9341 9342 void VPReplicateRecipe::execute(VPTransformState &State) { 9343 Instruction *UI = getUnderlyingInstr(); 9344 if (State.Instance) { // Generate a single instance. 
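// A concrete State.Instance (part, lane) is set when this recipe is executed
// lane-by-lane inside a replicate region, e.g. for a predicated instruction.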
9345 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9346 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); 9347 // Insert scalar instance packing it into a vector. 9348 if (State.VF.isVector() && shouldPack()) { 9349 // If we're constructing lane 0, initialize to start from poison. 9350 if (State.Instance->Lane.isFirstLane()) { 9351 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9352 Value *Poison = PoisonValue::get( 9353 VectorType::get(UI->getType(), State.VF)); 9354 State.set(this, Poison, State.Instance->Part); 9355 } 9356 State.packScalarIntoVectorValue(this, *State.Instance); 9357 } 9358 return; 9359 } 9360 9361 if (IsUniform) { 9362 // If the recipe is uniform across all parts (instead of just per VF), only 9363 // generate a single instance. 9364 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) && 9365 all_of(operands(), [](VPValue *Op) { 9366 return Op->isDefinedOutsideVectorRegions(); 9367 })) { 9368 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); 9369 if (user_begin() != user_end()) { 9370 for (unsigned Part = 1; Part < State.UF; ++Part) 9371 State.set(this, State.get(this, VPIteration(0, 0)), 9372 VPIteration(Part, 0)); 9373 } 9374 return; 9375 } 9376 9377 // Uniform within VL means we need to generate lane 0 only for each 9378 // unrolled copy. 9379 for (unsigned Part = 0; Part < State.UF; ++Part) 9380 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); 9381 return; 9382 } 9383 9384 // A store of a loop varying value to a uniform address only needs the last 9385 // copy of the store. 9386 if (isa<StoreInst>(UI) && 9387 vputils::isUniformAfterVectorization(getOperand(1))) { 9388 auto Lane = VPLane::getLastLaneForVF(State.VF); 9389 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), 9390 State); 9391 return; 9392 } 9393 9394 // Generate scalar instances for all VF lanes of all UF parts. 9395 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9396 const unsigned EndLane = State.VF.getKnownMinValue(); 9397 for (unsigned Part = 0; Part < State.UF; ++Part) 9398 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9399 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); 9400 } 9401 9402 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9403 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9404 9405 // Attempt to issue a wide load. 9406 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9407 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9408 9409 assert((LI || SI) && "Invalid Load/Store instruction"); 9410 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9411 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9412 9413 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9414 9415 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9416 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9417 bool CreateGatherScatter = !isConsecutive(); 9418 9419 auto &Builder = State.Builder; 9420 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9421 bool isMaskRequired = getMask(); 9422 if (isMaskRequired) { 9423 // Mask reversal is only needed for non-all-one (null) masks, as reverse of 9424 // a null all-one mask is a null mask. 
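// (A "null" mask above means getMask() returned no mask operand, i.e. an
// implicit all-ones mask.)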
9425 for (unsigned Part = 0; Part < State.UF; ++Part) { 9426 Value *Mask = State.get(getMask(), Part); 9427 if (isReverse()) 9428 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 9429 BlockInMaskParts[Part] = Mask; 9430 } 9431 } 9432 9433 // Handle Stores: 9434 if (SI) { 9435 State.setDebugLocFrom(SI->getDebugLoc()); 9436 9437 for (unsigned Part = 0; Part < State.UF; ++Part) { 9438 Instruction *NewSI = nullptr; 9439 Value *StoredVal = State.get(StoredValue, Part); 9440 if (CreateGatherScatter) { 9441 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9442 Value *VectorGep = State.get(getAddr(), Part); 9443 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9444 MaskPart); 9445 } else { 9446 if (isReverse()) { 9447 // If we store to reverse consecutive memory locations, then we need 9448 // to reverse the order of elements in the stored value. 9449 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9450 // We don't want to update the value in the map as it might be used in 9451 // another expression. So don't call resetVectorValue(StoredVal). 9452 } 9453 auto *VecPtr = State.get(getAddr(), Part); 9454 if (isMaskRequired) 9455 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9456 BlockInMaskParts[Part]); 9457 else 9458 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9459 } 9460 State.addMetadata(NewSI, SI); 9461 } 9462 return; 9463 } 9464 9465 // Handle loads. 9466 assert(LI && "Must have a load instruction"); 9467 State.setDebugLocFrom(LI->getDebugLoc()); 9468 for (unsigned Part = 0; Part < State.UF; ++Part) { 9469 Value *NewLI; 9470 if (CreateGatherScatter) { 9471 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9472 Value *VectorGep = State.get(getAddr(), Part); 9473 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9474 nullptr, "wide.masked.gather"); 9475 State.addMetadata(NewLI, LI); 9476 } else { 9477 auto *VecPtr = State.get(getAddr(), Part); 9478 if (isMaskRequired) 9479 NewLI = Builder.CreateMaskedLoad( 9480 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9481 PoisonValue::get(DataTy), "wide.masked.load"); 9482 else 9483 NewLI = 9484 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9485 9486 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9487 State.addMetadata(NewLI, LI); 9488 if (Reverse) 9489 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9490 } 9491 9492 State.set(getVPSingleValue(), NewLI, Part); 9493 } 9494 } 9495 9496 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9497 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9498 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9499 // for predication. 9500 static ScalarEpilogueLowering getScalarEpilogueLowering( 9501 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9502 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9503 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9504 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9505 // don't look at hints or options, and don't request a scalar epilogue. 
9506 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9507 // LoopAccessInfo (due to code dependency and not being able to reliably get 9508 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9509 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9510 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9511 // back to the old way and vectorize with versioning when forced. See D81345.) 9512 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9513 PGSOQueryType::IRPass) && 9514 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9515 return CM_ScalarEpilogueNotAllowedOptSize; 9516 9517 // 2) If set, obey the directives 9518 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9519 switch (PreferPredicateOverEpilogue) { 9520 case PreferPredicateTy::ScalarEpilogue: 9521 return CM_ScalarEpilogueAllowed; 9522 case PreferPredicateTy::PredicateElseScalarEpilogue: 9523 return CM_ScalarEpilogueNotNeededUsePredicate; 9524 case PreferPredicateTy::PredicateOrDontVectorize: 9525 return CM_ScalarEpilogueNotAllowedUsePredicate; 9526 }; 9527 } 9528 9529 // 3) If set, obey the hints 9530 switch (Hints.getPredicate()) { 9531 case LoopVectorizeHints::FK_Enabled: 9532 return CM_ScalarEpilogueNotNeededUsePredicate; 9533 case LoopVectorizeHints::FK_Disabled: 9534 return CM_ScalarEpilogueAllowed; 9535 }; 9536 9537 // 4) if the TTI hook indicates this is profitable, request predication. 9538 TailFoldingInfo TFI(TLI, &LVL, IAI); 9539 if (TTI->preferPredicateOverEpilogue(&TFI)) 9540 return CM_ScalarEpilogueNotNeededUsePredicate; 9541 9542 return CM_ScalarEpilogueAllowed; 9543 } 9544 9545 // Process the loop in the VPlan-native vectorization path. This path builds 9546 // VPlan upfront in the vectorization pipeline, which allows to apply 9547 // VPlan-to-VPlan transformations from the very beginning without modifying the 9548 // input LLVM IR. 9549 static bool processLoopInVPlanNativePath( 9550 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9551 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9552 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9553 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9554 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9555 LoopVectorizationRequirements &Requirements) { 9556 9557 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9558 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9559 return false; 9560 } 9561 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9562 Function *F = L->getHeader()->getParent(); 9563 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9564 9565 ScalarEpilogueLowering SEL = 9566 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 9567 9568 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9569 &Hints, IAI); 9570 // Use the planner for outer loop vectorization. 9571 // TODO: CM is not used at this point inside the planner. Turn CM into an 9572 // optional argument if we don't need it in the future. 9573 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, 9574 ORE); 9575 9576 // Get user vectorization factor. 9577 ElementCount UserVF = Hints.getWidth(); 9578 9579 CM.collectElementTypesForWidening(); 9580 9581 // Plan how to best vectorize, return the best VF and its cost. 
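// The planner picks a single VF here (the user-provided width when given,
// otherwise a target-derived guess); as noted in the TODO above, the cost
// model is not consulted on this path yet.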
9582 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9583 9584 // If we are stress testing VPlan builds, do not attempt to generate vector 9585 // code. Masked vector code generation support will follow soon. 9586 // Also, do not attempt to vectorize if no vector code will be produced. 9587 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 9588 return false; 9589 9590 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 9591 9592 { 9593 bool AddBranchWeights = 9594 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 9595 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 9596 F->getParent()->getDataLayout(), AddBranchWeights); 9597 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 9598 VF.Width, 1, LVL, &CM, BFI, PSI, Checks); 9599 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9600 << L->getHeader()->getParent()->getName() << "\"\n"); 9601 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 9602 } 9603 9604 reportVectorization(ORE, L, VF, 1); 9605 9606 // Mark the loop as already vectorized to avoid vectorizing again. 9607 Hints.setAlreadyVectorized(); 9608 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9609 return true; 9610 } 9611 9612 // Emit a remark if there are stores to floats that required a floating point 9613 // extension. If the vectorized loop was generated with floating point there 9614 // will be a performance penalty from the conversion overhead and the change in 9615 // the vector width. 9616 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9617 SmallVector<Instruction *, 4> Worklist; 9618 for (BasicBlock *BB : L->getBlocks()) { 9619 for (Instruction &Inst : *BB) { 9620 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9621 if (S->getValueOperand()->getType()->isFloatTy()) 9622 Worklist.push_back(S); 9623 } 9624 } 9625 } 9626 9627 // Traverse the floating point stores upwards searching, for floating point 9628 // conversions. 9629 SmallPtrSet<const Instruction *, 4> Visited; 9630 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9631 while (!Worklist.empty()) { 9632 auto *I = Worklist.pop_back_val(); 9633 if (!L->contains(I)) 9634 continue; 9635 if (!Visited.insert(I).second) 9636 continue; 9637 9638 // Emit a remark if the floating point store required a floating 9639 // point conversion. 9640 // TODO: More work could be done to identify the root cause such as a 9641 // constant or a function return type and point the user to it. 9642 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9643 ORE->emit([&]() { 9644 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9645 I->getDebugLoc(), L->getHeader()) 9646 << "floating point conversion changes vector width. " 9647 << "Mixed floating point precision requires an up/down " 9648 << "cast that will negatively impact performance."; 9649 }); 9650 9651 for (Use &Op : I->operands()) 9652 if (auto *OpI = dyn_cast<Instruction>(Op)) 9653 Worklist.push_back(OpI); 9654 } 9655 } 9656 9657 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 9658 VectorizationFactor &VF, 9659 std::optional<unsigned> VScale, Loop *L, 9660 ScalarEvolution &SE, 9661 ScalarEpilogueLowering SEL) { 9662 InstructionCost CheckCost = Checks.getCost(); 9663 if (!CheckCost.isValid()) 9664 return false; 9665 9666 // When interleaving only scalar and vector cost will be equal, which in turn 9667 // would lead to a divide by 0. Fall back to hard threshold. 
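// In that case the absolute cost of the checks is compared against the fixed
// VectorizeMemoryCheckThreshold cut-off used below instead.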
9668 if (VF.Width.isScalar()) { 9669 if (CheckCost > VectorizeMemoryCheckThreshold) { 9670 LLVM_DEBUG( 9671 dbgs() 9672 << "LV: Interleaving only is not profitable due to runtime checks\n"); 9673 return false; 9674 } 9675 return true; 9676 } 9677 9678 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 9679 double ScalarC = *VF.ScalarCost.getValue(); 9680 if (ScalarC == 0) 9681 return true; 9682 9683 // First, compute the minimum iteration count required so that the vector 9684 // loop outperforms the scalar loop. 9685 // The total cost of the scalar loop is 9686 // ScalarC * TC 9687 // where 9688 // * TC is the actual trip count of the loop. 9689 // * ScalarC is the cost of a single scalar iteration. 9690 // 9691 // The total cost of the vector loop is 9692 // RtC + VecC * (TC / VF) + EpiC 9693 // where 9694 // * RtC is the cost of the generated runtime checks 9695 // * VecC is the cost of a single vector iteration. 9696 // * TC is the actual trip count of the loop 9697 // * VF is the vectorization factor 9698 // * EpiCost is the cost of the generated epilogue, including the cost 9699 // of the remaining scalar operations. 9700 // 9701 // Vectorization is profitable once the total vector cost is less than the 9702 // total scalar cost: 9703 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 9704 // 9705 // Now we can compute the minimum required trip count TC as 9706 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC 9707 // 9708 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 9709 // the computations are performed on doubles, not integers and the result 9710 // is rounded up, hence we get an upper estimate of the TC. 9711 unsigned IntVF = VF.Width.getKnownMinValue(); 9712 if (VF.Width.isScalable()) { 9713 unsigned AssumedMinimumVscale = 1; 9714 if (VScale) 9715 AssumedMinimumVscale = *VScale; 9716 IntVF *= AssumedMinimumVscale; 9717 } 9718 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; 9719 double RtC = *CheckCost.getValue(); 9720 double MinTC1 = RtC / (ScalarC - VecCOverVF); 9721 9722 // Second, compute a minimum iteration count so that the cost of the 9723 // runtime checks is only a fraction of the total scalar loop cost. This 9724 // adds a loop-dependent bound on the overhead incurred if the runtime 9725 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 9726 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 9727 // cost, compute 9728 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 9729 double MinTC2 = RtC * 10 / ScalarC; 9730 9731 // Now pick the larger minimum. If it is not a multiple of VF and a scalar 9732 // epilogue is allowed, choose the next closest multiple of VF. This should 9733 // partly compensate for ignoring the epilogue cost. 9734 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); 9735 if (SEL == CM_ScalarEpilogueAllowed) 9736 MinTC = alignTo(MinTC, IntVF); 9737 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); 9738 9739 LLVM_DEBUG( 9740 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 9741 << VF.MinProfitableTripCount << "\n"); 9742 9743 // Skip vectorization if the expected trip count is less than the minimum 9744 // required trip count. 
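// For illustration only (made-up costs): with ScalarC = 4, VecC = 6, VF = 4
// and RtC = 20, MinTC1 = 20 / (4 - 6/4) = 8 and MinTC2 = 20 * 10 / 4 = 50, so
// MinProfitableTripCount is 50 (rounded up to 52, the next multiple of VF,
// when a scalar epilogue is allowed).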
  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
    if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
                                VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable trip count ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n");

      return false;
    }
  }
  return true;
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
                    << L->getHeader()->getParent()->getName() << "' from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot
  // modify the incoming IR, we need to build VPlan upfront in the
  // vectorization pipeline.
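  // For illustration, a nest such as
  //   for (i = 0; i < N; ++i)        // outer loop taken down this path
  //     for (j = 0; j < M; ++j)
  //       A[i][j] += B[i][j];
  // is handed to the VPlan-native path below; in practice this is typically
  // only reached when EnableVPlanNativePath is set and the outer loop carries
  // explicit vectorization hints.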
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);

  // Check the loop for a trip count threshold: vectorize loops with a tiny
  // trip count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
        LLVM_DEBUG(dbgs() << "\n");
        // Predicate tail-folded loops are efficient even when the loop
        // iteration count is low. However, setting the epilogue policy to
        // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
        // with runtime checks. It's more effective to let
        // `areRuntimeChecksProfitable` determine if vectorization is
        // beneficial for the loop.
        if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
          SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
      } else {
        LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
                             "small to be worth vectorizing.\n");
        reportVectorizationFailure(
            "The trip count is below the minimal threshold value.",
            "loop trip count is too low, avoiding vectorization",
            "LowTripCount", ORE, L);
        Hints.emitRemarkWithHints();
        return false;
      }
    }
  }

  // Check the function attributes to see if implicit floats or vectors are
  // allowed.
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
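  // (Illustrative case, not tied to a specific target: some SIMD units flush
  // denormals to zero or differ from the scalar FP unit in signaling-NaN
  // handling, so vectorizing FP operations could change results unless the
  // loop's fp-math flags make that irrelevant. Such targets report this via
  // TTI->isFPVectorizationPotentiallyUnsafe() and are rejected below.)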
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get the user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize; return the best VF and its cost.
  std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  bool AddBranchWeights =
      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                           F->getParent()->getDataLayout(), AddBranchWeights);
  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);

    unsigned SelectedIC = std::max(IC, UserIC);
    // Optimistically generate runtime checks if they are needed. Drop them if
    // they turn out not to be profitable.
    if (VF.Width.isVector() || SelectedIC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);

    // Check if it is profitable to vectorize with runtime checks.
    bool ForceVectorization =
        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
    if (!ForceVectorization &&
        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
                                    *PSE.getSE(), SEL)) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
                   L->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return false;
    }
  }

  // Identify the diagnostic messages that should be produced.
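  // The decisions below are surfaced to users as optimization remarks; for
  // example, compiling with -Rpass-analysis=loop-vectorize may print
  // something like (illustrative):
  //   remark: the cost-model indicates that vectorization is not beneficial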
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleave count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is both legal and profitable to vectorize the
      // loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar
        // epilogue to be vectorized by executing the plan (potentially with a
        // different factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
            EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
        ++LoopsVectorized;

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
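        // Illustrative shape after both passes (the factors are examples
        // only): a main vector loop at, say, VF=8, followed by an epilogue
        // vector loop at VF=4, followed by a scalar remainder that executes
        // any final iterations the epilogue loop cannot cover.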
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);

        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
        VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
        VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
        Header->setName("vec.epilog.vector.body");

        // Re-use the trip count and steps expanded for the main loop, as
        // skeleton creation needs them as values that dominate both the scalar
        // and vector epilogue loops.
        // TODO: This is a workaround needed for epilogue vectorization and it
        // should be removed once induction resume value creation is done
        // directly in VPlan.
        EpilogILV.setTripCount(MainILV.getTripCount());
        for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
          auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
          auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
              ExpandedSCEVs.find(ExpandR->getSCEV())->second);
          ExpandR->replaceAllUsesWith(ExpandedVal);
          ExpandR->eraseFromParent();
        }

        // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
        // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
        // before vectorizing the epilogue loop.
        for (VPRecipeBase &R : Header->phis()) {
          if (isa<VPCanonicalIVPHIRecipe>(&R))
            continue;

          Value *ResumeV = nullptr;
          // TODO: Move setting of resume values to prepareToExecute.
          if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
            ResumeV = ReductionResumeValues
                          .find(&ReductionPhi->getRecurrenceDescriptor())
                          ->second;
          } else {
            // Create induction resume values for both widened pointer and
            // integer/fp inductions and update the start value of the
            // induction recipes to use the resume value.
            PHINode *IndPhi = nullptr;
            const InductionDescriptor *ID;
            if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
              IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
              ID = &Ind->getInductionDescriptor();
            } else {
              auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
              IndPhi = WidenInd->getPHINode();
              ID = &WidenInd->getInductionDescriptor();
            }

            ResumeV = MainILV.createInductionResumeValue(
                IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
                {EPI.MainLoopIterationCountCheck});
          }
          assert(ResumeV && "Must have a resume value");
          VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
          cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
        }

        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT, true, &ExpandedSCEVs);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                               VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                               PSI, Checks);

        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that
        // is rarely used is not worth unrolling.
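        // (Concretely, the metadata added is the
        // "llvm.loop.unroll.runtime.disable" loop attribute, attached further
        // down via AddRuntimeUnrollDisableMetaData when DisableRuntimeUnroll
        // ends up set.)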
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      reportVectorization(ORE, L, VF, IC);
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  std::optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID) {
    L->setLoopID(*RemainderLoopID);
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = BFI_;
  TLI = TLI_;
  AC = &AC_;
  LAIs = &LAIs_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

    if (Changed) {
      LAIs->clear();

#ifndef NDEBUG
      if (VerifySCEV)
        SE->verify();
#endif
    }
  }

  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BlockFrequencyInfo *BFI = nullptr;
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
    PA.preserve<ScalarEvolutionAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << '>';
}
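
// Usage sketch (assuming the textual pass-pipeline syntax accepts these
// parameter names, mirroring the printPipeline output above):
//   opt -passes='function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>)' ...
// which constructs this pass with both "only when forced" options cleared.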