1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanAnalysis.h" 61 #include "VPlanHCFGBuilder.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/STLExtras.h" 70 #include "llvm/ADT/SmallPtrSet.h" 71 #include "llvm/ADT/SmallSet.h" 72 #include "llvm/ADT/SmallVector.h" 73 #include "llvm/ADT/Statistic.h" 74 #include "llvm/ADT/StringRef.h" 75 #include "llvm/ADT/Twine.h" 76 #include "llvm/ADT/iterator_range.h" 77 #include "llvm/Analysis/AssumptionCache.h" 78 #include "llvm/Analysis/BasicAliasAnalysis.h" 79 #include "llvm/Analysis/BlockFrequencyInfo.h" 80 #include "llvm/Analysis/CFG.h" 81 #include "llvm/Analysis/CodeMetrics.h" 82 #include "llvm/Analysis/DemandedBits.h" 83 #include "llvm/Analysis/GlobalsModRef.h" 84 #include "llvm/Analysis/LoopAccessAnalysis.h" 85 #include "llvm/Analysis/LoopAnalysisManager.h" 86 #include "llvm/Analysis/LoopInfo.h" 87 #include "llvm/Analysis/LoopIterator.h" 88 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 89 #include "llvm/Analysis/ProfileSummaryInfo.h" 90 #include "llvm/Analysis/ScalarEvolution.h" 91 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 92 #include "llvm/Analysis/TargetLibraryInfo.h" 93 #include "llvm/Analysis/TargetTransformInfo.h" 94 #include "llvm/Analysis/ValueTracking.h" 95 #include "llvm/Analysis/VectorUtils.h" 96 #include "llvm/IR/Attributes.h" 97 #include "llvm/IR/BasicBlock.h" 98 #include "llvm/IR/CFG.h" 99 #include "llvm/IR/Constant.h" 100 #include "llvm/IR/Constants.h" 101 #include "llvm/IR/DataLayout.h" 102 #include "llvm/IR/DebugInfo.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/MDBuilder.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/ProfDataUtils.h" 121 #include "llvm/IR/Type.h" 122 #include "llvm/IR/Use.h" 123 #include "llvm/IR/User.h" 124 #include "llvm/IR/Value.h" 125 #include "llvm/IR/ValueHandle.h" 126 #include "llvm/IR/Verifier.h" 127 #include "llvm/Support/Casting.h" 128 #include "llvm/Support/CommandLine.h" 129 #include "llvm/Support/Compiler.h" 130 #include "llvm/Support/Debug.h" 131 #include "llvm/Support/ErrorHandling.h" 132 #include "llvm/Support/InstructionCost.h" 133 #include "llvm/Support/MathExtras.h" 134 #include "llvm/Support/raw_ostream.h" 135 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 136 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 137 #include "llvm/Transforms/Utils/LoopSimplify.h" 138 #include "llvm/Transforms/Utils/LoopUtils.h" 139 #include "llvm/Transforms/Utils/LoopVersioning.h" 140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 141 #include "llvm/Transforms/Utils/SizeOpts.h" 142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 143 #include <algorithm> 144 #include <cassert> 145 #include <cmath> 146 #include <cstdint> 147 #include <functional> 148 #include <iterator> 149 #include <limits> 150 #include <map> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold( 202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks")); 204 205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 206 // that predication is preferred, and this lists all options. I.e., the 207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 208 // and predicate the instructions accordingly. If tail-folding fails, there are 209 // different fallback strategies depending on these values: 210 namespace PreferPredicateTy { 211 enum Option { 212 ScalarEpilogue = 0, 213 PredicateElseScalarEpilogue, 214 PredicateOrDontVectorize 215 }; 216 } // namespace PreferPredicateTy 217 218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 219 "prefer-predicate-over-epilogue", 220 cl::init(PreferPredicateTy::ScalarEpilogue), 221 cl::Hidden, 222 cl::desc("Tail-folding and predication preferences over creating a scalar " 223 "epilogue loop."), 224 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 225 "scalar-epilogue", 226 "Don't tail-predicate loops, create scalar epilogue"), 227 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 228 "predicate-else-scalar-epilogue", 229 "prefer tail-folding, create scalar epilogue if tail " 230 "folding fails."), 231 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 232 "predicate-dont-vectorize", 233 "prefers tail-folding, don't attempt vectorization if " 234 "tail-folding fails."))); 235 236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( 237 "force-tail-folding-style", cl::desc("Force the tail folding style"), 238 cl::init(TailFoldingStyle::None), 239 cl::values( 240 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), 241 clEnumValN( 242 TailFoldingStyle::Data, "data", 243 "Create lane mask for data only, using active.lane.mask intrinsic"), 244 clEnumValN(TailFoldingStyle::DataWithoutLaneMask, 245 "data-without-lane-mask", 246 "Create lane mask with compare/stepvector"), 247 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", 248 "Create lane mask using active.lane.mask intrinsic, and use " 249 "it for both data and control flow"), 250 clEnumValN( 251 TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, 252 "data-and-control-without-rt-check", 253 "Similar to data-and-control, but remove the runtime check"))); 254 255 static cl::opt<bool> MaximizeBandwidth( 256 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 257 cl::desc("Maximize bandwidth when selecting vectorization factor which " 258 "will be determined by the smallest type in loop.")); 259 260 static cl::opt<bool> EnableInterleavedMemAccesses( 261 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 262 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 263 264 /// An interleave-group may need masking if it resides in a block that needs 265 /// predication, or in order to mask away gaps. 266 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 267 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 268 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 269 270 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 271 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 272 cl::desc("We don't interleave loops with a estimated constant trip count " 273 "below this number")); 274 275 static cl::opt<unsigned> ForceTargetNumScalarRegs( 276 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 277 cl::desc("A flag that overrides the target's number of scalar registers.")); 278 279 static cl::opt<unsigned> ForceTargetNumVectorRegs( 280 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 281 cl::desc("A flag that overrides the target's number of vector registers.")); 282 283 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 284 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 285 cl::desc("A flag that overrides the target's max interleave factor for " 286 "scalar loops.")); 287 288 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 289 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 290 cl::desc("A flag that overrides the target's max interleave factor for " 291 "vectorized loops.")); 292 293 static cl::opt<unsigned> ForceTargetInstructionCost( 294 "force-target-instruction-cost", cl::init(0), cl::Hidden, 295 cl::desc("A flag that overrides the target's expected cost for " 296 "an instruction to a single constant value. Mostly " 297 "useful for getting consistent testing.")); 298 299 static cl::opt<bool> ForceTargetSupportsScalableVectors( 300 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 301 cl::desc( 302 "Pretend that scalable vectors are supported, even if the target does " 303 "not support them. This flag should only be used for testing.")); 304 305 static cl::opt<unsigned> SmallLoopCost( 306 "small-loop-cost", cl::init(20), cl::Hidden, 307 cl::desc( 308 "The cost of a loop that is considered 'small' by the interleaver.")); 309 310 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 311 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 312 cl::desc("Enable the use of the block frequency analysis to access PGO " 313 "heuristics minimizing code growth in cold regions and being more " 314 "aggressive in hot regions.")); 315 316 // Runtime interleave loops for load/store throughput. 317 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 318 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 319 cl::desc( 320 "Enable runtime interleaving until load/store ports are saturated")); 321 322 /// Interleave small loops with scalar reductions. 323 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 324 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 325 cl::desc("Enable interleaving for loops with small iteration counts that " 326 "contain scalar reductions to expose ILP.")); 327 328 /// The number of stores in a loop that are allowed to need predication. 329 static cl::opt<unsigned> NumberOfStoresToPredicate( 330 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 331 cl::desc("Max number of stores to be predicated behind an if.")); 332 333 static cl::opt<bool> EnableIndVarRegisterHeur( 334 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 335 cl::desc("Count the induction variable only once when interleaving")); 336 337 static cl::opt<bool> EnableCondStoresVectorization( 338 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 339 cl::desc("Enable if predication of stores during vectorization.")); 340 341 static cl::opt<unsigned> MaxNestedScalarReductionIC( 342 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 343 cl::desc("The maximum interleave count to use when interleaving a scalar " 344 "reduction in a nested loop.")); 345 346 static cl::opt<bool> 347 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 348 cl::Hidden, 349 cl::desc("Prefer in-loop vector reductions, " 350 "overriding the targets preference.")); 351 352 static cl::opt<bool> ForceOrderedReductions( 353 "force-ordered-reductions", cl::init(false), cl::Hidden, 354 cl::desc("Enable the vectorisation of loops with in-order (strict) " 355 "FP reductions")); 356 357 static cl::opt<bool> PreferPredicatedReductionSelect( 358 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 359 cl::desc( 360 "Prefer predicating a reduction operation over an after loop select.")); 361 362 namespace llvm { 363 cl::opt<bool> EnableVPlanNativePath( 364 "enable-vplan-native-path", cl::Hidden, 365 cl::desc("Enable VPlan-native vectorization path with " 366 "support for outer loop vectorization.")); 367 } 368 369 // This flag enables the stress testing of the VPlan H-CFG construction in the 370 // VPlan-native vectorization path. It must be used in conjuction with 371 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 372 // verification of the H-CFGs built. 373 static cl::opt<bool> VPlanBuildStressTest( 374 "vplan-build-stress-test", cl::init(false), cl::Hidden, 375 cl::desc( 376 "Build VPlan for every supported loop nest in the function and bail " 377 "out right after the build (stress test the VPlan H-CFG construction " 378 "in the VPlan-native vectorization path).")); 379 380 cl::opt<bool> llvm::EnableLoopInterleaving( 381 "interleave-loops", cl::init(true), cl::Hidden, 382 cl::desc("Enable loop interleaving in Loop vectorization passes")); 383 cl::opt<bool> llvm::EnableLoopVectorization( 384 "vectorize-loops", cl::init(true), cl::Hidden, 385 cl::desc("Run the Loop vectorization passes")); 386 387 static cl::opt<bool> PrintVPlansInDotFormat( 388 "vplan-print-in-dot-format", cl::Hidden, 389 cl::desc("Use dot format instead of plain text when dumping VPlans")); 390 391 static cl::opt<cl::boolOrDefault> ForceSafeDivisor( 392 "force-widen-divrem-via-safe-divisor", cl::Hidden, 393 cl::desc( 394 "Override cost based safe divisor widening for div/rem instructions")); 395 396 static cl::opt<bool> UseWiderVFIfCallVariantsPresent( 397 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), 398 cl::Hidden, 399 cl::desc("Try wider VFs if they enable the use of vector variants")); 400 401 // Likelyhood of bypassing the vectorized loop because assumptions about SCEV 402 // variables not overflowing do not hold. See `emitSCEVChecks`. 403 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; 404 // Likelyhood of bypassing the vectorized loop because pointers overlap. See 405 // `emitMemRuntimeChecks`. 406 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; 407 // Likelyhood of bypassing the vectorized loop because there are zero trips left 408 // after prolog. See `emitIterationCountCheck`. 409 static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; 410 411 /// A helper function that returns true if the given type is irregular. The 412 /// type is irregular if its allocated size doesn't equal the store size of an 413 /// element of the corresponding vector type. 414 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 415 // Determine if an array of N elements of type Ty is "bitcast compatible" 416 // with a <N x Ty> vector. 417 // This is only true if there is no padding between the array elements. 418 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 419 } 420 421 /// A helper function that returns the reciprocal of the block probability of 422 /// predicated blocks. If we return X, we are assuming the predicated block 423 /// will execute once for every X iterations of the loop header. 424 /// 425 /// TODO: We should use actual block probability here, if available. Currently, 426 /// we always assume predicated blocks have a 50% chance of executing. 427 static unsigned getReciprocalPredBlockProb() { return 2; } 428 429 /// Returns "best known" trip count for the specified loop \p L as defined by 430 /// the following procedure: 431 /// 1) Returns exact trip count if it is known. 432 /// 2) Returns expected trip count according to profile data if any. 433 /// 3) Returns upper bound estimate if it is known. 434 /// 4) Returns std::nullopt if all of the above failed. 435 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, 436 Loop *L) { 437 // Check if exact trip count is known. 438 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 439 return ExpectedTC; 440 441 // Check if there is an expected trip count available from profile data. 442 if (LoopVectorizeWithBlockFrequency) 443 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 444 return *EstimatedTC; 445 446 // Check if upper bound estimate is known. 447 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 448 return ExpectedTC; 449 450 return std::nullopt; 451 } 452 453 /// Return a vector containing interleaved elements from multiple 454 /// smaller input vectors. 455 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 456 const Twine &Name) { 457 unsigned Factor = Vals.size(); 458 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 459 460 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 461 #ifndef NDEBUG 462 for (Value *Val : Vals) 463 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 464 #endif 465 466 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 467 // must use intrinsics to interleave. 468 if (VecTy->isScalableTy()) { 469 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 470 return Builder.CreateIntrinsic( 471 WideVecTy, Intrinsic::experimental_vector_interleave2, Vals, 472 /*FMFSource=*/nullptr, Name); 473 } 474 475 // Fixed length. Start by concatenating all vectors into a wide vector. 476 Value *WideVec = concatenateVectors(Builder, Vals); 477 478 // Interleave the elements into the wide vector. 479 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 480 return Builder.CreateShuffleVector( 481 WideVec, createInterleaveMask(NumElts, Factor), Name); 482 } 483 484 namespace { 485 // Forward declare GeneratedRTChecks. 486 class GeneratedRTChecks; 487 488 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; 489 } // namespace 490 491 namespace llvm { 492 493 AnalysisKey ShouldRunExtraVectorPasses::Key; 494 495 /// InnerLoopVectorizer vectorizes loops which contain only one basic 496 /// block to a specified vectorization factor (VF). 497 /// This class performs the widening of scalars into vectors, or multiple 498 /// scalars. This class also implements the following features: 499 /// * It inserts an epilogue loop for handling loops that don't have iteration 500 /// counts that are known to be a multiple of the vectorization factor. 501 /// * It handles the code generation for reduction variables. 502 /// * Scalarization (implementation using scalars) of un-vectorizable 503 /// instructions. 504 /// InnerLoopVectorizer does not perform any vectorization-legality 505 /// checks, and relies on the caller to check for the different legality 506 /// aspects. The InnerLoopVectorizer relies on the 507 /// LoopVectorizationLegality class to provide information about the induction 508 /// and reduction variables that were found to a given vectorization factor. 509 class InnerLoopVectorizer { 510 public: 511 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 512 LoopInfo *LI, DominatorTree *DT, 513 const TargetLibraryInfo *TLI, 514 const TargetTransformInfo *TTI, AssumptionCache *AC, 515 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 516 ElementCount MinProfitableTripCount, 517 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 518 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 519 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 520 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 521 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 522 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 523 PSI(PSI), RTChecks(RTChecks) { 524 // Query this against the original loop and save it here because the profile 525 // of the original loop header may change as the transformation happens. 526 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 527 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 528 529 if (MinProfitableTripCount.isZero()) 530 this->MinProfitableTripCount = VecWidth; 531 else 532 this->MinProfitableTripCount = MinProfitableTripCount; 533 } 534 535 virtual ~InnerLoopVectorizer() = default; 536 537 /// Create a new empty loop that will contain vectorized instructions later 538 /// on, while the old loop will be used as the scalar remainder. Control flow 539 /// is generated around the vectorized (and scalar epilogue) loops consisting 540 /// of various checks and bypasses. Return the pre-header block of the new 541 /// loop and the start value for the canonical induction, if it is != 0. The 542 /// latter is the case when vectorizing the epilogue loop. In the case of 543 /// epilogue vectorization, this function is overriden to handle the more 544 /// complex control flow around the loops. \p ExpandedSCEVs is used to 545 /// look up SCEV expansions for expressions needed during skeleton creation. 546 virtual std::pair<BasicBlock *, Value *> 547 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); 548 549 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 550 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); 551 552 // Return true if any runtime check is added. 553 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 554 555 /// A type for vectorized values in the new loop. Each value from the 556 /// original loop, when vectorized, is represented by UF vector values in the 557 /// new unrolled loop, where UF is the unroll factor. 558 using VectorParts = SmallVector<Value *, 2>; 559 560 /// A helper function to scalarize a single Instruction in the innermost loop. 561 /// Generates a sequence of scalar instances for each lane between \p MinLane 562 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 563 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 564 /// Instr's operands. 565 void scalarizeInstruction(const Instruction *Instr, 566 VPReplicateRecipe *RepRecipe, 567 const VPIteration &Instance, 568 VPTransformState &State); 569 570 /// Try to vectorize interleaved access group \p Group with the base address 571 /// given in \p Addr, optionally masking the vector operations if \p 572 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 573 /// values in the vectorized loop. 574 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 575 ArrayRef<VPValue *> VPDefs, 576 VPTransformState &State, VPValue *Addr, 577 ArrayRef<VPValue *> StoredValues, 578 VPValue *BlockInMask, bool NeedsMaskForGaps); 579 580 /// Fix the non-induction PHIs in \p Plan. 581 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); 582 583 /// Returns true if the reordering of FP operations is not allowed, but we are 584 /// able to vectorize with strict in-order reductions for the given RdxDesc. 585 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); 586 587 // Returns the resume value (bc.merge.rdx) for a reduction as 588 // generated by fixReduction. 589 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); 590 591 /// Create a new phi node for the induction variable \p OrigPhi to resume 592 /// iteration count in the scalar epilogue, from where the vectorized loop 593 /// left off. \p Step is the SCEV-expanded induction step to use. In cases 594 /// where the loop skeleton is more complicated (i.e., epilogue vectorization) 595 /// and the resume values can come from an additional bypass block, the \p 596 /// AdditionalBypass pair provides information about the bypass block and the 597 /// end value on the edge from bypass to this loop. 598 PHINode *createInductionResumeValue( 599 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, 600 ArrayRef<BasicBlock *> BypassBlocks, 601 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 602 603 /// Returns the original loop trip count. 604 Value *getTripCount() const { return TripCount; } 605 606 /// Used to set the trip count after ILV's construction and after the 607 /// preheader block has been executed. Note that this always holds the trip 608 /// count of the original loop for both main loop and epilogue vectorization. 609 void setTripCount(Value *TC) { TripCount = TC; } 610 611 protected: 612 friend class LoopVectorizationPlanner; 613 614 /// A small list of PHINodes. 615 using PhiVector = SmallVector<PHINode *, 4>; 616 617 /// A type for scalarized values in the new loop. Each value from the 618 /// original loop, when scalarized, is represented by UF x VF scalar values 619 /// in the new unrolled loop, where UF is the unroll factor and VF is the 620 /// vectorization factor. 621 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 622 623 /// Set up the values of the IVs correctly when exiting the vector loop. 624 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 625 Value *VectorTripCount, Value *EndValue, 626 BasicBlock *MiddleBlock, BasicBlock *VectorHeader, 627 VPlan &Plan, VPTransformState &State); 628 629 /// Handle all cross-iteration phis in the header. 630 void fixCrossIterationPHIs(VPTransformState &State); 631 632 /// Create the exit value of first order recurrences in the middle block and 633 /// update their users. 634 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, 635 VPTransformState &State); 636 637 /// Create code for the loop exit value of the reduction. 638 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 639 640 /// Iteratively sink the scalarized operands of a predicated instruction into 641 /// the block that was created for it. 642 void sinkScalarOperands(Instruction *PredInst); 643 644 /// Returns (and creates if needed) the trip count of the widened loop. 645 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); 646 647 /// Returns a bitcasted value to the requested vector type. 648 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 649 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 650 const DataLayout &DL); 651 652 /// Emit a bypass check to see if the vector trip count is zero, including if 653 /// it overflows. 654 void emitIterationCountCheck(BasicBlock *Bypass); 655 656 /// Emit a bypass check to see if all of the SCEV assumptions we've 657 /// had to make are correct. Returns the block containing the checks or 658 /// nullptr if no checks have been added. 659 BasicBlock *emitSCEVChecks(BasicBlock *Bypass); 660 661 /// Emit bypass checks to check any memory assumptions we may have made. 662 /// Returns the block containing the checks or nullptr if no checks have been 663 /// added. 664 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); 665 666 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 667 /// vector loop preheader, middle block and scalar preheader. 668 void createVectorLoopSkeleton(StringRef Prefix); 669 670 /// Create new phi nodes for the induction variables to resume iteration count 671 /// in the scalar epilogue, from where the vectorized loop left off. 672 /// In cases where the loop skeleton is more complicated (eg. epilogue 673 /// vectorization) and the resume values can come from an additional bypass 674 /// block, the \p AdditionalBypass pair provides information about the bypass 675 /// block and the end value on the edge from bypass to this loop. 676 void createInductionResumeValues( 677 const SCEV2ValueTy &ExpandedSCEVs, 678 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 679 680 /// Complete the loop skeleton by adding debug MDs, creating appropriate 681 /// conditional branches in the middle block, preparing the builder and 682 /// running the verifier. Return the preheader of the completed vector loop. 683 BasicBlock *completeLoopSkeleton(); 684 685 /// Collect poison-generating recipes that may generate a poison value that is 686 /// used after vectorization, even when their operands are not poison. Those 687 /// recipes meet the following conditions: 688 /// * Contribute to the address computation of a recipe generating a widen 689 /// memory load/store (VPWidenMemoryInstructionRecipe or 690 /// VPInterleaveRecipe). 691 /// * Such a widen memory load/store has at least one underlying Instruction 692 /// that is in a basic block that needs predication and after vectorization 693 /// the generated instruction won't be predicated. 694 void collectPoisonGeneratingRecipes(VPTransformState &State); 695 696 /// Allow subclasses to override and print debug traces before/after vplan 697 /// execution, when trace information is requested. 698 virtual void printDebugTracesAtStart(){}; 699 virtual void printDebugTracesAtEnd(){}; 700 701 /// The original loop. 702 Loop *OrigLoop; 703 704 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 705 /// dynamic knowledge to simplify SCEV expressions and converts them to a 706 /// more usable form. 707 PredicatedScalarEvolution &PSE; 708 709 /// Loop Info. 710 LoopInfo *LI; 711 712 /// Dominator Tree. 713 DominatorTree *DT; 714 715 /// Target Library Info. 716 const TargetLibraryInfo *TLI; 717 718 /// Target Transform Info. 719 const TargetTransformInfo *TTI; 720 721 /// Assumption Cache. 722 AssumptionCache *AC; 723 724 /// Interface to emit optimization remarks. 725 OptimizationRemarkEmitter *ORE; 726 727 /// The vectorization SIMD factor to use. Each vector will have this many 728 /// vector elements. 729 ElementCount VF; 730 731 ElementCount MinProfitableTripCount; 732 733 /// The vectorization unroll factor to use. Each scalar is vectorized to this 734 /// many different vector instructions. 735 unsigned UF; 736 737 /// The builder that we use 738 IRBuilder<> Builder; 739 740 // --- Vectorization state --- 741 742 /// The vector-loop preheader. 743 BasicBlock *LoopVectorPreHeader; 744 745 /// The scalar-loop preheader. 746 BasicBlock *LoopScalarPreHeader; 747 748 /// Middle Block between the vector and the scalar. 749 BasicBlock *LoopMiddleBlock; 750 751 /// The unique ExitBlock of the scalar loop if one exists. Note that 752 /// there can be multiple exiting edges reaching this block. 753 BasicBlock *LoopExitBlock; 754 755 /// The scalar loop body. 756 BasicBlock *LoopScalarBody; 757 758 /// A list of all bypass blocks. The first block is the entry of the loop. 759 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 760 761 /// Store instructions that were predicated. 762 SmallVector<Instruction *, 4> PredicatedInstructions; 763 764 /// Trip count of the original loop. 765 Value *TripCount = nullptr; 766 767 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 768 Value *VectorTripCount = nullptr; 769 770 /// The legality analysis. 771 LoopVectorizationLegality *Legal; 772 773 /// The profitablity analysis. 774 LoopVectorizationCostModel *Cost; 775 776 // Record whether runtime checks are added. 777 bool AddedSafetyChecks = false; 778 779 // Holds the end values for each induction variable. We save the end values 780 // so we can later fix-up the external users of the induction variables. 781 DenseMap<PHINode *, Value *> IVEndValues; 782 783 /// BFI and PSI are used to check for profile guided size optimizations. 784 BlockFrequencyInfo *BFI; 785 ProfileSummaryInfo *PSI; 786 787 // Whether this loop should be optimized for size based on profile guided size 788 // optimizatios. 789 bool OptForSizeBasedOnProfile; 790 791 /// Structure to hold information about generated runtime checks, responsible 792 /// for cleaning the checks, if vectorization turns out unprofitable. 793 GeneratedRTChecks &RTChecks; 794 795 // Holds the resume values for reductions in the loops, used to set the 796 // correct start value of reduction PHIs when vectorizing the epilogue. 797 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4> 798 ReductionResumeValues; 799 }; 800 801 class InnerLoopUnroller : public InnerLoopVectorizer { 802 public: 803 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 804 LoopInfo *LI, DominatorTree *DT, 805 const TargetLibraryInfo *TLI, 806 const TargetTransformInfo *TTI, AssumptionCache *AC, 807 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 808 LoopVectorizationLegality *LVL, 809 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 810 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 811 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 812 ElementCount::getFixed(1), 813 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 814 BFI, PSI, Check) {} 815 }; 816 817 /// Encapsulate information regarding vectorization of a loop and its epilogue. 818 /// This information is meant to be updated and used across two stages of 819 /// epilogue vectorization. 820 struct EpilogueLoopVectorizationInfo { 821 ElementCount MainLoopVF = ElementCount::getFixed(0); 822 unsigned MainLoopUF = 0; 823 ElementCount EpilogueVF = ElementCount::getFixed(0); 824 unsigned EpilogueUF = 0; 825 BasicBlock *MainLoopIterationCountCheck = nullptr; 826 BasicBlock *EpilogueIterationCountCheck = nullptr; 827 BasicBlock *SCEVSafetyCheck = nullptr; 828 BasicBlock *MemSafetyCheck = nullptr; 829 Value *TripCount = nullptr; 830 Value *VectorTripCount = nullptr; 831 832 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, 833 ElementCount EVF, unsigned EUF) 834 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { 835 assert(EUF == 1 && 836 "A high UF for the epilogue loop is likely not beneficial."); 837 } 838 }; 839 840 /// An extension of the inner loop vectorizer that creates a skeleton for a 841 /// vectorized loop that has its epilogue (residual) also vectorized. 842 /// The idea is to run the vplan on a given loop twice, firstly to setup the 843 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 844 /// from the first step and vectorize the epilogue. This is achieved by 845 /// deriving two concrete strategy classes from this base class and invoking 846 /// them in succession from the loop vectorizer planner. 847 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 848 public: 849 InnerLoopAndEpilogueVectorizer( 850 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 851 DominatorTree *DT, const TargetLibraryInfo *TLI, 852 const TargetTransformInfo *TTI, AssumptionCache *AC, 853 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 854 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 855 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 856 GeneratedRTChecks &Checks) 857 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 858 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, 859 CM, BFI, PSI, Checks), 860 EPI(EPI) {} 861 862 // Override this function to handle the more complex control flow around the 863 // three loops. 864 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton( 865 const SCEV2ValueTy &ExpandedSCEVs) final { 866 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); 867 } 868 869 /// The interface for creating a vectorized skeleton using one of two 870 /// different strategies, each corresponding to one execution of the vplan 871 /// as described above. 872 virtual std::pair<BasicBlock *, Value *> 873 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0; 874 875 /// Holds and updates state information required to vectorize the main loop 876 /// and its epilogue in two separate passes. This setup helps us avoid 877 /// regenerating and recomputing runtime safety checks. It also helps us to 878 /// shorten the iteration-count-check path length for the cases where the 879 /// iteration count of the loop is so small that the main vector loop is 880 /// completely skipped. 881 EpilogueLoopVectorizationInfo &EPI; 882 }; 883 884 /// A specialized derived class of inner loop vectorizer that performs 885 /// vectorization of *main* loops in the process of vectorizing loops and their 886 /// epilogues. 887 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 888 public: 889 EpilogueVectorizerMainLoop( 890 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 891 DominatorTree *DT, const TargetLibraryInfo *TLI, 892 const TargetTransformInfo *TTI, AssumptionCache *AC, 893 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 894 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 895 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 896 GeneratedRTChecks &Check) 897 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 898 EPI, LVL, CM, BFI, PSI, Check) {} 899 /// Implements the interface for creating a vectorized skeleton using the 900 /// *main loop* strategy (ie the first pass of vplan execution). 901 std::pair<BasicBlock *, Value *> 902 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; 903 904 protected: 905 /// Emits an iteration count bypass check once for the main loop (when \p 906 /// ForEpilogue is false) and once for the epilogue loop (when \p 907 /// ForEpilogue is true). 908 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); 909 void printDebugTracesAtStart() override; 910 void printDebugTracesAtEnd() override; 911 }; 912 913 // A specialized derived class of inner loop vectorizer that performs 914 // vectorization of *epilogue* loops in the process of vectorizing loops and 915 // their epilogues. 916 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 917 public: 918 EpilogueVectorizerEpilogueLoop( 919 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 920 DominatorTree *DT, const TargetLibraryInfo *TLI, 921 const TargetTransformInfo *TTI, AssumptionCache *AC, 922 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 923 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 924 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 925 GeneratedRTChecks &Checks) 926 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 927 EPI, LVL, CM, BFI, PSI, Checks) { 928 TripCount = EPI.TripCount; 929 } 930 /// Implements the interface for creating a vectorized skeleton using the 931 /// *epilogue loop* strategy (ie the second pass of vplan execution). 932 std::pair<BasicBlock *, Value *> 933 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; 934 935 protected: 936 /// Emits an iteration count bypass check after the main vector loop has 937 /// finished to see if there are any iterations left to execute by either 938 /// the vector epilogue or the scalar epilogue. 939 BasicBlock *emitMinimumVectorEpilogueIterCountCheck( 940 BasicBlock *Bypass, 941 BasicBlock *Insert); 942 void printDebugTracesAtStart() override; 943 void printDebugTracesAtEnd() override; 944 }; 945 } // end namespace llvm 946 947 /// Look for a meaningful debug location on the instruction or it's 948 /// operands. 949 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { 950 if (!I) 951 return DebugLoc(); 952 953 DebugLoc Empty; 954 if (I->getDebugLoc() != Empty) 955 return I->getDebugLoc(); 956 957 for (Use &Op : I->operands()) { 958 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 959 if (OpInst->getDebugLoc() != Empty) 960 return OpInst->getDebugLoc(); 961 } 962 963 return I->getDebugLoc(); 964 } 965 966 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 967 /// is passed, the message relates to that particular instruction. 968 #ifndef NDEBUG 969 static void debugVectorizationMessage(const StringRef Prefix, 970 const StringRef DebugMsg, 971 Instruction *I) { 972 dbgs() << "LV: " << Prefix << DebugMsg; 973 if (I != nullptr) 974 dbgs() << " " << *I; 975 else 976 dbgs() << '.'; 977 dbgs() << '\n'; 978 } 979 #endif 980 981 /// Create an analysis remark that explains why vectorization failed 982 /// 983 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 984 /// RemarkName is the identifier for the remark. If \p I is passed it is an 985 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 986 /// the location of the remark. \return the remark object that can be 987 /// streamed to. 988 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 989 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 990 Value *CodeRegion = TheLoop->getHeader(); 991 DebugLoc DL = TheLoop->getStartLoc(); 992 993 if (I) { 994 CodeRegion = I->getParent(); 995 // If there is no debug location attached to the instruction, revert back to 996 // using the loop's. 997 if (I->getDebugLoc()) 998 DL = I->getDebugLoc(); 999 } 1000 1001 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 1002 } 1003 1004 namespace llvm { 1005 1006 /// Return a value for Step multiplied by VF. 1007 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, 1008 int64_t Step) { 1009 assert(Ty->isIntegerTy() && "Expected an integer step"); 1010 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); 1011 } 1012 1013 /// Return the runtime value for VF. 1014 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { 1015 return B.CreateElementCount(Ty, VF); 1016 } 1017 1018 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, 1019 Loop *OrigLoop) { 1020 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 1021 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count"); 1022 1023 ScalarEvolution &SE = *PSE.getSE(); 1024 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop); 1025 } 1026 1027 void reportVectorizationFailure(const StringRef DebugMsg, 1028 const StringRef OREMsg, const StringRef ORETag, 1029 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1030 Instruction *I) { 1031 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1032 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1033 ORE->emit( 1034 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1035 << "loop not vectorized: " << OREMsg); 1036 } 1037 1038 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1039 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1040 Instruction *I) { 1041 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1042 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1043 ORE->emit( 1044 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1045 << Msg); 1046 } 1047 1048 /// Report successful vectorization of the loop. In case an outer loop is 1049 /// vectorized, prepend "outer" to the vectorization remark. 1050 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1051 VectorizationFactor VF, unsigned IC) { 1052 LLVM_DEBUG(debugVectorizationMessage( 1053 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop", 1054 nullptr)); 1055 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer "; 1056 ORE->emit([&]() { 1057 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(), 1058 TheLoop->getHeader()) 1059 << "vectorized " << LoopType << "loop (vectorization width: " 1060 << ore::NV("VectorizationFactor", VF.Width) 1061 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")"; 1062 }); 1063 } 1064 1065 } // end namespace llvm 1066 1067 #ifndef NDEBUG 1068 /// \return string containing a file name and a line # for the given loop. 1069 static std::string getDebugLocString(const Loop *L) { 1070 std::string Result; 1071 if (L) { 1072 raw_string_ostream OS(Result); 1073 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1074 LoopDbgLoc.print(OS); 1075 else 1076 // Just print the module name. 1077 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1078 OS.flush(); 1079 } 1080 return Result; 1081 } 1082 #endif 1083 1084 void InnerLoopVectorizer::collectPoisonGeneratingRecipes( 1085 VPTransformState &State) { 1086 1087 // Collect recipes in the backward slice of `Root` that may generate a poison 1088 // value that is used after vectorization. 1089 SmallPtrSet<VPRecipeBase *, 16> Visited; 1090 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { 1091 SmallVector<VPRecipeBase *, 16> Worklist; 1092 Worklist.push_back(Root); 1093 1094 // Traverse the backward slice of Root through its use-def chain. 1095 while (!Worklist.empty()) { 1096 VPRecipeBase *CurRec = Worklist.back(); 1097 Worklist.pop_back(); 1098 1099 if (!Visited.insert(CurRec).second) 1100 continue; 1101 1102 // Prune search if we find another recipe generating a widen memory 1103 // instruction. Widen memory instructions involved in address computation 1104 // will lead to gather/scatter instructions, which don't need to be 1105 // handled. 1106 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || 1107 isa<VPInterleaveRecipe>(CurRec) || 1108 isa<VPScalarIVStepsRecipe>(CurRec) || 1109 isa<VPCanonicalIVPHIRecipe>(CurRec) || 1110 isa<VPActiveLaneMaskPHIRecipe>(CurRec)) 1111 continue; 1112 1113 // This recipe contributes to the address computation of a widen 1114 // load/store. If the underlying instruction has poison-generating flags, 1115 // drop them directly. 1116 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) { 1117 RecWithFlags->dropPoisonGeneratingFlags(); 1118 } else { 1119 Instruction *Instr = dyn_cast_or_null<Instruction>( 1120 CurRec->getVPSingleValue()->getUnderlyingValue()); 1121 (void)Instr; 1122 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && 1123 "found instruction with poison generating flags not covered by " 1124 "VPRecipeWithIRFlags"); 1125 } 1126 1127 // Add new definitions to the worklist. 1128 for (VPValue *operand : CurRec->operands()) 1129 if (VPRecipeBase *OpDef = operand->getDefiningRecipe()) 1130 Worklist.push_back(OpDef); 1131 } 1132 }); 1133 1134 // Traverse all the recipes in the VPlan and collect the poison-generating 1135 // recipes in the backward slice starting at the address of a VPWidenRecipe or 1136 // VPInterleaveRecipe. 1137 auto Iter = vp_depth_first_deep(State.Plan->getEntry()); 1138 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 1139 for (VPRecipeBase &Recipe : *VPBB) { 1140 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { 1141 Instruction &UnderlyingInstr = WidenRec->getIngredient(); 1142 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); 1143 if (AddrDef && WidenRec->isConsecutive() && 1144 Legal->blockNeedsPredication(UnderlyingInstr.getParent())) 1145 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); 1146 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { 1147 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); 1148 if (AddrDef) { 1149 // Check if any member of the interleave group needs predication. 1150 const InterleaveGroup<Instruction> *InterGroup = 1151 InterleaveRec->getInterleaveGroup(); 1152 bool NeedPredication = false; 1153 for (int I = 0, NumMembers = InterGroup->getNumMembers(); 1154 I < NumMembers; ++I) { 1155 Instruction *Member = InterGroup->getMember(I); 1156 if (Member) 1157 NeedPredication |= 1158 Legal->blockNeedsPredication(Member->getParent()); 1159 } 1160 1161 if (NeedPredication) 1162 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); 1163 } 1164 } 1165 } 1166 } 1167 } 1168 1169 PHINode *InnerLoopVectorizer::getReductionResumeValue( 1170 const RecurrenceDescriptor &RdxDesc) { 1171 auto It = ReductionResumeValues.find(&RdxDesc); 1172 assert(It != ReductionResumeValues.end() && 1173 "Expected to find a resume value for the reduction."); 1174 return It->second; 1175 } 1176 1177 namespace llvm { 1178 1179 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1180 // lowered. 1181 enum ScalarEpilogueLowering { 1182 1183 // The default: allowing scalar epilogues. 1184 CM_ScalarEpilogueAllowed, 1185 1186 // Vectorization with OptForSize: don't allow epilogues. 1187 CM_ScalarEpilogueNotAllowedOptSize, 1188 1189 // A special case of vectorisation with OptForSize: loops with a very small 1190 // trip count are considered for vectorization under OptForSize, thereby 1191 // making sure the cost of their loop body is dominant, free of runtime 1192 // guards and scalar iteration overheads. 1193 CM_ScalarEpilogueNotAllowedLowTripLoop, 1194 1195 // Loop hint predicate indicating an epilogue is undesired. 1196 CM_ScalarEpilogueNotNeededUsePredicate, 1197 1198 // Directive indicating we must either tail fold or not vectorize 1199 CM_ScalarEpilogueNotAllowedUsePredicate 1200 }; 1201 1202 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1203 1204 /// LoopVectorizationCostModel - estimates the expected speedups due to 1205 /// vectorization. 1206 /// In many cases vectorization is not profitable. This can happen because of 1207 /// a number of reasons. In this class we mainly attempt to predict the 1208 /// expected speedup/slowdowns due to the supported instruction set. We use the 1209 /// TargetTransformInfo to query the different backends for the cost of 1210 /// different operations. 1211 class LoopVectorizationCostModel { 1212 public: 1213 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1214 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1215 LoopVectorizationLegality *Legal, 1216 const TargetTransformInfo &TTI, 1217 const TargetLibraryInfo *TLI, DemandedBits *DB, 1218 AssumptionCache *AC, 1219 OptimizationRemarkEmitter *ORE, const Function *F, 1220 const LoopVectorizeHints *Hints, 1221 InterleavedAccessInfo &IAI) 1222 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1223 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1224 Hints(Hints), InterleaveInfo(IAI) {} 1225 1226 /// \return An upper bound for the vectorization factors (both fixed and 1227 /// scalable). If the factors are 0, vectorization and interleaving should be 1228 /// avoided up front. 1229 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1230 1231 /// \return True if runtime checks are required for vectorization, and false 1232 /// otherwise. 1233 bool runtimeChecksRequired(); 1234 1235 /// Setup cost-based decisions for user vectorization factor. 1236 /// \return true if the UserVF is a feasible VF to be chosen. 1237 bool selectUserVectorizationFactor(ElementCount UserVF) { 1238 collectUniformsAndScalars(UserVF); 1239 collectInstsToScalarize(UserVF); 1240 return expectedCost(UserVF).first.isValid(); 1241 } 1242 1243 /// \return The size (in bits) of the smallest and widest types in the code 1244 /// that needs to be vectorized. We ignore values that remain scalar such as 1245 /// 64 bit loop indices. 1246 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1247 1248 /// \return The desired interleave count. 1249 /// If interleave count has been specified by metadata it will be returned. 1250 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1251 /// are the selected vectorization factor and the cost of the selected VF. 1252 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); 1253 1254 /// Memory access instruction may be vectorized in more than one way. 1255 /// Form of instruction after vectorization depends on cost. 1256 /// This function takes cost-based decisions for Load/Store instructions 1257 /// and collects them in a map. This decisions map is used for building 1258 /// the lists of loop-uniform and loop-scalar instructions. 1259 /// The calculated cost is saved with widening decision in order to 1260 /// avoid redundant calculations. 1261 void setCostBasedWideningDecision(ElementCount VF); 1262 1263 /// A call may be vectorized in different ways depending on whether we have 1264 /// vectorized variants available and whether the target supports masking. 1265 /// This function analyzes all calls in the function at the supplied VF, 1266 /// makes a decision based on the costs of available options, and stores that 1267 /// decision in a map for use in planning and plan execution. 1268 void setVectorizedCallDecision(ElementCount VF); 1269 1270 /// A struct that represents some properties of the register usage 1271 /// of a loop. 1272 struct RegisterUsage { 1273 /// Holds the number of loop invariant values that are used in the loop. 1274 /// The key is ClassID of target-provided register class. 1275 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1276 /// Holds the maximum number of concurrent live intervals in the loop. 1277 /// The key is ClassID of target-provided register class. 1278 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1279 }; 1280 1281 /// \return Returns information about the register usages of the loop for the 1282 /// given vectorization factors. 1283 SmallVector<RegisterUsage, 8> 1284 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1285 1286 /// Collect values we want to ignore in the cost model. 1287 void collectValuesToIgnore(); 1288 1289 /// Collect all element types in the loop for which widening is needed. 1290 void collectElementTypesForWidening(); 1291 1292 /// Split reductions into those that happen in the loop, and those that happen 1293 /// outside. In loop reductions are collected into InLoopReductions. 1294 void collectInLoopReductions(); 1295 1296 /// Returns true if we should use strict in-order reductions for the given 1297 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1298 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1299 /// of FP operations. 1300 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { 1301 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1302 } 1303 1304 /// \returns The smallest bitwidth each instruction can be represented with. 1305 /// The vector equivalents of these instructions should be truncated to this 1306 /// type. 1307 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1308 return MinBWs; 1309 } 1310 1311 /// \returns True if it is more profitable to scalarize instruction \p I for 1312 /// vectorization factor \p VF. 1313 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1314 assert(VF.isVector() && 1315 "Profitable to scalarize relevant only for VF > 1."); 1316 1317 // Cost model is not run in the VPlan-native path - return conservative 1318 // result until this changes. 1319 if (EnableVPlanNativePath) 1320 return false; 1321 1322 auto Scalars = InstsToScalarize.find(VF); 1323 assert(Scalars != InstsToScalarize.end() && 1324 "VF not yet analyzed for scalarization profitability"); 1325 return Scalars->second.contains(I); 1326 } 1327 1328 /// Returns true if \p I is known to be uniform after vectorization. 1329 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1330 // Pseudo probe needs to be duplicated for each unrolled iteration and 1331 // vector lane so that profiled loop trip count can be accurately 1332 // accumulated instead of being under counted. 1333 if (isa<PseudoProbeInst>(I)) 1334 return false; 1335 1336 if (VF.isScalar()) 1337 return true; 1338 1339 // Cost model is not run in the VPlan-native path - return conservative 1340 // result until this changes. 1341 if (EnableVPlanNativePath) 1342 return false; 1343 1344 auto UniformsPerVF = Uniforms.find(VF); 1345 assert(UniformsPerVF != Uniforms.end() && 1346 "VF not yet analyzed for uniformity"); 1347 return UniformsPerVF->second.count(I); 1348 } 1349 1350 /// Returns true if \p I is known to be scalar after vectorization. 1351 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1352 if (VF.isScalar()) 1353 return true; 1354 1355 // Cost model is not run in the VPlan-native path - return conservative 1356 // result until this changes. 1357 if (EnableVPlanNativePath) 1358 return false; 1359 1360 auto ScalarsPerVF = Scalars.find(VF); 1361 assert(ScalarsPerVF != Scalars.end() && 1362 "Scalar values are not calculated for VF"); 1363 return ScalarsPerVF->second.count(I); 1364 } 1365 1366 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1367 /// for vectorization factor \p VF. 1368 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1369 return VF.isVector() && MinBWs.contains(I) && 1370 !isProfitableToScalarize(I, VF) && 1371 !isScalarAfterVectorization(I, VF); 1372 } 1373 1374 /// Decision that was taken during cost calculation for memory instruction. 1375 enum InstWidening { 1376 CM_Unknown, 1377 CM_Widen, // For consecutive accesses with stride +1. 1378 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1379 CM_Interleave, 1380 CM_GatherScatter, 1381 CM_Scalarize, 1382 CM_VectorCall, 1383 CM_IntrinsicCall 1384 }; 1385 1386 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1387 /// instruction \p I and vector width \p VF. 1388 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1389 InstructionCost Cost) { 1390 assert(VF.isVector() && "Expected VF >=2"); 1391 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1392 } 1393 1394 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1395 /// interleaving group \p Grp and vector width \p VF. 1396 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1397 ElementCount VF, InstWidening W, 1398 InstructionCost Cost) { 1399 assert(VF.isVector() && "Expected VF >=2"); 1400 /// Broadcast this decicion to all instructions inside the group. 1401 /// But the cost will be assigned to one instruction only. 1402 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1403 if (auto *I = Grp->getMember(i)) { 1404 if (Grp->getInsertPos() == I) 1405 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1406 else 1407 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1408 } 1409 } 1410 } 1411 1412 /// Return the cost model decision for the given instruction \p I and vector 1413 /// width \p VF. Return CM_Unknown if this instruction did not pass 1414 /// through the cost modeling. 1415 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1416 assert(VF.isVector() && "Expected VF to be a vector VF"); 1417 // Cost model is not run in the VPlan-native path - return conservative 1418 // result until this changes. 1419 if (EnableVPlanNativePath) 1420 return CM_GatherScatter; 1421 1422 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1423 auto Itr = WideningDecisions.find(InstOnVF); 1424 if (Itr == WideningDecisions.end()) 1425 return CM_Unknown; 1426 return Itr->second.first; 1427 } 1428 1429 /// Return the vectorization cost for the given instruction \p I and vector 1430 /// width \p VF. 1431 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1432 assert(VF.isVector() && "Expected VF >=2"); 1433 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1434 assert(WideningDecisions.contains(InstOnVF) && 1435 "The cost is not calculated"); 1436 return WideningDecisions[InstOnVF].second; 1437 } 1438 1439 struct CallWideningDecision { 1440 InstWidening Kind; 1441 Function *Variant; 1442 Intrinsic::ID IID; 1443 std::optional<unsigned> MaskPos; 1444 InstructionCost Cost; 1445 }; 1446 1447 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, 1448 Function *Variant, Intrinsic::ID IID, 1449 std::optional<unsigned> MaskPos, 1450 InstructionCost Cost) { 1451 assert(!VF.isScalar() && "Expected vector VF"); 1452 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID, 1453 MaskPos, Cost}; 1454 } 1455 1456 CallWideningDecision getCallWideningDecision(CallInst *CI, 1457 ElementCount VF) const { 1458 assert(!VF.isScalar() && "Expected vector VF"); 1459 return CallWideningDecisions.at(std::make_pair(CI, VF)); 1460 } 1461 1462 /// Return True if instruction \p I is an optimizable truncate whose operand 1463 /// is an induction variable. Such a truncate will be removed by adding a new 1464 /// induction variable with the destination type. 1465 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1466 // If the instruction is not a truncate, return false. 1467 auto *Trunc = dyn_cast<TruncInst>(I); 1468 if (!Trunc) 1469 return false; 1470 1471 // Get the source and destination types of the truncate. 1472 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1473 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1474 1475 // If the truncate is free for the given types, return false. Replacing a 1476 // free truncate with an induction variable would add an induction variable 1477 // update instruction to each iteration of the loop. We exclude from this 1478 // check the primary induction variable since it will need an update 1479 // instruction regardless. 1480 Value *Op = Trunc->getOperand(0); 1481 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1482 return false; 1483 1484 // If the truncated value is not an induction variable, return false. 1485 return Legal->isInductionPhi(Op); 1486 } 1487 1488 /// Collects the instructions to scalarize for each predicated instruction in 1489 /// the loop. 1490 void collectInstsToScalarize(ElementCount VF); 1491 1492 /// Collect Uniform and Scalar values for the given \p VF. 1493 /// The sets depend on CM decision for Load/Store instructions 1494 /// that may be vectorized as interleave, gather-scatter or scalarized. 1495 /// Also make a decision on what to do about call instructions in the loop 1496 /// at that VF -- scalarize, call a known vector routine, or call a 1497 /// vector intrinsic. 1498 void collectUniformsAndScalars(ElementCount VF) { 1499 // Do the analysis once. 1500 if (VF.isScalar() || Uniforms.contains(VF)) 1501 return; 1502 setCostBasedWideningDecision(VF); 1503 setVectorizedCallDecision(VF); 1504 collectLoopUniforms(VF); 1505 collectLoopScalars(VF); 1506 } 1507 1508 /// Returns true if the target machine supports masked store operation 1509 /// for the given \p DataType and kind of access to \p Ptr. 1510 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1511 return Legal->isConsecutivePtr(DataType, Ptr) && 1512 TTI.isLegalMaskedStore(DataType, Alignment); 1513 } 1514 1515 /// Returns true if the target machine supports masked load operation 1516 /// for the given \p DataType and kind of access to \p Ptr. 1517 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1518 return Legal->isConsecutivePtr(DataType, Ptr) && 1519 TTI.isLegalMaskedLoad(DataType, Alignment); 1520 } 1521 1522 /// Returns true if the target machine can represent \p V as a masked gather 1523 /// or scatter operation. 1524 bool isLegalGatherOrScatter(Value *V, ElementCount VF) { 1525 bool LI = isa<LoadInst>(V); 1526 bool SI = isa<StoreInst>(V); 1527 if (!LI && !SI) 1528 return false; 1529 auto *Ty = getLoadStoreType(V); 1530 Align Align = getLoadStoreAlignment(V); 1531 if (VF.isVector()) 1532 Ty = VectorType::get(Ty, VF); 1533 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1534 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1535 } 1536 1537 /// Returns true if the target machine supports all of the reduction 1538 /// variables found for the given VF. 1539 bool canVectorizeReductions(ElementCount VF) const { 1540 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1541 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1542 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1543 })); 1544 } 1545 1546 /// Given costs for both strategies, return true if the scalar predication 1547 /// lowering should be used for div/rem. This incorporates an override 1548 /// option so it is not simply a cost comparison. 1549 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1550 InstructionCost SafeDivisorCost) const { 1551 switch (ForceSafeDivisor) { 1552 case cl::BOU_UNSET: 1553 return ScalarCost < SafeDivisorCost; 1554 case cl::BOU_TRUE: 1555 return false; 1556 case cl::BOU_FALSE: 1557 return true; 1558 }; 1559 llvm_unreachable("impossible case value"); 1560 } 1561 1562 /// Returns true if \p I is an instruction which requires predication and 1563 /// for which our chosen predication strategy is scalarization (i.e. we 1564 /// don't have an alternate strategy such as masking available). 1565 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1566 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1567 1568 /// Returns true if \p I is an instruction that needs to be predicated 1569 /// at runtime. The result is independent of the predication mechanism. 1570 /// Superset of instructions that return true for isScalarWithPredication. 1571 bool isPredicatedInst(Instruction *I) const; 1572 1573 /// Return the costs for our two available strategies for lowering a 1574 /// div/rem operation which requires speculating at least one lane. 1575 /// First result is for scalarization (will be invalid for scalable 1576 /// vectors); second is for the safe-divisor strategy. 1577 std::pair<InstructionCost, InstructionCost> 1578 getDivRemSpeculationCost(Instruction *I, 1579 ElementCount VF) const; 1580 1581 /// Returns true if \p I is a memory instruction with consecutive memory 1582 /// access that can be widened. 1583 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1584 1585 /// Returns true if \p I is a memory instruction in an interleaved-group 1586 /// of memory accesses that can be vectorized with wide vector loads/stores 1587 /// and shuffles. 1588 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); 1589 1590 /// Check if \p Instr belongs to any interleaved access group. 1591 bool isAccessInterleaved(Instruction *Instr) { 1592 return InterleaveInfo.isInterleaved(Instr); 1593 } 1594 1595 /// Get the interleaved access group that \p Instr belongs to. 1596 const InterleaveGroup<Instruction> * 1597 getInterleavedAccessGroup(Instruction *Instr) { 1598 return InterleaveInfo.getInterleaveGroup(Instr); 1599 } 1600 1601 /// Returns true if we're required to use a scalar epilogue for at least 1602 /// the final iteration of the original loop. 1603 bool requiresScalarEpilogue(bool IsVectorizing) const { 1604 if (!isScalarEpilogueAllowed()) 1605 return false; 1606 // If we might exit from anywhere but the latch, must run the exiting 1607 // iteration in scalar form. 1608 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1609 return true; 1610 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); 1611 } 1612 1613 /// Returns true if we're required to use a scalar epilogue for at least 1614 /// the final iteration of the original loop for all VFs in \p Range. 1615 /// A scalar epilogue must either be required for all VFs in \p Range or for 1616 /// none. 1617 bool requiresScalarEpilogue(VFRange Range) const { 1618 auto RequiresScalarEpilogue = [this](ElementCount VF) { 1619 return requiresScalarEpilogue(VF.isVector()); 1620 }; 1621 bool IsRequired = all_of(Range, RequiresScalarEpilogue); 1622 assert( 1623 (IsRequired || none_of(Range, RequiresScalarEpilogue)) && 1624 "all VFs in range must agree on whether a scalar epilogue is required"); 1625 return IsRequired; 1626 } 1627 1628 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1629 /// loop hint annotation. 1630 bool isScalarEpilogueAllowed() const { 1631 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1632 } 1633 1634 /// Returns the TailFoldingStyle that is best for the current loop. 1635 TailFoldingStyle 1636 getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { 1637 if (!CanFoldTailByMasking) 1638 return TailFoldingStyle::None; 1639 1640 if (ForceTailFoldingStyle.getNumOccurrences()) 1641 return ForceTailFoldingStyle; 1642 1643 return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); 1644 } 1645 1646 /// Returns true if all loop blocks should be masked to fold tail loop. 1647 bool foldTailByMasking() const { 1648 return getTailFoldingStyle() != TailFoldingStyle::None; 1649 } 1650 1651 /// Returns true if the instructions in this block requires predication 1652 /// for any reason, e.g. because tail folding now requires a predicate 1653 /// or because the block in the original loop was predicated. 1654 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1655 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1656 } 1657 1658 /// Returns true if the Phi is part of an inloop reduction. 1659 bool isInLoopReduction(PHINode *Phi) const { 1660 return InLoopReductions.contains(Phi); 1661 } 1662 1663 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1664 /// with factor VF. Return the cost of the instruction, including 1665 /// scalarization overhead if it's needed. 1666 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1667 1668 /// Estimate cost of a call instruction CI if it were vectorized with factor 1669 /// VF. Return the cost of the instruction, including scalarization overhead 1670 /// if it's needed. 1671 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; 1672 1673 /// Invalidates decisions already taken by the cost model. 1674 void invalidateCostModelingDecisions() { 1675 WideningDecisions.clear(); 1676 CallWideningDecisions.clear(); 1677 Uniforms.clear(); 1678 Scalars.clear(); 1679 } 1680 1681 /// The vectorization cost is a combination of the cost itself and a boolean 1682 /// indicating whether any of the contributing operations will actually 1683 /// operate on vector values after type legalization in the backend. If this 1684 /// latter value is false, then all operations will be scalarized (i.e. no 1685 /// vectorization has actually taken place). 1686 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1687 1688 /// Returns the expected execution cost. The unit of the cost does 1689 /// not matter because we use the 'cost' units to compare different 1690 /// vector widths. The cost that is returned is *not* normalized by 1691 /// the factor width. If \p Invalid is not nullptr, this function 1692 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1693 /// each instruction that has an Invalid cost for the given VF. 1694 VectorizationCostTy 1695 expectedCost(ElementCount VF, 1696 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1697 1698 bool hasPredStores() const { return NumPredStores > 0; } 1699 1700 /// Returns true if epilogue vectorization is considered profitable, and 1701 /// false otherwise. 1702 /// \p VF is the vectorization factor chosen for the original loop. 1703 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1704 1705 private: 1706 unsigned NumPredStores = 0; 1707 1708 /// \return An upper bound for the vectorization factors for both 1709 /// fixed and scalable vectorization, where the minimum-known number of 1710 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1711 /// disabled or unsupported, then the scalable part will be equal to 1712 /// ElementCount::getScalable(0). 1713 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, 1714 ElementCount UserVF, 1715 bool FoldTailByMasking); 1716 1717 /// \return the maximized element count based on the targets vector 1718 /// registers and the loop trip-count, but limited to a maximum safe VF. 1719 /// This is a helper function of computeFeasibleMaxVF. 1720 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, 1721 unsigned SmallestType, 1722 unsigned WidestType, 1723 ElementCount MaxSafeVF, 1724 bool FoldTailByMasking); 1725 1726 /// \return the maximum legal scalable VF, based on the safe max number 1727 /// of elements. 1728 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1729 1730 /// Returns the execution time cost of an instruction for a given vector 1731 /// width. Vector width of one means scalar. 1732 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1733 1734 /// The cost-computation logic from getInstructionCost which provides 1735 /// the vector type as an output parameter. 1736 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1737 Type *&VectorTy); 1738 1739 /// Return the cost of instructions in an inloop reduction pattern, if I is 1740 /// part of that pattern. 1741 std::optional<InstructionCost> 1742 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1743 TTI::TargetCostKind CostKind) const; 1744 1745 /// Calculate vectorization cost of memory instruction \p I. 1746 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1747 1748 /// The cost computation for scalarized memory instruction. 1749 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1750 1751 /// The cost computation for interleaving group of memory instructions. 1752 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1753 1754 /// The cost computation for Gather/Scatter instruction. 1755 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1756 1757 /// The cost computation for widening instruction \p I with consecutive 1758 /// memory access. 1759 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1760 1761 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1762 /// Load: scalar load + broadcast. 1763 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1764 /// element) 1765 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1766 1767 /// Estimate the overhead of scalarizing an instruction. This is a 1768 /// convenience wrapper for the type-based getScalarizationOverhead API. 1769 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, 1770 TTI::TargetCostKind CostKind) const; 1771 1772 /// Returns true if an artificially high cost for emulated masked memrefs 1773 /// should be used. 1774 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1775 1776 /// Map of scalar integer values to the smallest bitwidth they can be legally 1777 /// represented as. The vector equivalents of these values should be truncated 1778 /// to this type. 1779 MapVector<Instruction *, uint64_t> MinBWs; 1780 1781 /// A type representing the costs for instructions if they were to be 1782 /// scalarized rather than vectorized. The entries are Instruction-Cost 1783 /// pairs. 1784 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1785 1786 /// A set containing all BasicBlocks that are known to present after 1787 /// vectorization as a predicated block. 1788 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> 1789 PredicatedBBsAfterVectorization; 1790 1791 /// Records whether it is allowed to have the original scalar loop execute at 1792 /// least once. This may be needed as a fallback loop in case runtime 1793 /// aliasing/dependence checks fail, or to handle the tail/remainder 1794 /// iterations when the trip count is unknown or doesn't divide by the VF, 1795 /// or as a peel-loop to handle gaps in interleave-groups. 1796 /// Under optsize and when the trip count is very small we don't allow any 1797 /// iterations to execute in the scalar loop. 1798 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1799 1800 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1801 bool CanFoldTailByMasking = false; 1802 1803 /// A map holding scalar costs for different vectorization factors. The 1804 /// presence of a cost for an instruction in the mapping indicates that the 1805 /// instruction will be scalarized when vectorizing with the associated 1806 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1807 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1808 1809 /// Holds the instructions known to be uniform after vectorization. 1810 /// The data is collected per VF. 1811 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1812 1813 /// Holds the instructions known to be scalar after vectorization. 1814 /// The data is collected per VF. 1815 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1816 1817 /// Holds the instructions (address computations) that are forced to be 1818 /// scalarized. 1819 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1820 1821 /// PHINodes of the reductions that should be expanded in-loop. 1822 SmallPtrSet<PHINode *, 4> InLoopReductions; 1823 1824 /// A Map of inloop reduction operations and their immediate chain operand. 1825 /// FIXME: This can be removed once reductions can be costed correctly in 1826 /// VPlan. This was added to allow quick lookup of the inloop operations. 1827 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1828 1829 /// Returns the expected difference in cost from scalarizing the expression 1830 /// feeding a predicated instruction \p PredInst. The instructions to 1831 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1832 /// non-negative return value implies the expression will be scalarized. 1833 /// Currently, only single-use chains are considered for scalarization. 1834 InstructionCost computePredInstDiscount(Instruction *PredInst, 1835 ScalarCostsTy &ScalarCosts, 1836 ElementCount VF); 1837 1838 /// Collect the instructions that are uniform after vectorization. An 1839 /// instruction is uniform if we represent it with a single scalar value in 1840 /// the vectorized loop corresponding to each vector iteration. Examples of 1841 /// uniform instructions include pointer operands of consecutive or 1842 /// interleaved memory accesses. Note that although uniformity implies an 1843 /// instruction will be scalar, the reverse is not true. In general, a 1844 /// scalarized instruction will be represented by VF scalar values in the 1845 /// vectorized loop, each corresponding to an iteration of the original 1846 /// scalar loop. 1847 void collectLoopUniforms(ElementCount VF); 1848 1849 /// Collect the instructions that are scalar after vectorization. An 1850 /// instruction is scalar if it is known to be uniform or will be scalarized 1851 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1852 /// to the list if they are used by a load/store instruction that is marked as 1853 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1854 /// VF values in the vectorized loop, each corresponding to an iteration of 1855 /// the original scalar loop. 1856 void collectLoopScalars(ElementCount VF); 1857 1858 /// Keeps cost model vectorization decision and cost for instructions. 1859 /// Right now it is used for memory instructions only. 1860 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1861 std::pair<InstWidening, InstructionCost>>; 1862 1863 DecisionList WideningDecisions; 1864 1865 using CallDecisionList = 1866 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; 1867 1868 CallDecisionList CallWideningDecisions; 1869 1870 /// Returns true if \p V is expected to be vectorized and it needs to be 1871 /// extracted. 1872 bool needsExtract(Value *V, ElementCount VF) const { 1873 Instruction *I = dyn_cast<Instruction>(V); 1874 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1875 TheLoop->isLoopInvariant(I)) 1876 return false; 1877 1878 // Assume we can vectorize V (and hence we need extraction) if the 1879 // scalars are not computed yet. This can happen, because it is called 1880 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1881 // the scalars are collected. That should be a safe assumption in most 1882 // cases, because we check if the operands have vectorizable types 1883 // beforehand in LoopVectorizationLegality. 1884 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); 1885 }; 1886 1887 /// Returns a range containing only operands needing to be extracted. 1888 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1889 ElementCount VF) const { 1890 return SmallVector<Value *, 4>(make_filter_range( 1891 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1892 } 1893 1894 public: 1895 /// The loop that we evaluate. 1896 Loop *TheLoop; 1897 1898 /// Predicated scalar evolution analysis. 1899 PredicatedScalarEvolution &PSE; 1900 1901 /// Loop Info analysis. 1902 LoopInfo *LI; 1903 1904 /// Vectorization legality. 1905 LoopVectorizationLegality *Legal; 1906 1907 /// Vector target information. 1908 const TargetTransformInfo &TTI; 1909 1910 /// Target Library Info. 1911 const TargetLibraryInfo *TLI; 1912 1913 /// Demanded bits analysis. 1914 DemandedBits *DB; 1915 1916 /// Assumption cache. 1917 AssumptionCache *AC; 1918 1919 /// Interface to emit optimization remarks. 1920 OptimizationRemarkEmitter *ORE; 1921 1922 const Function *TheFunction; 1923 1924 /// Loop Vectorize Hint. 1925 const LoopVectorizeHints *Hints; 1926 1927 /// The interleave access information contains groups of interleaved accesses 1928 /// with the same stride and close to each other. 1929 InterleavedAccessInfo &InterleaveInfo; 1930 1931 /// Values to ignore in the cost model. 1932 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1933 1934 /// Values to ignore in the cost model when VF > 1. 1935 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1936 1937 /// All element types found in the loop. 1938 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1939 }; 1940 } // end namespace llvm 1941 1942 namespace { 1943 /// Helper struct to manage generating runtime checks for vectorization. 1944 /// 1945 /// The runtime checks are created up-front in temporary blocks to allow better 1946 /// estimating the cost and un-linked from the existing IR. After deciding to 1947 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1948 /// temporary blocks are completely removed. 1949 class GeneratedRTChecks { 1950 /// Basic block which contains the generated SCEV checks, if any. 1951 BasicBlock *SCEVCheckBlock = nullptr; 1952 1953 /// The value representing the result of the generated SCEV checks. If it is 1954 /// nullptr, either no SCEV checks have been generated or they have been used. 1955 Value *SCEVCheckCond = nullptr; 1956 1957 /// Basic block which contains the generated memory runtime checks, if any. 1958 BasicBlock *MemCheckBlock = nullptr; 1959 1960 /// The value representing the result of the generated memory runtime checks. 1961 /// If it is nullptr, either no memory runtime checks have been generated or 1962 /// they have been used. 1963 Value *MemRuntimeCheckCond = nullptr; 1964 1965 DominatorTree *DT; 1966 LoopInfo *LI; 1967 TargetTransformInfo *TTI; 1968 1969 SCEVExpander SCEVExp; 1970 SCEVExpander MemCheckExp; 1971 1972 bool CostTooHigh = false; 1973 const bool AddBranchWeights; 1974 1975 public: 1976 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1977 TargetTransformInfo *TTI, const DataLayout &DL, 1978 bool AddBranchWeights) 1979 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), 1980 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} 1981 1982 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1983 /// accurately estimate the cost of the runtime checks. The blocks are 1984 /// un-linked from the IR and is added back during vector code generation. If 1985 /// there is no vector code generation, the check blocks are removed 1986 /// completely. 1987 void Create(Loop *L, const LoopAccessInfo &LAI, 1988 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1989 1990 // Hard cutoff to limit compile-time increase in case a very large number of 1991 // runtime checks needs to be generated. 1992 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1993 // profile info. 1994 CostTooHigh = 1995 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1996 if (CostTooHigh) 1997 return; 1998 1999 BasicBlock *LoopHeader = L->getHeader(); 2000 BasicBlock *Preheader = L->getLoopPreheader(); 2001 2002 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 2003 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 2004 // may be used by SCEVExpander. The blocks will be un-linked from their 2005 // predecessors and removed from LI & DT at the end of the function. 2006 if (!UnionPred.isAlwaysTrue()) { 2007 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 2008 nullptr, "vector.scevcheck"); 2009 2010 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 2011 &UnionPred, SCEVCheckBlock->getTerminator()); 2012 } 2013 2014 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2015 if (RtPtrChecking.Need) { 2016 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 2017 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2018 "vector.memcheck"); 2019 2020 auto DiffChecks = RtPtrChecking.getDiffChecks(); 2021 if (DiffChecks) { 2022 Value *RuntimeVF = nullptr; 2023 MemRuntimeCheckCond = addDiffRuntimeChecks( 2024 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 2025 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 2026 if (!RuntimeVF) 2027 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 2028 return RuntimeVF; 2029 }, 2030 IC); 2031 } else { 2032 MemRuntimeCheckCond = addRuntimeChecks( 2033 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), 2034 MemCheckExp, VectorizerParams::HoistRuntimeChecks); 2035 } 2036 assert(MemRuntimeCheckCond && 2037 "no RT checks generated although RtPtrChecking " 2038 "claimed checks are required"); 2039 } 2040 2041 if (!MemCheckBlock && !SCEVCheckBlock) 2042 return; 2043 2044 // Unhook the temporary block with the checks, update various places 2045 // accordingly. 2046 if (SCEVCheckBlock) 2047 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2048 if (MemCheckBlock) 2049 MemCheckBlock->replaceAllUsesWith(Preheader); 2050 2051 if (SCEVCheckBlock) { 2052 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2053 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2054 Preheader->getTerminator()->eraseFromParent(); 2055 } 2056 if (MemCheckBlock) { 2057 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2058 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2059 Preheader->getTerminator()->eraseFromParent(); 2060 } 2061 2062 DT->changeImmediateDominator(LoopHeader, Preheader); 2063 if (MemCheckBlock) { 2064 DT->eraseNode(MemCheckBlock); 2065 LI->removeBlock(MemCheckBlock); 2066 } 2067 if (SCEVCheckBlock) { 2068 DT->eraseNode(SCEVCheckBlock); 2069 LI->removeBlock(SCEVCheckBlock); 2070 } 2071 } 2072 2073 InstructionCost getCost() { 2074 if (SCEVCheckBlock || MemCheckBlock) 2075 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 2076 2077 if (CostTooHigh) { 2078 InstructionCost Cost; 2079 Cost.setInvalid(); 2080 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 2081 return Cost; 2082 } 2083 2084 InstructionCost RTCheckCost = 0; 2085 if (SCEVCheckBlock) 2086 for (Instruction &I : *SCEVCheckBlock) { 2087 if (SCEVCheckBlock->getTerminator() == &I) 2088 continue; 2089 InstructionCost C = 2090 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2091 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2092 RTCheckCost += C; 2093 } 2094 if (MemCheckBlock) 2095 for (Instruction &I : *MemCheckBlock) { 2096 if (MemCheckBlock->getTerminator() == &I) 2097 continue; 2098 InstructionCost C = 2099 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2100 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2101 RTCheckCost += C; 2102 } 2103 2104 if (SCEVCheckBlock || MemCheckBlock) 2105 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2106 << "\n"); 2107 2108 return RTCheckCost; 2109 } 2110 2111 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2112 /// unused. 2113 ~GeneratedRTChecks() { 2114 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2115 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2116 if (!SCEVCheckCond) 2117 SCEVCleaner.markResultUsed(); 2118 2119 if (!MemRuntimeCheckCond) 2120 MemCheckCleaner.markResultUsed(); 2121 2122 if (MemRuntimeCheckCond) { 2123 auto &SE = *MemCheckExp.getSE(); 2124 // Memory runtime check generation creates compares that use expanded 2125 // values. Remove them before running the SCEVExpanderCleaners. 2126 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2127 if (MemCheckExp.isInsertedInstruction(&I)) 2128 continue; 2129 SE.forgetValue(&I); 2130 I.eraseFromParent(); 2131 } 2132 } 2133 MemCheckCleaner.cleanup(); 2134 SCEVCleaner.cleanup(); 2135 2136 if (SCEVCheckCond) 2137 SCEVCheckBlock->eraseFromParent(); 2138 if (MemRuntimeCheckCond) 2139 MemCheckBlock->eraseFromParent(); 2140 } 2141 2142 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2143 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2144 /// depending on the generated condition. 2145 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2146 BasicBlock *LoopVectorPreHeader, 2147 BasicBlock *LoopExitBlock) { 2148 if (!SCEVCheckCond) 2149 return nullptr; 2150 2151 Value *Cond = SCEVCheckCond; 2152 // Mark the check as used, to prevent it from being removed during cleanup. 2153 SCEVCheckCond = nullptr; 2154 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2155 if (C->isZero()) 2156 return nullptr; 2157 2158 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2159 2160 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2161 // Create new preheader for vector loop. 2162 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2163 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2164 2165 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2166 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2167 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2168 SCEVCheckBlock); 2169 2170 DT->addNewBlock(SCEVCheckBlock, Pred); 2171 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2172 2173 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); 2174 if (AddBranchWeights) 2175 setBranchWeights(BI, SCEVCheckBypassWeights); 2176 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); 2177 return SCEVCheckBlock; 2178 } 2179 2180 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2181 /// the branches to branch to the vector preheader or \p Bypass, depending on 2182 /// the generated condition. 2183 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2184 BasicBlock *LoopVectorPreHeader) { 2185 // Check if we generated code that checks in runtime if arrays overlap. 2186 if (!MemRuntimeCheckCond) 2187 return nullptr; 2188 2189 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2190 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2191 MemCheckBlock); 2192 2193 DT->addNewBlock(MemCheckBlock, Pred); 2194 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2195 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2196 2197 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2198 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2199 2200 BranchInst &BI = 2201 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); 2202 if (AddBranchWeights) { 2203 setBranchWeights(BI, MemCheckBypassWeights); 2204 } 2205 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); 2206 MemCheckBlock->getTerminator()->setDebugLoc( 2207 Pred->getTerminator()->getDebugLoc()); 2208 2209 // Mark the check as used, to prevent it from being removed during cleanup. 2210 MemRuntimeCheckCond = nullptr; 2211 return MemCheckBlock; 2212 } 2213 }; 2214 } // namespace 2215 2216 static bool useActiveLaneMask(TailFoldingStyle Style) { 2217 return Style == TailFoldingStyle::Data || 2218 Style == TailFoldingStyle::DataAndControlFlow || 2219 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2220 } 2221 2222 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2223 return Style == TailFoldingStyle::DataAndControlFlow || 2224 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2225 } 2226 2227 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2228 // vectorization. The loop needs to be annotated with #pragma omp simd 2229 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2230 // vector length information is not provided, vectorization is not considered 2231 // explicit. Interleave hints are not allowed either. These limitations will be 2232 // relaxed in the future. 2233 // Please, note that we are currently forced to abuse the pragma 'clang 2234 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2235 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2236 // provides *explicit vectorization hints* (LV can bypass legal checks and 2237 // assume that vectorization is legal). However, both hints are implemented 2238 // using the same metadata (llvm.loop.vectorize, processed by 2239 // LoopVectorizeHints). This will be fixed in the future when the native IR 2240 // representation for pragma 'omp simd' is introduced. 2241 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2242 OptimizationRemarkEmitter *ORE) { 2243 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2244 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2245 2246 // Only outer loops with an explicit vectorization hint are supported. 2247 // Unannotated outer loops are ignored. 2248 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2249 return false; 2250 2251 Function *Fn = OuterLp->getHeader()->getParent(); 2252 if (!Hints.allowVectorization(Fn, OuterLp, 2253 true /*VectorizeOnlyWhenForced*/)) { 2254 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2255 return false; 2256 } 2257 2258 if (Hints.getInterleave() > 1) { 2259 // TODO: Interleave support is future work. 2260 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2261 "outer loops.\n"); 2262 Hints.emitRemarkWithHints(); 2263 return false; 2264 } 2265 2266 return true; 2267 } 2268 2269 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2270 OptimizationRemarkEmitter *ORE, 2271 SmallVectorImpl<Loop *> &V) { 2272 // Collect inner loops and outer loops without irreducible control flow. For 2273 // now, only collect outer loops that have explicit vectorization hints. If we 2274 // are stress testing the VPlan H-CFG construction, we collect the outermost 2275 // loop of every loop nest. 2276 if (L.isInnermost() || VPlanBuildStressTest || 2277 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2278 LoopBlocksRPO RPOT(&L); 2279 RPOT.perform(LI); 2280 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2281 V.push_back(&L); 2282 // TODO: Collect inner loops inside marked outer loops in case 2283 // vectorization fails for the outer loop. Do not invoke 2284 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2285 // already known to be reducible. We can use an inherited attribute for 2286 // that. 2287 return; 2288 } 2289 } 2290 for (Loop *InnerL : L) 2291 collectSupportedLoops(*InnerL, LI, ORE, V); 2292 } 2293 2294 //===----------------------------------------------------------------------===// 2295 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2296 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2297 //===----------------------------------------------------------------------===// 2298 2299 /// Compute the transformed value of Index at offset StartValue using step 2300 /// StepValue. 2301 /// For integer induction, returns StartValue + Index * StepValue. 2302 /// For pointer induction, returns StartValue[Index * StepValue]. 2303 /// FIXME: The newly created binary instructions should contain nsw/nuw 2304 /// flags, which can be found from the original scalar operations. 2305 static Value * 2306 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, 2307 Value *Step, 2308 InductionDescriptor::InductionKind InductionKind, 2309 const BinaryOperator *InductionBinOp) { 2310 Type *StepTy = Step->getType(); 2311 Value *CastedIndex = StepTy->isIntegerTy() 2312 ? B.CreateSExtOrTrunc(Index, StepTy) 2313 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2314 if (CastedIndex != Index) { 2315 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2316 Index = CastedIndex; 2317 } 2318 2319 // Note: the IR at this point is broken. We cannot use SE to create any new 2320 // SCEV and then expand it, hoping that SCEV's simplification will give us 2321 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2322 // lead to various SCEV crashes. So all we can do is to use builder and rely 2323 // on InstCombine for future simplifications. Here we handle some trivial 2324 // cases only. 2325 auto CreateAdd = [&B](Value *X, Value *Y) { 2326 assert(X->getType() == Y->getType() && "Types don't match!"); 2327 if (auto *CX = dyn_cast<ConstantInt>(X)) 2328 if (CX->isZero()) 2329 return Y; 2330 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2331 if (CY->isZero()) 2332 return X; 2333 return B.CreateAdd(X, Y); 2334 }; 2335 2336 // We allow X to be a vector type, in which case Y will potentially be 2337 // splatted into a vector with the same element count. 2338 auto CreateMul = [&B](Value *X, Value *Y) { 2339 assert(X->getType()->getScalarType() == Y->getType() && 2340 "Types don't match!"); 2341 if (auto *CX = dyn_cast<ConstantInt>(X)) 2342 if (CX->isOne()) 2343 return Y; 2344 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2345 if (CY->isOne()) 2346 return X; 2347 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2348 if (XVTy && !isa<VectorType>(Y->getType())) 2349 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2350 return B.CreateMul(X, Y); 2351 }; 2352 2353 switch (InductionKind) { 2354 case InductionDescriptor::IK_IntInduction: { 2355 assert(!isa<VectorType>(Index->getType()) && 2356 "Vector indices not supported for integer inductions yet"); 2357 assert(Index->getType() == StartValue->getType() && 2358 "Index type does not match StartValue type"); 2359 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2360 return B.CreateSub(StartValue, Index); 2361 auto *Offset = CreateMul(Index, Step); 2362 return CreateAdd(StartValue, Offset); 2363 } 2364 case InductionDescriptor::IK_PtrInduction: { 2365 return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step)); 2366 } 2367 case InductionDescriptor::IK_FpInduction: { 2368 assert(!isa<VectorType>(Index->getType()) && 2369 "Vector indices not supported for FP inductions yet"); 2370 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2371 assert(InductionBinOp && 2372 (InductionBinOp->getOpcode() == Instruction::FAdd || 2373 InductionBinOp->getOpcode() == Instruction::FSub) && 2374 "Original bin op should be defined for FP induction"); 2375 2376 Value *MulExp = B.CreateFMul(Step, Index); 2377 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2378 "induction"); 2379 } 2380 case InductionDescriptor::IK_NoInduction: 2381 return nullptr; 2382 } 2383 llvm_unreachable("invalid enum"); 2384 } 2385 2386 std::optional<unsigned> getMaxVScale(const Function &F, 2387 const TargetTransformInfo &TTI) { 2388 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) 2389 return MaxVScale; 2390 2391 if (F.hasFnAttribute(Attribute::VScaleRange)) 2392 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2393 2394 return std::nullopt; 2395 } 2396 2397 /// For the given VF and UF and maximum trip count computed for the loop, return 2398 /// whether the induction variable might overflow in the vectorized loop. If not, 2399 /// then we know a runtime overflow check always evaluates to false and can be 2400 /// removed. 2401 static bool isIndvarOverflowCheckKnownFalse( 2402 const LoopVectorizationCostModel *Cost, 2403 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2404 // Always be conservative if we don't know the exact unroll factor. 2405 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); 2406 2407 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2408 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2409 2410 // We know the runtime overflow check is known false iff the (max) trip-count 2411 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2412 // the vector loop induction variable. 2413 if (unsigned TC = 2414 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { 2415 uint64_t MaxVF = VF.getKnownMinValue(); 2416 if (VF.isScalable()) { 2417 std::optional<unsigned> MaxVScale = 2418 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2419 if (!MaxVScale) 2420 return false; 2421 MaxVF *= *MaxVScale; 2422 } 2423 2424 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2425 } 2426 2427 return false; 2428 } 2429 2430 // Return whether we allow using masked interleave-groups (for dealing with 2431 // strided loads/stores that reside in predicated blocks, or for dealing 2432 // with gaps). 2433 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2434 // If an override option has been passed in for interleaved accesses, use it. 2435 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2436 return EnableMaskedInterleavedMemAccesses; 2437 2438 return TTI.enableMaskedInterleavedAccessVectorization(); 2439 } 2440 2441 // Try to vectorize the interleave group that \p Instr belongs to. 2442 // 2443 // E.g. Translate following interleaved load group (factor = 3): 2444 // for (i = 0; i < N; i+=3) { 2445 // R = Pic[i]; // Member of index 0 2446 // G = Pic[i+1]; // Member of index 1 2447 // B = Pic[i+2]; // Member of index 2 2448 // ... // do something to R, G, B 2449 // } 2450 // To: 2451 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2452 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2453 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2454 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2455 // 2456 // Or translate following interleaved store group (factor = 3): 2457 // for (i = 0; i < N; i+=3) { 2458 // ... do something to R, G, B 2459 // Pic[i] = R; // Member of index 0 2460 // Pic[i+1] = G; // Member of index 1 2461 // Pic[i+2] = B; // Member of index 2 2462 // } 2463 // To: 2464 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2465 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2466 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2467 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2468 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2469 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2470 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2471 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2472 VPValue *BlockInMask, bool NeedsMaskForGaps) { 2473 Instruction *Instr = Group->getInsertPos(); 2474 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2475 2476 // Prepare for the vector type of the interleaved load/store. 2477 Type *ScalarTy = getLoadStoreType(Instr); 2478 unsigned InterleaveFactor = Group->getFactor(); 2479 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2480 2481 // Prepare for the new pointers. 2482 SmallVector<Value *, 2> AddrParts; 2483 unsigned Index = Group->getIndex(Instr); 2484 2485 // TODO: extend the masked interleaved-group support to reversed access. 2486 assert((!BlockInMask || !Group->isReverse()) && 2487 "Reversed masked interleave-group not supported."); 2488 2489 Value *Idx; 2490 // If the group is reverse, adjust the index to refer to the last vector lane 2491 // instead of the first. We adjust the index from the first vector lane, 2492 // rather than directly getting the pointer for lane VF - 1, because the 2493 // pointer operand of the interleaved access is supposed to be uniform. For 2494 // uniform instructions, we're only required to generate a value for the 2495 // first vector lane in each unroll iteration. 2496 if (Group->isReverse()) { 2497 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2498 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); 2499 Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); 2500 Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); 2501 Idx = Builder.CreateNeg(Idx); 2502 } else 2503 Idx = Builder.getInt32(-Index); 2504 2505 for (unsigned Part = 0; Part < UF; Part++) { 2506 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2507 if (auto *I = dyn_cast<Instruction>(AddrPart)) 2508 State.setDebugLocFrom(I->getDebugLoc()); 2509 2510 // Notice current instruction could be any index. Need to adjust the address 2511 // to the member of index 0. 2512 // 2513 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2514 // b = A[i]; // Member of index 0 2515 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2516 // 2517 // E.g. A[i+1] = a; // Member of index 1 2518 // A[i] = b; // Member of index 0 2519 // A[i+2] = c; // Member of index 2 (Current instruction) 2520 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2521 2522 bool InBounds = false; 2523 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2524 InBounds = gep->isInBounds(); 2525 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); 2526 AddrParts.push_back(AddrPart); 2527 } 2528 2529 State.setDebugLocFrom(Instr->getDebugLoc()); 2530 Value *PoisonVec = PoisonValue::get(VecTy); 2531 2532 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( 2533 unsigned Part, Value *MaskForGaps) -> Value * { 2534 if (VF.isScalable()) { 2535 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2536 assert(InterleaveFactor == 2 && 2537 "Unsupported deinterleave factor for scalable vectors"); 2538 auto *BlockInMaskPart = State.get(BlockInMask, Part); 2539 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; 2540 auto *MaskTy = 2541 VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); 2542 return Builder.CreateIntrinsic( 2543 MaskTy, Intrinsic::experimental_vector_interleave2, Ops, 2544 /*FMFSource=*/nullptr, "interleaved.mask"); 2545 } 2546 2547 if (!BlockInMask) 2548 return MaskForGaps; 2549 2550 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2551 Value *ShuffledMask = Builder.CreateShuffleVector( 2552 BlockInMaskPart, 2553 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2554 "interleaved.mask"); 2555 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2556 MaskForGaps) 2557 : ShuffledMask; 2558 }; 2559 2560 // Vectorize the interleaved load group. 2561 if (isa<LoadInst>(Instr)) { 2562 Value *MaskForGaps = nullptr; 2563 if (NeedsMaskForGaps) { 2564 MaskForGaps = 2565 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2566 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2567 } 2568 2569 // For each unroll part, create a wide load for the group. 2570 SmallVector<Value *, 2> NewLoads; 2571 for (unsigned Part = 0; Part < UF; Part++) { 2572 Instruction *NewLoad; 2573 if (BlockInMask || MaskForGaps) { 2574 assert(useMaskedInterleavedAccesses(*TTI) && 2575 "masked interleaved groups are not allowed."); 2576 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2577 NewLoad = 2578 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2579 GroupMask, PoisonVec, "wide.masked.vec"); 2580 } 2581 else 2582 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2583 Group->getAlign(), "wide.vec"); 2584 Group->addMetadata(NewLoad); 2585 NewLoads.push_back(NewLoad); 2586 } 2587 2588 if (VecTy->isScalableTy()) { 2589 assert(InterleaveFactor == 2 && 2590 "Unsupported deinterleave factor for scalable vectors"); 2591 2592 for (unsigned Part = 0; Part < UF; ++Part) { 2593 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2594 // so must use intrinsics to deinterleave. 2595 Value *DI = Builder.CreateIntrinsic( 2596 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], 2597 /*FMFSource=*/nullptr, "strided.vec"); 2598 unsigned J = 0; 2599 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2600 Instruction *Member = Group->getMember(I); 2601 2602 if (!Member) 2603 continue; 2604 2605 Value *StridedVec = Builder.CreateExtractValue(DI, I); 2606 // If this member has different type, cast the result type. 2607 if (Member->getType() != ScalarTy) { 2608 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2609 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2610 } 2611 2612 if (Group->isReverse()) 2613 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2614 2615 State.set(VPDefs[J], StridedVec, Part); 2616 ++J; 2617 } 2618 } 2619 2620 return; 2621 } 2622 2623 // For each member in the group, shuffle out the appropriate data from the 2624 // wide loads. 2625 unsigned J = 0; 2626 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2627 Instruction *Member = Group->getMember(I); 2628 2629 // Skip the gaps in the group. 2630 if (!Member) 2631 continue; 2632 2633 auto StrideMask = 2634 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2635 for (unsigned Part = 0; Part < UF; Part++) { 2636 Value *StridedVec = Builder.CreateShuffleVector( 2637 NewLoads[Part], StrideMask, "strided.vec"); 2638 2639 // If this member has different type, cast the result type. 2640 if (Member->getType() != ScalarTy) { 2641 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2642 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2643 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2644 } 2645 2646 if (Group->isReverse()) 2647 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2648 2649 State.set(VPDefs[J], StridedVec, Part); 2650 } 2651 ++J; 2652 } 2653 return; 2654 } 2655 2656 // The sub vector type for current instruction. 2657 auto *SubVT = VectorType::get(ScalarTy, VF); 2658 2659 // Vectorize the interleaved store group. 2660 Value *MaskForGaps = 2661 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2662 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2663 "masked interleaved groups are not allowed."); 2664 assert((!MaskForGaps || !VF.isScalable()) && 2665 "masking gaps for scalable vectors is not yet supported."); 2666 for (unsigned Part = 0; Part < UF; Part++) { 2667 // Collect the stored vector from each member. 2668 SmallVector<Value *, 4> StoredVecs; 2669 unsigned StoredIdx = 0; 2670 for (unsigned i = 0; i < InterleaveFactor; i++) { 2671 assert((Group->getMember(i) || MaskForGaps) && 2672 "Fail to get a member from an interleaved store group"); 2673 Instruction *Member = Group->getMember(i); 2674 2675 // Skip the gaps in the group. 2676 if (!Member) { 2677 Value *Undef = PoisonValue::get(SubVT); 2678 StoredVecs.push_back(Undef); 2679 continue; 2680 } 2681 2682 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2683 ++StoredIdx; 2684 2685 if (Group->isReverse()) 2686 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2687 2688 // If this member has different type, cast it to a unified type. 2689 2690 if (StoredVec->getType() != SubVT) 2691 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2692 2693 StoredVecs.push_back(StoredVec); 2694 } 2695 2696 // Interleave all the smaller vectors into one wider vector. 2697 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); 2698 Instruction *NewStoreInstr; 2699 if (BlockInMask || MaskForGaps) { 2700 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2701 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2702 Group->getAlign(), GroupMask); 2703 } else 2704 NewStoreInstr = 2705 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2706 2707 Group->addMetadata(NewStoreInstr); 2708 } 2709 } 2710 2711 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2712 VPReplicateRecipe *RepRecipe, 2713 const VPIteration &Instance, 2714 VPTransformState &State) { 2715 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2716 2717 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2718 // the first lane and part. 2719 if (isa<NoAliasScopeDeclInst>(Instr)) 2720 if (!Instance.isFirstIteration()) 2721 return; 2722 2723 // Does this instruction return a value ? 2724 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2725 2726 Instruction *Cloned = Instr->clone(); 2727 if (!IsVoidRetTy) { 2728 Cloned->setName(Instr->getName() + ".cloned"); 2729 #if !defined(NDEBUG) 2730 // Verify that VPlan type inference results agree with the type of the 2731 // generated values. 2732 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && 2733 "inferred type and type from generated instructions do not match"); 2734 #endif 2735 } 2736 2737 RepRecipe->setFlags(Cloned); 2738 2739 if (auto DL = Instr->getDebugLoc()) 2740 State.setDebugLocFrom(DL); 2741 2742 // Replace the operands of the cloned instructions with their scalar 2743 // equivalents in the new loop. 2744 for (const auto &I : enumerate(RepRecipe->operands())) { 2745 auto InputInstance = Instance; 2746 VPValue *Operand = I.value(); 2747 if (vputils::isUniformAfterVectorization(Operand)) 2748 InputInstance.Lane = VPLane::getFirstLane(); 2749 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2750 } 2751 State.addNewMetadata(Cloned, Instr); 2752 2753 // Place the cloned scalar in the new loop. 2754 State.Builder.Insert(Cloned); 2755 2756 State.set(RepRecipe, Cloned, Instance); 2757 2758 // If we just cloned a new assumption, add it the assumption cache. 2759 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2760 AC->registerAssumption(II); 2761 2762 // End if-block. 2763 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); 2764 if (IfPredicateInstr) 2765 PredicatedInstructions.push_back(Cloned); 2766 } 2767 2768 Value * 2769 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2770 if (VectorTripCount) 2771 return VectorTripCount; 2772 2773 Value *TC = getTripCount(); 2774 IRBuilder<> Builder(InsertBlock->getTerminator()); 2775 2776 Type *Ty = TC->getType(); 2777 // This is where we can make the step a runtime constant. 2778 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2779 2780 // If the tail is to be folded by masking, round the number of iterations N 2781 // up to a multiple of Step instead of rounding down. This is done by first 2782 // adding Step-1 and then rounding down. Note that it's ok if this addition 2783 // overflows: the vector induction variable will eventually wrap to zero given 2784 // that it starts at zero and its Step is a power of two; the loop will then 2785 // exit, with the last early-exit vector comparison also producing all-true. 2786 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2787 // is accounted for in emitIterationCountCheck that adds an overflow check. 2788 if (Cost->foldTailByMasking()) { 2789 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2790 "VF*UF must be a power of 2 when folding tail by masking"); 2791 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2792 TC = Builder.CreateAdd( 2793 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2794 } 2795 2796 // Now we need to generate the expression for the part of the loop that the 2797 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2798 // iterations are not required for correctness, or N - Step, otherwise. Step 2799 // is equal to the vectorization factor (number of SIMD elements) times the 2800 // unroll factor (number of SIMD instructions). 2801 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2802 2803 // There are cases where we *must* run at least one iteration in the remainder 2804 // loop. See the cost model for when this can happen. If the step evenly 2805 // divides the trip count, we set the remainder to be equal to the step. If 2806 // the step does not evenly divide the trip count, no adjustment is necessary 2807 // since there will already be scalar iterations. Note that the minimum 2808 // iterations check ensures that N >= Step. 2809 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2810 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2811 R = Builder.CreateSelect(IsZero, Step, R); 2812 } 2813 2814 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2815 2816 return VectorTripCount; 2817 } 2818 2819 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2820 const DataLayout &DL) { 2821 // Verify that V is a vector type with same number of elements as DstVTy. 2822 auto *DstFVTy = cast<VectorType>(DstVTy); 2823 auto VF = DstFVTy->getElementCount(); 2824 auto *SrcVecTy = cast<VectorType>(V->getType()); 2825 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2826 Type *SrcElemTy = SrcVecTy->getElementType(); 2827 Type *DstElemTy = DstFVTy->getElementType(); 2828 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2829 "Vector elements must have same size"); 2830 2831 // Do a direct cast if element types are castable. 2832 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2833 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2834 } 2835 // V cannot be directly casted to desired vector type. 2836 // May happen when V is a floating point vector but DstVTy is a vector of 2837 // pointers or vice-versa. Handle this using a two-step bitcast using an 2838 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2839 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2840 "Only one type should be a pointer type"); 2841 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2842 "Only one type should be a floating point type"); 2843 Type *IntTy = 2844 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2845 auto *VecIntTy = VectorType::get(IntTy, VF); 2846 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2847 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2848 } 2849 2850 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2851 Value *Count = getTripCount(); 2852 // Reuse existing vector loop preheader for TC checks. 2853 // Note that new preheader block is generated for vector loop. 2854 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2855 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2856 2857 // Generate code to check if the loop's trip count is less than VF * UF, or 2858 // equal to it in case a scalar epilogue is required; this implies that the 2859 // vector trip count is zero. This check also covers the case where adding one 2860 // to the backedge-taken count overflowed leading to an incorrect trip count 2861 // of zero. In this case we will also jump to the scalar loop. 2862 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2863 : ICmpInst::ICMP_ULT; 2864 2865 // If tail is to be folded, vector loop takes care of all iterations. 2866 Type *CountTy = Count->getType(); 2867 Value *CheckMinIters = Builder.getFalse(); 2868 auto CreateStep = [&]() -> Value * { 2869 // Create step with max(MinProTripCount, UF * VF). 2870 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2871 return createStepForVF(Builder, CountTy, VF, UF); 2872 2873 Value *MinProfTC = 2874 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2875 if (!VF.isScalable()) 2876 return MinProfTC; 2877 return Builder.CreateBinaryIntrinsic( 2878 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2879 }; 2880 2881 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2882 if (Style == TailFoldingStyle::None) 2883 CheckMinIters = 2884 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2885 else if (VF.isScalable() && 2886 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2887 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2888 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2889 // an overflow to zero when updating induction variables and so an 2890 // additional overflow check is required before entering the vector loop. 2891 2892 // Get the maximum unsigned value for the type. 2893 Value *MaxUIntTripCount = 2894 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2895 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2896 2897 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2898 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2899 } 2900 2901 // Create new preheader for vector loop. 2902 LoopVectorPreHeader = 2903 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2904 "vector.ph"); 2905 2906 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2907 DT->getNode(Bypass)->getIDom()) && 2908 "TC check is expected to dominate Bypass"); 2909 2910 // Update dominator for Bypass & LoopExit (if needed). 2911 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2912 if (!Cost->requiresScalarEpilogue(VF.isVector())) 2913 // If there is an epilogue which must run, there's no edge from the 2914 // middle block to exit blocks and thus no need to update the immediate 2915 // dominator of the exit blocks. 2916 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2917 2918 BranchInst &BI = 2919 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 2920 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 2921 setBranchWeights(BI, MinItersBypassWeights); 2922 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 2923 LoopBypassBlocks.push_back(TCCheckBlock); 2924 } 2925 2926 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2927 BasicBlock *const SCEVCheckBlock = 2928 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 2929 if (!SCEVCheckBlock) 2930 return nullptr; 2931 2932 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2933 (OptForSizeBasedOnProfile && 2934 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2935 "Cannot SCEV check stride or overflow when optimizing for size"); 2936 2937 2938 // Update dominator only if this is first RT check. 2939 if (LoopBypassBlocks.empty()) { 2940 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2941 if (!Cost->requiresScalarEpilogue(VF.isVector())) 2942 // If there is an epilogue which must run, there's no edge from the 2943 // middle block to exit blocks and thus no need to update the immediate 2944 // dominator of the exit blocks. 2945 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2946 } 2947 2948 LoopBypassBlocks.push_back(SCEVCheckBlock); 2949 AddedSafetyChecks = true; 2950 return SCEVCheckBlock; 2951 } 2952 2953 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2954 // VPlan-native path does not do any analysis for runtime checks currently. 2955 if (EnableVPlanNativePath) 2956 return nullptr; 2957 2958 BasicBlock *const MemCheckBlock = 2959 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2960 2961 // Check if we generated code that checks in runtime if arrays overlap. We put 2962 // the checks into a separate block to make the more common case of few 2963 // elements faster. 2964 if (!MemCheckBlock) 2965 return nullptr; 2966 2967 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2968 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2969 "Cannot emit memory checks when optimizing for size, unless forced " 2970 "to vectorize."); 2971 ORE->emit([&]() { 2972 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2973 OrigLoop->getStartLoc(), 2974 OrigLoop->getHeader()) 2975 << "Code-size may be reduced by not forcing " 2976 "vectorization, or by source-code modifications " 2977 "eliminating the need for runtime checks " 2978 "(e.g., adding 'restrict')."; 2979 }); 2980 } 2981 2982 LoopBypassBlocks.push_back(MemCheckBlock); 2983 2984 AddedSafetyChecks = true; 2985 2986 return MemCheckBlock; 2987 } 2988 2989 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 2990 LoopScalarBody = OrigLoop->getHeader(); 2991 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2992 assert(LoopVectorPreHeader && "Invalid loop structure"); 2993 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 2994 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && 2995 "multiple exit loop without required epilogue?"); 2996 2997 LoopMiddleBlock = 2998 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2999 LI, nullptr, Twine(Prefix) + "middle.block"); 3000 LoopScalarPreHeader = 3001 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3002 nullptr, Twine(Prefix) + "scalar.ph"); 3003 3004 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3005 3006 // Set up the middle block terminator. Two cases: 3007 // 1) If we know that we must execute the scalar epilogue, emit an 3008 // unconditional branch. 3009 // 2) Otherwise, we must have a single unique exit block (due to how we 3010 // implement the multiple exit case). In this case, set up a conditional 3011 // branch from the middle block to the loop scalar preheader, and the 3012 // exit block. completeLoopSkeleton will update the condition to use an 3013 // iteration check, if required to decide whether to execute the remainder. 3014 BranchInst *BrInst = 3015 Cost->requiresScalarEpilogue(VF.isVector()) 3016 ? BranchInst::Create(LoopScalarPreHeader) 3017 : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3018 Builder.getTrue()); 3019 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3020 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3021 3022 // Update dominator for loop exit. During skeleton creation, only the vector 3023 // pre-header and the middle block are created. The vector loop is entirely 3024 // created during VPlan exection. 3025 if (!Cost->requiresScalarEpilogue(VF.isVector())) 3026 // If there is an epilogue which must run, there's no edge from the 3027 // middle block to exit blocks and thus no need to update the immediate 3028 // dominator of the exit blocks. 3029 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3030 } 3031 3032 PHINode *InnerLoopVectorizer::createInductionResumeValue( 3033 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, 3034 ArrayRef<BasicBlock *> BypassBlocks, 3035 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3036 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3037 assert(VectorTripCount && "Expected valid arguments"); 3038 3039 Instruction *OldInduction = Legal->getPrimaryInduction(); 3040 Value *&EndValue = IVEndValues[OrigPhi]; 3041 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3042 if (OrigPhi == OldInduction) { 3043 // We know what the end value is. 3044 EndValue = VectorTripCount; 3045 } else { 3046 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3047 3048 // Fast-math-flags propagate from the original induction instruction. 3049 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3050 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3051 3052 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), 3053 Step, II.getKind(), II.getInductionBinOp()); 3054 EndValue->setName("ind.end"); 3055 3056 // Compute the end value for the additional bypass (if applicable). 3057 if (AdditionalBypass.first) { 3058 B.SetInsertPoint(AdditionalBypass.first, 3059 AdditionalBypass.first->getFirstInsertionPt()); 3060 EndValueFromAdditionalBypass = 3061 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(), 3062 Step, II.getKind(), II.getInductionBinOp()); 3063 EndValueFromAdditionalBypass->setName("ind.end"); 3064 } 3065 } 3066 3067 // Create phi nodes to merge from the backedge-taken check block. 3068 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3069 LoopScalarPreHeader->getTerminator()); 3070 // Copy original phi DL over to the new one. 3071 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3072 3073 // The new PHI merges the original incoming value, in case of a bypass, 3074 // or the value at the end of the vectorized loop. 3075 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3076 3077 // Fix the scalar body counter (PHI node). 3078 // The old induction's phi node in the scalar body needs the truncated 3079 // value. 3080 for (BasicBlock *BB : BypassBlocks) 3081 BCResumeVal->addIncoming(II.getStartValue(), BB); 3082 3083 if (AdditionalBypass.first) 3084 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3085 EndValueFromAdditionalBypass); 3086 return BCResumeVal; 3087 } 3088 3089 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 3090 /// expansion results. 3091 static Value *getExpandedStep(const InductionDescriptor &ID, 3092 const SCEV2ValueTy &ExpandedSCEVs) { 3093 const SCEV *Step = ID.getStep(); 3094 if (auto *C = dyn_cast<SCEVConstant>(Step)) 3095 return C->getValue(); 3096 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 3097 return U->getValue(); 3098 auto I = ExpandedSCEVs.find(Step); 3099 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 3100 return I->second; 3101 } 3102 3103 void InnerLoopVectorizer::createInductionResumeValues( 3104 const SCEV2ValueTy &ExpandedSCEVs, 3105 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3106 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3107 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3108 "Inconsistent information about additional bypass."); 3109 // We are going to resume the execution of the scalar loop. 3110 // Go over all of the induction variables that we found and fix the 3111 // PHIs that are left in the scalar version of the loop. 3112 // The starting values of PHI nodes depend on the counter of the last 3113 // iteration in the vectorized loop. 3114 // If we come from a bypass edge then we need to start from the original 3115 // start value. 3116 for (const auto &InductionEntry : Legal->getInductionVars()) { 3117 PHINode *OrigPhi = InductionEntry.first; 3118 const InductionDescriptor &II = InductionEntry.second; 3119 PHINode *BCResumeVal = createInductionResumeValue( 3120 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, 3121 AdditionalBypass); 3122 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3123 } 3124 } 3125 3126 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { 3127 // The trip counts should be cached by now. 3128 Value *Count = getTripCount(); 3129 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3130 3131 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3132 3133 // Add a check in the middle block to see if we have completed 3134 // all of the iterations in the first vector loop. Three cases: 3135 // 1) If we require a scalar epilogue, there is no conditional branch as 3136 // we unconditionally branch to the scalar preheader. Do nothing. 3137 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3138 // Thus if tail is to be folded, we know we don't need to run the 3139 // remainder and we can use the previous value for the condition (true). 3140 // 3) Otherwise, construct a runtime check. 3141 if (!Cost->requiresScalarEpilogue(VF.isVector()) && 3142 !Cost->foldTailByMasking()) { 3143 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3144 // of the corresponding compare because they may have ended up with 3145 // different line numbers and we want to avoid awkward line stepping while 3146 // debugging. Eg. if the compare has got a line number inside the loop. 3147 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant 3148 // operands. Perform simplification directly on VPlan once the branch is 3149 // modeled there. 3150 IRBuilder<> B(LoopMiddleBlock->getTerminator()); 3151 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); 3152 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); 3153 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator()); 3154 BI.setCondition(CmpN); 3155 if (hasBranchWeightMD(*ScalarLatchTerm)) { 3156 // Assume that `Count % VectorTripCount` is equally distributed. 3157 unsigned TripCount = UF * VF.getKnownMinValue(); 3158 assert(TripCount > 0 && "trip count should not be zero"); 3159 const uint32_t Weights[] = {1, TripCount - 1}; 3160 setBranchWeights(BI, Weights); 3161 } 3162 } 3163 3164 #ifdef EXPENSIVE_CHECKS 3165 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3166 #endif 3167 3168 return LoopVectorPreHeader; 3169 } 3170 3171 std::pair<BasicBlock *, Value *> 3172 InnerLoopVectorizer::createVectorizedLoopSkeleton( 3173 const SCEV2ValueTy &ExpandedSCEVs) { 3174 /* 3175 In this function we generate a new loop. The new loop will contain 3176 the vectorized instructions while the old loop will continue to run the 3177 scalar remainder. 3178 3179 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 3180 / | preheader are expanded here. Eventually all required SCEV 3181 / | expansion should happen here. 3182 / v 3183 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3184 | / | 3185 | / v 3186 || [ ] <-- vector pre header. 3187 |/ | 3188 | v 3189 | [ ] \ 3190 | [ ]_| <-- vector loop (created during VPlan execution). 3191 | | 3192 | v 3193 \ -[ ] <--- middle-block. 3194 \/ | 3195 /\ v 3196 | ->[ ] <--- new preheader. 3197 | | 3198 (opt) v <-- edge from middle to exit iff epilogue is not required. 3199 | [ ] \ 3200 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3201 \ | 3202 \ v 3203 >[ ] <-- exit block(s). 3204 ... 3205 */ 3206 3207 // Create an empty vector loop, and prepare basic blocks for the runtime 3208 // checks. 3209 createVectorLoopSkeleton(""); 3210 3211 // Now, compare the new count to zero. If it is zero skip the vector loop and 3212 // jump to the scalar loop. This check also covers the case where the 3213 // backedge-taken count is uint##_max: adding one to it will overflow leading 3214 // to an incorrect trip count of zero. In this (rare) case we will also jump 3215 // to the scalar loop. 3216 emitIterationCountCheck(LoopScalarPreHeader); 3217 3218 // Generate the code to check any assumptions that we've made for SCEV 3219 // expressions. 3220 emitSCEVChecks(LoopScalarPreHeader); 3221 3222 // Generate the code that checks in runtime if arrays overlap. We put the 3223 // checks into a separate block to make the more common case of few elements 3224 // faster. 3225 emitMemRuntimeChecks(LoopScalarPreHeader); 3226 3227 // Emit phis for the new starting index of the scalar loop. 3228 createInductionResumeValues(ExpandedSCEVs); 3229 3230 return {completeLoopSkeleton(), nullptr}; 3231 } 3232 3233 // Fix up external users of the induction variable. At this point, we are 3234 // in LCSSA form, with all external PHIs that use the IV having one input value, 3235 // coming from the remainder loop. We need those PHIs to also have a correct 3236 // value for the IV when arriving directly from the middle block. 3237 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3238 const InductionDescriptor &II, 3239 Value *VectorTripCount, Value *EndValue, 3240 BasicBlock *MiddleBlock, 3241 BasicBlock *VectorHeader, VPlan &Plan, 3242 VPTransformState &State) { 3243 // There are two kinds of external IV usages - those that use the value 3244 // computed in the last iteration (the PHI) and those that use the penultimate 3245 // value (the value that feeds into the phi from the loop latch). 3246 // We allow both, but they, obviously, have different values. 3247 3248 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3249 3250 DenseMap<Value *, Value *> MissingVals; 3251 3252 // An external user of the last iteration's value should see the value that 3253 // the remainder loop uses to initialize its own IV. 3254 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3255 for (User *U : PostInc->users()) { 3256 Instruction *UI = cast<Instruction>(U); 3257 if (!OrigLoop->contains(UI)) { 3258 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3259 MissingVals[UI] = EndValue; 3260 } 3261 } 3262 3263 // An external user of the penultimate value need to see EndValue - Step. 3264 // The simplest way to get this is to recompute it from the constituent SCEVs, 3265 // that is Start + (Step * (CRD - 1)). 3266 for (User *U : OrigPhi->users()) { 3267 auto *UI = cast<Instruction>(U); 3268 if (!OrigLoop->contains(UI)) { 3269 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3270 IRBuilder<> B(MiddleBlock->getTerminator()); 3271 3272 // Fast-math-flags propagate from the original induction instruction. 3273 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3274 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3275 3276 Value *CountMinusOne = B.CreateSub( 3277 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3278 CountMinusOne->setName("cmo"); 3279 3280 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); 3281 assert(StepVPV && "step must have been expanded during VPlan execution"); 3282 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() 3283 : State.get(StepVPV, {0, 0}); 3284 Value *Escape = 3285 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, 3286 II.getKind(), II.getInductionBinOp()); 3287 Escape->setName("ind.escape"); 3288 MissingVals[UI] = Escape; 3289 } 3290 } 3291 3292 for (auto &I : MissingVals) { 3293 PHINode *PHI = cast<PHINode>(I.first); 3294 // One corner case we have to handle is two IVs "chasing" each-other, 3295 // that is %IV2 = phi [...], [ %IV1, %latch ] 3296 // In this case, if IV1 has an external use, we need to avoid adding both 3297 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3298 // don't already have an incoming value for the middle block. 3299 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3300 PHI->addIncoming(I.second, MiddleBlock); 3301 Plan.removeLiveOut(PHI); 3302 } 3303 } 3304 } 3305 3306 namespace { 3307 3308 struct CSEDenseMapInfo { 3309 static bool canHandle(const Instruction *I) { 3310 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3311 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3312 } 3313 3314 static inline Instruction *getEmptyKey() { 3315 return DenseMapInfo<Instruction *>::getEmptyKey(); 3316 } 3317 3318 static inline Instruction *getTombstoneKey() { 3319 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3320 } 3321 3322 static unsigned getHashValue(const Instruction *I) { 3323 assert(canHandle(I) && "Unknown instruction!"); 3324 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3325 I->value_op_end())); 3326 } 3327 3328 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3329 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3330 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3331 return LHS == RHS; 3332 return LHS->isIdenticalTo(RHS); 3333 } 3334 }; 3335 3336 } // end anonymous namespace 3337 3338 ///Perform cse of induction variable instructions. 3339 static void cse(BasicBlock *BB) { 3340 // Perform simple cse. 3341 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3342 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3343 if (!CSEDenseMapInfo::canHandle(&In)) 3344 continue; 3345 3346 // Check if we can replace this instruction with any of the 3347 // visited instructions. 3348 if (Instruction *V = CSEMap.lookup(&In)) { 3349 In.replaceAllUsesWith(V); 3350 In.eraseFromParent(); 3351 continue; 3352 } 3353 3354 CSEMap[&In] = &In; 3355 } 3356 } 3357 3358 InstructionCost 3359 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3360 ElementCount VF) const { 3361 // We only need to calculate a cost if the VF is scalar; for actual vectors 3362 // we should already have a pre-calculated cost at each VF. 3363 if (!VF.isScalar()) 3364 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; 3365 3366 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3367 Type *RetTy = CI->getType(); 3368 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 3369 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) 3370 return *RedCost; 3371 3372 SmallVector<Type *, 4> Tys; 3373 for (auto &ArgOp : CI->args()) 3374 Tys.push_back(ArgOp->getType()); 3375 3376 InstructionCost ScalarCallCost = 3377 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); 3378 3379 // If this is an intrinsic we may have a lower cost for it. 3380 if (getVectorIntrinsicIDForCall(CI, TLI)) { 3381 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 3382 return std::min(ScalarCallCost, IntrinsicCost); 3383 } 3384 return ScalarCallCost; 3385 } 3386 3387 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3388 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3389 return Elt; 3390 return VectorType::get(Elt, VF); 3391 } 3392 3393 InstructionCost 3394 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3395 ElementCount VF) const { 3396 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3397 assert(ID && "Expected intrinsic call!"); 3398 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3399 FastMathFlags FMF; 3400 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3401 FMF = FPMO->getFastMathFlags(); 3402 3403 SmallVector<const Value *> Arguments(CI->args()); 3404 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3405 SmallVector<Type *> ParamTys; 3406 std::transform(FTy->param_begin(), FTy->param_end(), 3407 std::back_inserter(ParamTys), 3408 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3409 3410 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3411 dyn_cast<IntrinsicInst>(CI)); 3412 return TTI.getIntrinsicInstrCost(CostAttrs, 3413 TargetTransformInfo::TCK_RecipThroughput); 3414 } 3415 3416 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3417 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3418 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3419 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3420 } 3421 3422 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3423 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3424 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3425 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3426 } 3427 3428 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3429 VPlan &Plan) { 3430 // Fix widened non-induction PHIs by setting up the PHI operands. 3431 if (EnableVPlanNativePath) 3432 fixNonInductionPHIs(Plan, State); 3433 3434 // At this point every instruction in the original loop is widened to a 3435 // vector form. Now we need to fix the recurrences in the loop. These PHI 3436 // nodes are currently empty because we did not want to introduce cycles. 3437 // This is the second stage of vectorizing recurrences. 3438 fixCrossIterationPHIs(State); 3439 3440 // Forget the original basic block. 3441 PSE.getSE()->forgetLoop(OrigLoop); 3442 PSE.getSE()->forgetBlockAndLoopDispositions(); 3443 3444 // After vectorization, the exit blocks of the original loop will have 3445 // additional predecessors. Invalidate SCEVs for the exit phis in case SE 3446 // looked through single-entry phis. 3447 SmallVector<BasicBlock *> ExitBlocks; 3448 OrigLoop->getExitBlocks(ExitBlocks); 3449 for (BasicBlock *Exit : ExitBlocks) 3450 for (PHINode &PN : Exit->phis()) 3451 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); 3452 3453 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3454 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3455 if (Cost->requiresScalarEpilogue(VF.isVector())) { 3456 // No edge from the middle block to the unique exit block has been inserted 3457 // and there is nothing to fix from vector loop; phis should have incoming 3458 // from scalar loop only. 3459 } else { 3460 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking 3461 // the cost model. 3462 3463 // If we inserted an edge from the middle block to the unique exit block, 3464 // update uses outside the loop (phis) to account for the newly inserted 3465 // edge. 3466 3467 // Fix-up external users of the induction variables. 3468 for (const auto &Entry : Legal->getInductionVars()) 3469 fixupIVUsers(Entry.first, Entry.second, 3470 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3471 IVEndValues[Entry.first], LoopMiddleBlock, 3472 VectorLoop->getHeader(), Plan, State); 3473 } 3474 3475 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3476 // in the exit block, so update the builder. 3477 State.Builder.SetInsertPoint(State.CFG.ExitBB, 3478 State.CFG.ExitBB->getFirstNonPHIIt()); 3479 for (const auto &KV : Plan.getLiveOuts()) 3480 KV.second->fixPhi(Plan, State); 3481 3482 for (Instruction *PI : PredicatedInstructions) 3483 sinkScalarOperands(&*PI); 3484 3485 // Remove redundant induction instructions. 3486 cse(VectorLoop->getHeader()); 3487 3488 // Set/update profile weights for the vector and remainder loops as original 3489 // loop iterations are now distributed among them. Note that original loop 3490 // represented by LoopScalarBody becomes remainder loop after vectorization. 3491 // 3492 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3493 // end up getting slightly roughened result but that should be OK since 3494 // profile is not inherently precise anyway. Note also possible bypass of 3495 // vector code caused by legality checks is ignored, assigning all the weight 3496 // to the vector loop, optimistically. 3497 // 3498 // For scalable vectorization we can't know at compile time how many iterations 3499 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3500 // vscale of '1'. 3501 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3502 LI->getLoopFor(LoopScalarBody), 3503 VF.getKnownMinValue() * UF); 3504 } 3505 3506 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3507 // In order to support recurrences we need to be able to vectorize Phi nodes. 3508 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3509 // stage #2: We now need to fix the recurrences by adding incoming edges to 3510 // the currently empty PHI nodes. At this point every instruction in the 3511 // original loop is widened to a vector form so we can use them to construct 3512 // the incoming edges. 3513 VPBasicBlock *Header = 3514 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3515 3516 for (VPRecipeBase &R : Header->phis()) { 3517 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3518 fixReduction(ReductionPhi, State); 3519 } 3520 3521 for (VPRecipeBase &R : Header->phis()) { 3522 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3523 fixFixedOrderRecurrence(FOR, State); 3524 } 3525 } 3526 3527 void InnerLoopVectorizer::fixFixedOrderRecurrence( 3528 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3529 // This is the second phase of vectorizing first-order recurrences. An 3530 // overview of the transformation is described below. Suppose we have the 3531 // following loop. 3532 // 3533 // for (int i = 0; i < n; ++i) 3534 // b[i] = a[i] - a[i - 1]; 3535 // 3536 // There is a first-order recurrence on "a". For this loop, the shorthand 3537 // scalar IR looks like: 3538 // 3539 // scalar.ph: 3540 // s_init = a[-1] 3541 // br scalar.body 3542 // 3543 // scalar.body: 3544 // i = phi [0, scalar.ph], [i+1, scalar.body] 3545 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3546 // s2 = a[i] 3547 // b[i] = s2 - s1 3548 // br cond, scalar.body, ... 3549 // 3550 // In this example, s1 is a recurrence because it's value depends on the 3551 // previous iteration. In the first phase of vectorization, we created a 3552 // vector phi v1 for s1. We now complete the vectorization and produce the 3553 // shorthand vector IR shown below (for VF = 4, UF = 1). 3554 // 3555 // vector.ph: 3556 // v_init = vector(..., ..., ..., a[-1]) 3557 // br vector.body 3558 // 3559 // vector.body 3560 // i = phi [0, vector.ph], [i+4, vector.body] 3561 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3562 // v2 = a[i, i+1, i+2, i+3]; 3563 // v3 = vector(v1(3), v2(0, 1, 2)) 3564 // b[i, i+1, i+2, i+3] = v2 - v3 3565 // br cond, vector.body, middle.block 3566 // 3567 // middle.block: 3568 // x = v2(3) 3569 // br scalar.ph 3570 // 3571 // scalar.ph: 3572 // s_init = phi [x, middle.block], [a[-1], otherwise] 3573 // br scalar.body 3574 // 3575 // After execution completes the vector loop, we extract the next value of 3576 // the recurrence (x) to use as the initial value in the scalar loop. 3577 3578 // Extract the last vector element in the middle block. This will be the 3579 // initial value for the recurrence when jumping to the scalar loop. 3580 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3581 Value *Incoming = State.get(PreviousDef, UF - 1); 3582 auto *ExtractForScalar = Incoming; 3583 auto *IdxTy = Builder.getInt32Ty(); 3584 Value *RuntimeVF = nullptr; 3585 if (VF.isVector()) { 3586 auto *One = ConstantInt::get(IdxTy, 1); 3587 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3588 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3589 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3590 ExtractForScalar = 3591 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); 3592 } 3593 3594 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin()); 3595 assert(PhiR->getNumUsers() == 1 && 3596 RecurSplice->getOpcode() == 3597 VPInstruction::FirstOrderRecurrenceSplice && 3598 "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); 3599 SmallVector<VPLiveOut *> LiveOuts; 3600 for (VPUser *U : RecurSplice->users()) 3601 if (auto *LiveOut = dyn_cast<VPLiveOut>(U)) 3602 LiveOuts.push_back(LiveOut); 3603 3604 if (!LiveOuts.empty()) { 3605 // Extract the second last element in the middle block if the 3606 // Phi is used outside the loop. We need to extract the phi itself 3607 // and not the last element (the phi update in the current iteration). This 3608 // will be the value when jumping to the exit block from the 3609 // LoopMiddleBlock, when the scalar loop is not run at all. 3610 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3611 if (VF.isVector()) { 3612 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3613 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3614 Incoming, Idx, "vector.recur.extract.for.phi"); 3615 } else { 3616 assert(UF > 1 && "VF and UF cannot both be 1"); 3617 // When loop is unrolled without vectorizing, initialize 3618 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled 3619 // value of `Incoming`. This is analogous to the vectorized case above: 3620 // extracting the second last element when VF > 1. 3621 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3622 } 3623 3624 for (VPLiveOut *LiveOut : LiveOuts) { 3625 assert(!Cost->requiresScalarEpilogue(VF.isVector())); 3626 PHINode *LCSSAPhi = LiveOut->getPhi(); 3627 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3628 State.Plan->removeLiveOut(LCSSAPhi); 3629 } 3630 } 3631 3632 // Fix the initial value of the original recurrence in the scalar loop. 3633 Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin()); 3634 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3635 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3636 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3637 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3638 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3639 Start->addIncoming(Incoming, BB); 3640 } 3641 3642 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3643 Phi->setName("scalar.recur"); 3644 } 3645 3646 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 3647 VPTransformState &State) { 3648 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 3649 // Get it's reduction variable descriptor. 3650 assert(Legal->isReductionVariable(OrigPhi) && 3651 "Unable to find the reduction variable"); 3652 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3653 3654 RecurKind RK = RdxDesc.getRecurrenceKind(); 3655 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3656 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3657 if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue)) 3658 State.setDebugLocFrom(I->getDebugLoc()); 3659 3660 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3661 3662 // Before each round, move the insertion point right between 3663 // the PHIs and the values we are going to write. 3664 // This allows us to write both PHINodes and the extractelement 3665 // instructions. 3666 Builder.SetInsertPoint(LoopMiddleBlock, 3667 LoopMiddleBlock->getFirstInsertionPt()); 3668 3669 State.setDebugLocFrom(LoopExitInst->getDebugLoc()); 3670 3671 Type *PhiTy = OrigPhi->getType(); 3672 // If tail is folded by masking, the vector value to leave the loop should be 3673 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3674 // instead of the former. For an inloop reduction the reduction will already 3675 // be predicated, and does not need to be handled here. 3676 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3677 VPValue *Def = nullptr; 3678 for (VPUser *U : LoopExitInstDef->users()) { 3679 auto *S = dyn_cast<VPInstruction>(U); 3680 if (S && S->getOpcode() == Instruction::Select) { 3681 Def = S; 3682 break; 3683 } 3684 } 3685 if (Def) 3686 LoopExitInstDef = Def; 3687 } 3688 3689 VectorParts RdxParts(UF); 3690 for (unsigned Part = 0; Part < UF; ++Part) 3691 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3692 3693 // If the vector reduction can be performed in a smaller type, we truncate 3694 // then extend the loop exit value to enable InstCombine to evaluate the 3695 // entire expression in the smaller type. 3696 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3697 Builder.SetInsertPoint(LoopMiddleBlock, 3698 LoopMiddleBlock->getFirstInsertionPt()); 3699 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3700 for (unsigned Part = 0; Part < UF; ++Part) { 3701 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3702 } 3703 } 3704 3705 // Reduce all of the unrolled parts into a single vector. 3706 Value *ReducedPartRdx = RdxParts[0]; 3707 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3708 3709 // The middle block terminator has already been assigned a DebugLoc here (the 3710 // OrigLoop's single latch terminator). We want the whole middle block to 3711 // appear to execute on this line because: (a) it is all compiler generated, 3712 // (b) these instructions are always executed after evaluating the latch 3713 // conditional branch, and (c) other passes may add new predecessors which 3714 // terminate on this line. This is the easiest way to ensure we don't 3715 // accidentally cause an extra step back into the loop while debugging. 3716 State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc()); 3717 if (PhiR->isOrdered()) 3718 ReducedPartRdx = RdxParts[UF - 1]; 3719 else { 3720 // Floating-point operations should have some FMF to enable the reduction. 3721 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3722 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3723 for (unsigned Part = 1; Part < UF; ++Part) { 3724 Value *RdxPart = RdxParts[Part]; 3725 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3726 ReducedPartRdx = Builder.CreateBinOp( 3727 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 3728 else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) 3729 ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK, 3730 ReducedPartRdx, RdxPart); 3731 else 3732 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 3733 } 3734 } 3735 3736 // Create the reduction after the loop. Note that inloop reductions create the 3737 // target reduction in the loop using a Reduction recipe. 3738 if (VF.isVector() && !PhiR->isInLoop()) { 3739 ReducedPartRdx = 3740 createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); 3741 // If the reduction can be performed in a smaller type, we need to extend 3742 // the reduction to the wider type before we branch to the original loop. 3743 if (PhiTy != RdxDesc.getRecurrenceType()) 3744 ReducedPartRdx = RdxDesc.isSigned() 3745 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 3746 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 3747 } 3748 3749 PHINode *ResumePhi = 3750 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 3751 3752 // Create a phi node that merges control-flow from the backedge-taken check 3753 // block and the middle block. 3754 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 3755 LoopScalarPreHeader->getTerminator()); 3756 3757 // If we are fixing reductions in the epilogue loop then we should already 3758 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 3759 // we carry over the incoming values correctly. 3760 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 3761 if (Incoming == LoopMiddleBlock) 3762 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 3763 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 3764 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 3765 Incoming); 3766 else 3767 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 3768 } 3769 3770 // Set the resume value for this reduction 3771 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 3772 3773 // If there were stores of the reduction value to a uniform memory address 3774 // inside the loop, create the final store here. 3775 if (StoreInst *SI = RdxDesc.IntermediateStore) { 3776 StoreInst *NewSI = 3777 Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(), 3778 SI->getAlign()); 3779 propagateMetadata(NewSI, SI); 3780 3781 // If the reduction value is used in other places, 3782 // then let the code below create PHI's for that. 3783 } 3784 3785 // Now, we need to fix the users of the reduction variable 3786 // inside and outside of the scalar remainder loop. 3787 3788 // We know that the loop is in LCSSA form. We need to update the PHI nodes 3789 // in the exit blocks. See comment on analogous loop in 3790 // fixFixedOrderRecurrence for a more complete explaination of the logic. 3791 if (!Cost->requiresScalarEpilogue(VF.isVector())) 3792 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 3793 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { 3794 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3795 State.Plan->removeLiveOut(&LCSSAPhi); 3796 } 3797 3798 // Fix the scalar loop reduction variable with the incoming reduction sum 3799 // from the vector body and from the backedge value. 3800 int IncomingEdgeBlockIdx = 3801 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3802 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3803 // Pick the other block. 3804 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3805 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3806 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3807 } 3808 3809 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3810 // The basic block and loop containing the predicated instruction. 3811 auto *PredBB = PredInst->getParent(); 3812 auto *VectorLoop = LI->getLoopFor(PredBB); 3813 3814 // Initialize a worklist with the operands of the predicated instruction. 3815 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3816 3817 // Holds instructions that we need to analyze again. An instruction may be 3818 // reanalyzed if we don't yet know if we can sink it or not. 3819 SmallVector<Instruction *, 8> InstsToReanalyze; 3820 3821 // Returns true if a given use occurs in the predicated block. Phi nodes use 3822 // their operands in their corresponding predecessor blocks. 3823 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3824 auto *I = cast<Instruction>(U.getUser()); 3825 BasicBlock *BB = I->getParent(); 3826 if (auto *Phi = dyn_cast<PHINode>(I)) 3827 BB = Phi->getIncomingBlock( 3828 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3829 return BB == PredBB; 3830 }; 3831 3832 // Iteratively sink the scalarized operands of the predicated instruction 3833 // into the block we created for it. When an instruction is sunk, it's 3834 // operands are then added to the worklist. The algorithm ends after one pass 3835 // through the worklist doesn't sink a single instruction. 3836 bool Changed; 3837 do { 3838 // Add the instructions that need to be reanalyzed to the worklist, and 3839 // reset the changed indicator. 3840 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3841 InstsToReanalyze.clear(); 3842 Changed = false; 3843 3844 while (!Worklist.empty()) { 3845 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3846 3847 // We can't sink an instruction if it is a phi node, is not in the loop, 3848 // may have side effects or may read from memory. 3849 // TODO Could dor more granular checking to allow sinking a load past non-store instructions. 3850 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 3851 I->mayHaveSideEffects() || I->mayReadFromMemory()) 3852 continue; 3853 3854 // If the instruction is already in PredBB, check if we can sink its 3855 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 3856 // sinking the scalar instruction I, hence it appears in PredBB; but it 3857 // may have failed to sink I's operands (recursively), which we try 3858 // (again) here. 3859 if (I->getParent() == PredBB) { 3860 Worklist.insert(I->op_begin(), I->op_end()); 3861 continue; 3862 } 3863 3864 // It's legal to sink the instruction if all its uses occur in the 3865 // predicated block. Otherwise, there's nothing to do yet, and we may 3866 // need to reanalyze the instruction. 3867 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 3868 InstsToReanalyze.push_back(I); 3869 continue; 3870 } 3871 3872 // Move the instruction to the beginning of the predicated block, and add 3873 // it's operands to the worklist. 3874 I->moveBefore(&*PredBB->getFirstInsertionPt()); 3875 Worklist.insert(I->op_begin(), I->op_end()); 3876 3877 // The sinking may have enabled other instructions to be sunk, so we will 3878 // need to iterate. 3879 Changed = true; 3880 } 3881 } while (Changed); 3882 } 3883 3884 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 3885 VPTransformState &State) { 3886 auto Iter = vp_depth_first_deep(Plan.getEntry()); 3887 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 3888 for (VPRecipeBase &P : VPBB->phis()) { 3889 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 3890 if (!VPPhi) 3891 continue; 3892 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 3893 // Make sure the builder has a valid insert point. 3894 Builder.SetInsertPoint(NewPhi); 3895 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 3896 VPValue *Inc = VPPhi->getIncomingValue(i); 3897 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 3898 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 3899 } 3900 } 3901 } 3902 } 3903 3904 bool InnerLoopVectorizer::useOrderedReductions( 3905 const RecurrenceDescriptor &RdxDesc) { 3906 return Cost->useOrderedReductions(RdxDesc); 3907 } 3908 3909 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 3910 // We should not collect Scalars more than once per VF. Right now, this 3911 // function is called from collectUniformsAndScalars(), which already does 3912 // this check. Collecting Scalars for VF=1 does not make any sense. 3913 assert(VF.isVector() && !Scalars.contains(VF) && 3914 "This function should not be visited twice for the same VF"); 3915 3916 // This avoids any chances of creating a REPLICATE recipe during planning 3917 // since that would result in generation of scalarized code during execution, 3918 // which is not supported for scalable vectors. 3919 if (VF.isScalable()) { 3920 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3921 return; 3922 } 3923 3924 SmallSetVector<Instruction *, 8> Worklist; 3925 3926 // These sets are used to seed the analysis with pointers used by memory 3927 // accesses that will remain scalar. 3928 SmallSetVector<Instruction *, 8> ScalarPtrs; 3929 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 3930 auto *Latch = TheLoop->getLoopLatch(); 3931 3932 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 3933 // The pointer operands of loads and stores will be scalar as long as the 3934 // memory access is not a gather or scatter operation. The value operand of a 3935 // store will remain scalar if the store is scalarized. 3936 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 3937 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 3938 assert(WideningDecision != CM_Unknown && 3939 "Widening decision should be ready at this moment"); 3940 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 3941 if (Ptr == Store->getValueOperand()) 3942 return WideningDecision == CM_Scalarize; 3943 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 3944 "Ptr is neither a value or pointer operand"); 3945 return WideningDecision != CM_GatherScatter; 3946 }; 3947 3948 // A helper that returns true if the given value is a bitcast or 3949 // getelementptr instruction contained in the loop. 3950 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 3951 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 3952 isa<GetElementPtrInst>(V)) && 3953 !TheLoop->isLoopInvariant(V); 3954 }; 3955 3956 // A helper that evaluates a memory access's use of a pointer. If the use will 3957 // be a scalar use and the pointer is only used by memory accesses, we place 3958 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 3959 // PossibleNonScalarPtrs. 3960 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 3961 // We only care about bitcast and getelementptr instructions contained in 3962 // the loop. 3963 if (!isLoopVaryingBitCastOrGEP(Ptr)) 3964 return; 3965 3966 // If the pointer has already been identified as scalar (e.g., if it was 3967 // also identified as uniform), there's nothing to do. 3968 auto *I = cast<Instruction>(Ptr); 3969 if (Worklist.count(I)) 3970 return; 3971 3972 // If the use of the pointer will be a scalar use, and all users of the 3973 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 3974 // place the pointer in PossibleNonScalarPtrs. 3975 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 3976 return isa<LoadInst>(U) || isa<StoreInst>(U); 3977 })) 3978 ScalarPtrs.insert(I); 3979 else 3980 PossibleNonScalarPtrs.insert(I); 3981 }; 3982 3983 // We seed the scalars analysis with three classes of instructions: (1) 3984 // instructions marked uniform-after-vectorization and (2) bitcast, 3985 // getelementptr and (pointer) phi instructions used by memory accesses 3986 // requiring a scalar use. 3987 // 3988 // (1) Add to the worklist all instructions that have been identified as 3989 // uniform-after-vectorization. 3990 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3991 3992 // (2) Add to the worklist all bitcast and getelementptr instructions used by 3993 // memory accesses requiring a scalar use. The pointer operands of loads and 3994 // stores will be scalar as long as the memory accesses is not a gather or 3995 // scatter operation. The value operand of a store will remain scalar if the 3996 // store is scalarized. 3997 for (auto *BB : TheLoop->blocks()) 3998 for (auto &I : *BB) { 3999 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4000 evaluatePtrUse(Load, Load->getPointerOperand()); 4001 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4002 evaluatePtrUse(Store, Store->getPointerOperand()); 4003 evaluatePtrUse(Store, Store->getValueOperand()); 4004 } 4005 } 4006 for (auto *I : ScalarPtrs) 4007 if (!PossibleNonScalarPtrs.count(I)) { 4008 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4009 Worklist.insert(I); 4010 } 4011 4012 // Insert the forced scalars. 4013 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4014 // induction variable when the PHI user is scalarized. 4015 auto ForcedScalar = ForcedScalars.find(VF); 4016 if (ForcedScalar != ForcedScalars.end()) 4017 for (auto *I : ForcedScalar->second) { 4018 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 4019 Worklist.insert(I); 4020 } 4021 4022 // Expand the worklist by looking through any bitcasts and getelementptr 4023 // instructions we've already identified as scalar. This is similar to the 4024 // expansion step in collectLoopUniforms(); however, here we're only 4025 // expanding to include additional bitcasts and getelementptr instructions. 4026 unsigned Idx = 0; 4027 while (Idx != Worklist.size()) { 4028 Instruction *Dst = Worklist[Idx++]; 4029 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4030 continue; 4031 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4032 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4033 auto *J = cast<Instruction>(U); 4034 return !TheLoop->contains(J) || Worklist.count(J) || 4035 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4036 isScalarUse(J, Src)); 4037 })) { 4038 Worklist.insert(Src); 4039 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4040 } 4041 } 4042 4043 // An induction variable will remain scalar if all users of the induction 4044 // variable and induction variable update remain scalar. 4045 for (const auto &Induction : Legal->getInductionVars()) { 4046 auto *Ind = Induction.first; 4047 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4048 4049 // If tail-folding is applied, the primary induction variable will be used 4050 // to feed a vector compare. 4051 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4052 continue; 4053 4054 // Returns true if \p Indvar is a pointer induction that is used directly by 4055 // load/store instruction \p I. 4056 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4057 Instruction *I) { 4058 return Induction.second.getKind() == 4059 InductionDescriptor::IK_PtrInduction && 4060 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4061 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4062 }; 4063 4064 // Determine if all users of the induction variable are scalar after 4065 // vectorization. 4066 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4067 auto *I = cast<Instruction>(U); 4068 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4069 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4070 }); 4071 if (!ScalarInd) 4072 continue; 4073 4074 // Determine if all users of the induction variable update instruction are 4075 // scalar after vectorization. 4076 auto ScalarIndUpdate = 4077 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4078 auto *I = cast<Instruction>(U); 4079 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4080 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4081 }); 4082 if (!ScalarIndUpdate) 4083 continue; 4084 4085 // The induction variable and its update instruction will remain scalar. 4086 Worklist.insert(Ind); 4087 Worklist.insert(IndUpdate); 4088 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4089 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4090 << "\n"); 4091 } 4092 4093 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4094 } 4095 4096 bool LoopVectorizationCostModel::isScalarWithPredication( 4097 Instruction *I, ElementCount VF) const { 4098 if (!isPredicatedInst(I)) 4099 return false; 4100 4101 // Do we have a non-scalar lowering for this predicated 4102 // instruction? No - it is scalar with predication. 4103 switch(I->getOpcode()) { 4104 default: 4105 return true; 4106 case Instruction::Call: 4107 if (VF.isScalar()) 4108 return true; 4109 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF)) 4110 .Kind == CM_Scalarize; 4111 case Instruction::Load: 4112 case Instruction::Store: { 4113 auto *Ptr = getLoadStorePointerOperand(I); 4114 auto *Ty = getLoadStoreType(I); 4115 Type *VTy = Ty; 4116 if (VF.isVector()) 4117 VTy = VectorType::get(Ty, VF); 4118 const Align Alignment = getLoadStoreAlignment(I); 4119 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4120 TTI.isLegalMaskedGather(VTy, Alignment)) 4121 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4122 TTI.isLegalMaskedScatter(VTy, Alignment)); 4123 } 4124 case Instruction::UDiv: 4125 case Instruction::SDiv: 4126 case Instruction::SRem: 4127 case Instruction::URem: { 4128 // We have the option to use the safe-divisor idiom to avoid predication. 4129 // The cost based decision here will always select safe-divisor for 4130 // scalable vectors as scalarization isn't legal. 4131 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 4132 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); 4133 } 4134 } 4135 } 4136 4137 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { 4138 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4139 return false; 4140 4141 // Can we prove this instruction is safe to unconditionally execute? 4142 // If not, we must use some form of predication. 4143 switch(I->getOpcode()) { 4144 default: 4145 return false; 4146 case Instruction::Load: 4147 case Instruction::Store: { 4148 if (!Legal->isMaskRequired(I)) 4149 return false; 4150 // When we know the load's address is loop invariant and the instruction 4151 // in the original scalar loop was unconditionally executed then we 4152 // don't need to mark it as a predicated instruction. Tail folding may 4153 // introduce additional predication, but we're guaranteed to always have 4154 // at least one active lane. We call Legal->blockNeedsPredication here 4155 // because it doesn't query tail-folding. For stores, we need to prove 4156 // both speculation safety (which follows from the same argument as loads), 4157 // but also must prove the value being stored is correct. The easiest 4158 // form of the later is to require that all values stored are the same. 4159 if (Legal->isInvariant(getLoadStorePointerOperand(I)) && 4160 (isa<LoadInst>(I) || 4161 (isa<StoreInst>(I) && 4162 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && 4163 !Legal->blockNeedsPredication(I->getParent())) 4164 return false; 4165 return true; 4166 } 4167 case Instruction::UDiv: 4168 case Instruction::SDiv: 4169 case Instruction::SRem: 4170 case Instruction::URem: 4171 // TODO: We can use the loop-preheader as context point here and get 4172 // context sensitive reasoning 4173 return !isSafeToSpeculativelyExecute(I); 4174 case Instruction::Call: 4175 return Legal->isMaskRequired(I); 4176 } 4177 } 4178 4179 std::pair<InstructionCost, InstructionCost> 4180 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 4181 ElementCount VF) const { 4182 assert(I->getOpcode() == Instruction::UDiv || 4183 I->getOpcode() == Instruction::SDiv || 4184 I->getOpcode() == Instruction::SRem || 4185 I->getOpcode() == Instruction::URem); 4186 assert(!isSafeToSpeculativelyExecute(I)); 4187 4188 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 4189 4190 // Scalarization isn't legal for scalable vector types 4191 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 4192 if (!VF.isScalable()) { 4193 // Get the scalarization cost and scale this amount by the probability of 4194 // executing the predicated block. If the instruction is not predicated, 4195 // we fall through to the next case. 4196 ScalarizationCost = 0; 4197 4198 // These instructions have a non-void type, so account for the phi nodes 4199 // that we will create. This cost is likely to be zero. The phi node 4200 // cost, if any, should be scaled by the block probability because it 4201 // models a copy at the end of each predicated block. 4202 ScalarizationCost += VF.getKnownMinValue() * 4203 TTI.getCFInstrCost(Instruction::PHI, CostKind); 4204 4205 // The cost of the non-predicated instruction. 4206 ScalarizationCost += VF.getKnownMinValue() * 4207 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 4208 4209 // The cost of insertelement and extractelement instructions needed for 4210 // scalarization. 4211 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); 4212 4213 // Scale the cost by the probability of executing the predicated blocks. 4214 // This assumes the predicated block for each vector lane is equally 4215 // likely. 4216 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 4217 } 4218 InstructionCost SafeDivisorCost = 0; 4219 4220 auto *VecTy = ToVectorTy(I->getType(), VF); 4221 4222 // The cost of the select guard to ensure all lanes are well defined 4223 // after we speculate above any internal control flow. 4224 SafeDivisorCost += TTI.getCmpSelInstrCost( 4225 Instruction::Select, VecTy, 4226 ToVectorTy(Type::getInt1Ty(I->getContext()), VF), 4227 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4228 4229 // Certain instructions can be cheaper to vectorize if they have a constant 4230 // second vector operand. One example of this are shifts on x86. 4231 Value *Op2 = I->getOperand(1); 4232 auto Op2Info = TTI.getOperandInfo(Op2); 4233 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 4234 Legal->isInvariant(Op2)) 4235 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 4236 4237 SmallVector<const Value *, 4> Operands(I->operand_values()); 4238 SafeDivisorCost += TTI.getArithmeticInstrCost( 4239 I->getOpcode(), VecTy, CostKind, 4240 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 4241 Op2Info, Operands, I); 4242 return {ScalarizationCost, SafeDivisorCost}; 4243 } 4244 4245 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4246 Instruction *I, ElementCount VF) { 4247 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4248 assert(getWideningDecision(I, VF) == CM_Unknown && 4249 "Decision should not be set yet."); 4250 auto *Group = getInterleavedAccessGroup(I); 4251 assert(Group && "Must have a group."); 4252 4253 // If the instruction's allocated size doesn't equal it's type size, it 4254 // requires padding and will be scalarized. 4255 auto &DL = I->getModule()->getDataLayout(); 4256 auto *ScalarTy = getLoadStoreType(I); 4257 if (hasIrregularType(ScalarTy, DL)) 4258 return false; 4259 4260 // If the group involves a non-integral pointer, we may not be able to 4261 // losslessly cast all values to a common type. 4262 unsigned InterleaveFactor = Group->getFactor(); 4263 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4264 for (unsigned i = 0; i < InterleaveFactor; i++) { 4265 Instruction *Member = Group->getMember(i); 4266 if (!Member) 4267 continue; 4268 auto *MemberTy = getLoadStoreType(Member); 4269 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4270 // Don't coerce non-integral pointers to integers or vice versa. 4271 if (MemberNI != ScalarNI) { 4272 // TODO: Consider adding special nullptr value case here 4273 return false; 4274 } else if (MemberNI && ScalarNI && 4275 ScalarTy->getPointerAddressSpace() != 4276 MemberTy->getPointerAddressSpace()) { 4277 return false; 4278 } 4279 } 4280 4281 // Check if masking is required. 4282 // A Group may need masking for one of two reasons: it resides in a block that 4283 // needs predication, or it was decided to use masking to deal with gaps 4284 // (either a gap at the end of a load-access that may result in a speculative 4285 // load, or any gaps in a store-access). 4286 bool PredicatedAccessRequiresMasking = 4287 blockNeedsPredicationForAnyReason(I->getParent()) && 4288 Legal->isMaskRequired(I); 4289 bool LoadAccessWithGapsRequiresEpilogMasking = 4290 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4291 !isScalarEpilogueAllowed(); 4292 bool StoreAccessWithGapsRequiresMasking = 4293 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4294 if (!PredicatedAccessRequiresMasking && 4295 !LoadAccessWithGapsRequiresEpilogMasking && 4296 !StoreAccessWithGapsRequiresMasking) 4297 return true; 4298 4299 // If masked interleaving is required, we expect that the user/target had 4300 // enabled it, because otherwise it either wouldn't have been created or 4301 // it should have been invalidated by the CostModel. 4302 assert(useMaskedInterleavedAccesses(TTI) && 4303 "Masked interleave-groups for predicated accesses are not enabled."); 4304 4305 if (Group->isReverse()) 4306 return false; 4307 4308 auto *Ty = getLoadStoreType(I); 4309 const Align Alignment = getLoadStoreAlignment(I); 4310 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4311 : TTI.isLegalMaskedStore(Ty, Alignment); 4312 } 4313 4314 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4315 Instruction *I, ElementCount VF) { 4316 // Get and ensure we have a valid memory instruction. 4317 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4318 4319 auto *Ptr = getLoadStorePointerOperand(I); 4320 auto *ScalarTy = getLoadStoreType(I); 4321 4322 // In order to be widened, the pointer should be consecutive, first of all. 4323 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4324 return false; 4325 4326 // If the instruction is a store located in a predicated block, it will be 4327 // scalarized. 4328 if (isScalarWithPredication(I, VF)) 4329 return false; 4330 4331 // If the instruction's allocated size doesn't equal it's type size, it 4332 // requires padding and will be scalarized. 4333 auto &DL = I->getModule()->getDataLayout(); 4334 if (hasIrregularType(ScalarTy, DL)) 4335 return false; 4336 4337 return true; 4338 } 4339 4340 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4341 // We should not collect Uniforms more than once per VF. Right now, 4342 // this function is called from collectUniformsAndScalars(), which 4343 // already does this check. Collecting Uniforms for VF=1 does not make any 4344 // sense. 4345 4346 assert(VF.isVector() && !Uniforms.contains(VF) && 4347 "This function should not be visited twice for the same VF"); 4348 4349 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4350 // not analyze again. Uniforms.count(VF) will return 1. 4351 Uniforms[VF].clear(); 4352 4353 // We now know that the loop is vectorizable! 4354 // Collect instructions inside the loop that will remain uniform after 4355 // vectorization. 4356 4357 // Global values, params and instructions outside of current loop are out of 4358 // scope. 4359 auto isOutOfScope = [&](Value *V) -> bool { 4360 Instruction *I = dyn_cast<Instruction>(V); 4361 return (!I || !TheLoop->contains(I)); 4362 }; 4363 4364 // Worklist containing uniform instructions demanding lane 0. 4365 SetVector<Instruction *> Worklist; 4366 BasicBlock *Latch = TheLoop->getLoopLatch(); 4367 4368 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4369 // that are scalar with predication must not be considered uniform after 4370 // vectorization, because that would create an erroneous replicating region 4371 // where only a single instance out of VF should be formed. 4372 // TODO: optimize such seldom cases if found important, see PR40816. 4373 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4374 if (isOutOfScope(I)) { 4375 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4376 << *I << "\n"); 4377 return; 4378 } 4379 if (isScalarWithPredication(I, VF)) { 4380 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4381 << *I << "\n"); 4382 return; 4383 } 4384 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4385 Worklist.insert(I); 4386 }; 4387 4388 // Start with the conditional branch. If the branch condition is an 4389 // instruction contained in the loop that is only used by the branch, it is 4390 // uniform. 4391 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4392 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4393 addToWorklistIfAllowed(Cmp); 4394 4395 auto PrevVF = VF.divideCoefficientBy(2); 4396 // Return true if all lanes perform the same memory operation, and we can 4397 // thus chose to execute only one. 4398 auto isUniformMemOpUse = [&](Instruction *I) { 4399 // If the value was already known to not be uniform for the previous 4400 // (smaller VF), it cannot be uniform for the larger VF. 4401 if (PrevVF.isVector()) { 4402 auto Iter = Uniforms.find(PrevVF); 4403 if (Iter != Uniforms.end() && !Iter->second.contains(I)) 4404 return false; 4405 } 4406 if (!Legal->isUniformMemOp(*I, VF)) 4407 return false; 4408 if (isa<LoadInst>(I)) 4409 // Loading the same address always produces the same result - at least 4410 // assuming aliasing and ordering which have already been checked. 4411 return true; 4412 // Storing the same value on every iteration. 4413 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 4414 }; 4415 4416 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4417 InstWidening WideningDecision = getWideningDecision(I, VF); 4418 assert(WideningDecision != CM_Unknown && 4419 "Widening decision should be ready at this moment"); 4420 4421 if (isUniformMemOpUse(I)) 4422 return true; 4423 4424 return (WideningDecision == CM_Widen || 4425 WideningDecision == CM_Widen_Reverse || 4426 WideningDecision == CM_Interleave); 4427 }; 4428 4429 // Returns true if Ptr is the pointer operand of a memory access instruction 4430 // I, I is known to not require scalarization, and the pointer is not also 4431 // stored. 4432 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4433 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) 4434 return false; 4435 return getLoadStorePointerOperand(I) == Ptr && 4436 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr)); 4437 }; 4438 4439 // Holds a list of values which are known to have at least one uniform use. 4440 // Note that there may be other uses which aren't uniform. A "uniform use" 4441 // here is something which only demands lane 0 of the unrolled iterations; 4442 // it does not imply that all lanes produce the same value (e.g. this is not 4443 // the usual meaning of uniform) 4444 SetVector<Value *> HasUniformUse; 4445 4446 // Scan the loop for instructions which are either a) known to have only 4447 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4448 for (auto *BB : TheLoop->blocks()) 4449 for (auto &I : *BB) { 4450 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4451 switch (II->getIntrinsicID()) { 4452 case Intrinsic::sideeffect: 4453 case Intrinsic::experimental_noalias_scope_decl: 4454 case Intrinsic::assume: 4455 case Intrinsic::lifetime_start: 4456 case Intrinsic::lifetime_end: 4457 if (TheLoop->hasLoopInvariantOperands(&I)) 4458 addToWorklistIfAllowed(&I); 4459 break; 4460 default: 4461 break; 4462 } 4463 } 4464 4465 // ExtractValue instructions must be uniform, because the operands are 4466 // known to be loop-invariant. 4467 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4468 assert(isOutOfScope(EVI->getAggregateOperand()) && 4469 "Expected aggregate value to be loop invariant"); 4470 addToWorklistIfAllowed(EVI); 4471 continue; 4472 } 4473 4474 // If there's no pointer operand, there's nothing to do. 4475 auto *Ptr = getLoadStorePointerOperand(&I); 4476 if (!Ptr) 4477 continue; 4478 4479 if (isUniformMemOpUse(&I)) 4480 addToWorklistIfAllowed(&I); 4481 4482 if (isVectorizedMemAccessUse(&I, Ptr)) 4483 HasUniformUse.insert(Ptr); 4484 } 4485 4486 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4487 // demanding) users. Since loops are assumed to be in LCSSA form, this 4488 // disallows uses outside the loop as well. 4489 for (auto *V : HasUniformUse) { 4490 if (isOutOfScope(V)) 4491 continue; 4492 auto *I = cast<Instruction>(V); 4493 auto UsersAreMemAccesses = 4494 llvm::all_of(I->users(), [&](User *U) -> bool { 4495 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4496 }); 4497 if (UsersAreMemAccesses) 4498 addToWorklistIfAllowed(I); 4499 } 4500 4501 // Expand Worklist in topological order: whenever a new instruction 4502 // is added , its users should be already inside Worklist. It ensures 4503 // a uniform instruction will only be used by uniform instructions. 4504 unsigned idx = 0; 4505 while (idx != Worklist.size()) { 4506 Instruction *I = Worklist[idx++]; 4507 4508 for (auto *OV : I->operand_values()) { 4509 // isOutOfScope operands cannot be uniform instructions. 4510 if (isOutOfScope(OV)) 4511 continue; 4512 // First order recurrence Phi's should typically be considered 4513 // non-uniform. 4514 auto *OP = dyn_cast<PHINode>(OV); 4515 if (OP && Legal->isFixedOrderRecurrence(OP)) 4516 continue; 4517 // If all the users of the operand are uniform, then add the 4518 // operand into the uniform worklist. 4519 auto *OI = cast<Instruction>(OV); 4520 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4521 auto *J = cast<Instruction>(U); 4522 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4523 })) 4524 addToWorklistIfAllowed(OI); 4525 } 4526 } 4527 4528 // For an instruction to be added into Worklist above, all its users inside 4529 // the loop should also be in Worklist. However, this condition cannot be 4530 // true for phi nodes that form a cyclic dependence. We must process phi 4531 // nodes separately. An induction variable will remain uniform if all users 4532 // of the induction variable and induction variable update remain uniform. 4533 // The code below handles both pointer and non-pointer induction variables. 4534 for (const auto &Induction : Legal->getInductionVars()) { 4535 auto *Ind = Induction.first; 4536 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4537 4538 // Determine if all users of the induction variable are uniform after 4539 // vectorization. 4540 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4541 auto *I = cast<Instruction>(U); 4542 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4543 isVectorizedMemAccessUse(I, Ind); 4544 }); 4545 if (!UniformInd) 4546 continue; 4547 4548 // Determine if all users of the induction variable update instruction are 4549 // uniform after vectorization. 4550 auto UniformIndUpdate = 4551 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4552 auto *I = cast<Instruction>(U); 4553 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4554 isVectorizedMemAccessUse(I, IndUpdate); 4555 }); 4556 if (!UniformIndUpdate) 4557 continue; 4558 4559 // The induction variable and its update instruction will remain uniform. 4560 addToWorklistIfAllowed(Ind); 4561 addToWorklistIfAllowed(IndUpdate); 4562 } 4563 4564 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4565 } 4566 4567 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4568 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4569 4570 if (Legal->getRuntimePointerChecking()->Need) { 4571 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4572 "runtime pointer checks needed. Enable vectorization of this " 4573 "loop with '#pragma clang loop vectorize(enable)' when " 4574 "compiling with -Os/-Oz", 4575 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4576 return true; 4577 } 4578 4579 if (!PSE.getPredicate().isAlwaysTrue()) { 4580 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4581 "runtime SCEV checks needed. Enable vectorization of this " 4582 "loop with '#pragma clang loop vectorize(enable)' when " 4583 "compiling with -Os/-Oz", 4584 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4585 return true; 4586 } 4587 4588 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4589 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4590 reportVectorizationFailure("Runtime stride check for small trip count", 4591 "runtime stride == 1 checks needed. Enable vectorization of " 4592 "this loop without such check by compiling with -Os/-Oz", 4593 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4594 return true; 4595 } 4596 4597 return false; 4598 } 4599 4600 ElementCount 4601 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4602 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4603 return ElementCount::getScalable(0); 4604 4605 if (Hints->isScalableVectorizationDisabled()) { 4606 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4607 "ScalableVectorizationDisabled", ORE, TheLoop); 4608 return ElementCount::getScalable(0); 4609 } 4610 4611 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4612 4613 auto MaxScalableVF = ElementCount::getScalable( 4614 std::numeric_limits<ElementCount::ScalarTy>::max()); 4615 4616 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4617 // FIXME: While for scalable vectors this is currently sufficient, this should 4618 // be replaced by a more detailed mechanism that filters out specific VFs, 4619 // instead of invalidating vectorization for a whole set of VFs based on the 4620 // MaxVF. 4621 4622 // Disable scalable vectorization if the loop contains unsupported reductions. 4623 if (!canVectorizeReductions(MaxScalableVF)) { 4624 reportVectorizationInfo( 4625 "Scalable vectorization not supported for the reduction " 4626 "operations found in this loop.", 4627 "ScalableVFUnfeasible", ORE, TheLoop); 4628 return ElementCount::getScalable(0); 4629 } 4630 4631 // Disable scalable vectorization if the loop contains any instructions 4632 // with element types not supported for scalable vectors. 4633 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4634 return !Ty->isVoidTy() && 4635 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4636 })) { 4637 reportVectorizationInfo("Scalable vectorization is not supported " 4638 "for all element types found in this loop.", 4639 "ScalableVFUnfeasible", ORE, TheLoop); 4640 return ElementCount::getScalable(0); 4641 } 4642 4643 if (Legal->isSafeForAnyVectorWidth()) 4644 return MaxScalableVF; 4645 4646 // Limit MaxScalableVF by the maximum safe dependence distance. 4647 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI)) 4648 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); 4649 else 4650 MaxScalableVF = ElementCount::getScalable(0); 4651 4652 if (!MaxScalableVF) 4653 reportVectorizationInfo( 4654 "Max legal vector width too small, scalable vectorization " 4655 "unfeasible.", 4656 "ScalableVFUnfeasible", ORE, TheLoop); 4657 4658 return MaxScalableVF; 4659 } 4660 4661 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4662 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4663 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4664 unsigned SmallestType, WidestType; 4665 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4666 4667 // Get the maximum safe dependence distance in bits computed by LAA. 4668 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4669 // the memory accesses that is most restrictive (involved in the smallest 4670 // dependence distance). 4671 unsigned MaxSafeElements = 4672 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4673 4674 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4675 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4676 4677 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4678 << ".\n"); 4679 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4680 << ".\n"); 4681 4682 // First analyze the UserVF, fall back if the UserVF should be ignored. 4683 if (UserVF) { 4684 auto MaxSafeUserVF = 4685 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4686 4687 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4688 // If `VF=vscale x N` is safe, then so is `VF=N` 4689 if (UserVF.isScalable()) 4690 return FixedScalableVFPair( 4691 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4692 else 4693 return UserVF; 4694 } 4695 4696 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4697 4698 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4699 // is better to ignore the hint and let the compiler choose a suitable VF. 4700 if (!UserVF.isScalable()) { 4701 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4702 << " is unsafe, clamping to max safe VF=" 4703 << MaxSafeFixedVF << ".\n"); 4704 ORE->emit([&]() { 4705 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4706 TheLoop->getStartLoc(), 4707 TheLoop->getHeader()) 4708 << "User-specified vectorization factor " 4709 << ore::NV("UserVectorizationFactor", UserVF) 4710 << " is unsafe, clamping to maximum safe vectorization factor " 4711 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4712 }); 4713 return MaxSafeFixedVF; 4714 } 4715 4716 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4717 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4718 << " is ignored because scalable vectors are not " 4719 "available.\n"); 4720 ORE->emit([&]() { 4721 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4722 TheLoop->getStartLoc(), 4723 TheLoop->getHeader()) 4724 << "User-specified vectorization factor " 4725 << ore::NV("UserVectorizationFactor", UserVF) 4726 << " is ignored because the target does not support scalable " 4727 "vectors. The compiler will pick a more suitable value."; 4728 }); 4729 } else { 4730 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4731 << " is unsafe. Ignoring scalable UserVF.\n"); 4732 ORE->emit([&]() { 4733 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4734 TheLoop->getStartLoc(), 4735 TheLoop->getHeader()) 4736 << "User-specified vectorization factor " 4737 << ore::NV("UserVectorizationFactor", UserVF) 4738 << " is unsafe. Ignoring the hint to let the compiler pick a " 4739 "more suitable value."; 4740 }); 4741 } 4742 } 4743 4744 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4745 << " / " << WidestType << " bits.\n"); 4746 4747 FixedScalableVFPair Result(ElementCount::getFixed(1), 4748 ElementCount::getScalable(0)); 4749 if (auto MaxVF = 4750 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 4751 MaxSafeFixedVF, FoldTailByMasking)) 4752 Result.FixedVF = MaxVF; 4753 4754 if (auto MaxVF = 4755 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 4756 MaxSafeScalableVF, FoldTailByMasking)) 4757 if (MaxVF.isScalable()) { 4758 Result.ScalableVF = MaxVF; 4759 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 4760 << "\n"); 4761 } 4762 4763 return Result; 4764 } 4765 4766 FixedScalableVFPair 4767 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4768 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4769 // TODO: It may by useful to do since it's still likely to be dynamically 4770 // uniform if the target can skip. 4771 reportVectorizationFailure( 4772 "Not inserting runtime ptr check for divergent target", 4773 "runtime pointer checks needed. Not enabled for divergent target", 4774 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4775 return FixedScalableVFPair::getNone(); 4776 } 4777 4778 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4779 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 4780 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4781 if (TC == 1) { 4782 reportVectorizationFailure("Single iteration (non) loop", 4783 "loop trip count is one, irrelevant for vectorization", 4784 "SingleIterationLoop", ORE, TheLoop); 4785 return FixedScalableVFPair::getNone(); 4786 } 4787 4788 switch (ScalarEpilogueStatus) { 4789 case CM_ScalarEpilogueAllowed: 4790 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4791 case CM_ScalarEpilogueNotAllowedUsePredicate: 4792 [[fallthrough]]; 4793 case CM_ScalarEpilogueNotNeededUsePredicate: 4794 LLVM_DEBUG( 4795 dbgs() << "LV: vector predicate hint/switch found.\n" 4796 << "LV: Not allowing scalar epilogue, creating predicated " 4797 << "vector loop.\n"); 4798 break; 4799 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4800 // fallthrough as a special case of OptForSize 4801 case CM_ScalarEpilogueNotAllowedOptSize: 4802 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4803 LLVM_DEBUG( 4804 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4805 else 4806 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4807 << "count.\n"); 4808 4809 // Bail if runtime checks are required, which are not good when optimising 4810 // for size. 4811 if (runtimeChecksRequired()) 4812 return FixedScalableVFPair::getNone(); 4813 4814 break; 4815 } 4816 4817 // The only loops we can vectorize without a scalar epilogue, are loops with 4818 // a bottom-test and a single exiting block. We'd have to handle the fact 4819 // that not every instruction executes on the last iteration. This will 4820 // require a lane mask which varies through the vector loop body. (TODO) 4821 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 4822 // If there was a tail-folding hint/switch, but we can't fold the tail by 4823 // masking, fallback to a vectorization with a scalar epilogue. 4824 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4825 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4826 "scalar epilogue instead.\n"); 4827 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4828 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4829 } 4830 return FixedScalableVFPair::getNone(); 4831 } 4832 4833 // Now try the tail folding 4834 4835 // Invalidate interleave groups that require an epilogue if we can't mask 4836 // the interleave-group. 4837 if (!useMaskedInterleavedAccesses(TTI)) { 4838 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 4839 "No decisions should have been taken at this point"); 4840 // Note: There is no need to invalidate any cost modeling decisions here, as 4841 // non where taken so far. 4842 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4843 } 4844 4845 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true); 4846 4847 // Avoid tail folding if the trip count is known to be a multiple of any VF 4848 // we choose. 4849 std::optional<unsigned> MaxPowerOf2RuntimeVF = 4850 MaxFactors.FixedVF.getFixedValue(); 4851 if (MaxFactors.ScalableVF) { 4852 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 4853 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { 4854 MaxPowerOf2RuntimeVF = std::max<unsigned>( 4855 *MaxPowerOf2RuntimeVF, 4856 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); 4857 } else 4858 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. 4859 } 4860 4861 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { 4862 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && 4863 "MaxFixedVF must be a power of 2"); 4864 unsigned MaxVFtimesIC = 4865 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; 4866 ScalarEvolution *SE = PSE.getSE(); 4867 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 4868 const SCEV *ExitCount = SE->getAddExpr( 4869 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 4870 const SCEV *Rem = SE->getURemExpr( 4871 SE->applyLoopGuards(ExitCount, TheLoop), 4872 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 4873 if (Rem->isZero()) { 4874 // Accept MaxFixedVF if we do not have a tail. 4875 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4876 return MaxFactors; 4877 } 4878 } 4879 4880 // If we don't know the precise trip count, or if the trip count that we 4881 // found modulo the vectorization factor is not zero, try to fold the tail 4882 // by masking. 4883 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4884 if (Legal->prepareToFoldTailByMasking()) { 4885 CanFoldTailByMasking = true; 4886 return MaxFactors; 4887 } 4888 4889 // If there was a tail-folding hint/switch, but we can't fold the tail by 4890 // masking, fallback to a vectorization with a scalar epilogue. 4891 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4892 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4893 "scalar epilogue instead.\n"); 4894 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4895 return MaxFactors; 4896 } 4897 4898 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 4899 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 4900 return FixedScalableVFPair::getNone(); 4901 } 4902 4903 if (TC == 0) { 4904 reportVectorizationFailure( 4905 "Unable to calculate the loop count due to complex control flow", 4906 "unable to calculate the loop count due to complex control flow", 4907 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4908 return FixedScalableVFPair::getNone(); 4909 } 4910 4911 reportVectorizationFailure( 4912 "Cannot optimize for size and vectorize at the same time.", 4913 "cannot optimize for size and vectorize at the same time. " 4914 "Enable vectorization of this loop with '#pragma clang loop " 4915 "vectorize(enable)' when compiling with -Os/-Oz", 4916 "NoTailLoopWithOptForSize", ORE, TheLoop); 4917 return FixedScalableVFPair::getNone(); 4918 } 4919 4920 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 4921 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, 4922 ElementCount MaxSafeVF, bool FoldTailByMasking) { 4923 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 4924 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 4925 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4926 : TargetTransformInfo::RGK_FixedWidthVector); 4927 4928 // Convenience function to return the minimum of two ElementCounts. 4929 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 4930 assert((LHS.isScalable() == RHS.isScalable()) && 4931 "Scalable flags must match"); 4932 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 4933 }; 4934 4935 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 4936 // Note that both WidestRegister and WidestType may not be a powers of 2. 4937 auto MaxVectorElementCount = ElementCount::get( 4938 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), 4939 ComputeScalableMaxVF); 4940 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 4941 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4942 << (MaxVectorElementCount * WidestType) << " bits.\n"); 4943 4944 if (!MaxVectorElementCount) { 4945 LLVM_DEBUG(dbgs() << "LV: The target has no " 4946 << (ComputeScalableMaxVF ? "scalable" : "fixed") 4947 << " vector registers.\n"); 4948 return ElementCount::getFixed(1); 4949 } 4950 4951 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 4952 if (MaxVectorElementCount.isScalable() && 4953 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 4954 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 4955 auto Min = Attr.getVScaleRangeMin(); 4956 WidestRegisterMinEC *= Min; 4957 } 4958 4959 // When a scalar epilogue is required, at least one iteration of the scalar 4960 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a 4961 // max VF that results in a dead vector loop. 4962 if (MaxTripCount > 0 && requiresScalarEpilogue(true)) 4963 MaxTripCount -= 1; 4964 4965 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && 4966 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { 4967 // If upper bound loop trip count (TC) is known at compile time there is no 4968 // point in choosing VF greater than TC (as done in the loop below). Select 4969 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is 4970 // scalable, we only fall back on a fixed VF when the TC is less than or 4971 // equal to the known number of lanes. 4972 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); 4973 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 4974 "exceeding the constant trip count: " 4975 << ClampedUpperTripCount << "\n"); 4976 return ElementCount::get( 4977 ClampedUpperTripCount, 4978 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); 4979 } 4980 4981 TargetTransformInfo::RegisterKind RegKind = 4982 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4983 : TargetTransformInfo::RGK_FixedWidthVector; 4984 ElementCount MaxVF = MaxVectorElementCount; 4985 if (MaximizeBandwidth || 4986 (MaximizeBandwidth.getNumOccurrences() == 0 && 4987 (TTI.shouldMaximizeVectorBandwidth(RegKind) || 4988 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { 4989 auto MaxVectorElementCountMaxBW = ElementCount::get( 4990 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), 4991 ComputeScalableMaxVF); 4992 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 4993 4994 // Collect all viable vectorization factors larger than the default MaxVF 4995 // (i.e. MaxVectorElementCount). 4996 SmallVector<ElementCount, 8> VFs; 4997 for (ElementCount VS = MaxVectorElementCount * 2; 4998 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 4999 VFs.push_back(VS); 5000 5001 // For each VF calculate its register usage. 5002 auto RUs = calculateRegisterUsage(VFs); 5003 5004 // Select the largest VF which doesn't require more registers than existing 5005 // ones. 5006 for (int i = RUs.size() - 1; i >= 0; --i) { 5007 bool Selected = true; 5008 for (auto &pair : RUs[i].MaxLocalUsers) { 5009 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5010 if (pair.second > TargetNumRegisters) 5011 Selected = false; 5012 } 5013 if (Selected) { 5014 MaxVF = VFs[i]; 5015 break; 5016 } 5017 } 5018 if (ElementCount MinVF = 5019 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5020 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5021 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5022 << ") with target's minimum: " << MinVF << '\n'); 5023 MaxVF = MinVF; 5024 } 5025 } 5026 5027 // Invalidate any widening decisions we might have made, in case the loop 5028 // requires prediction (decided later), but we have already made some 5029 // load/store widening decisions. 5030 invalidateCostModelingDecisions(); 5031 } 5032 return MaxVF; 5033 } 5034 5035 /// Convenience function that returns the value of vscale_range iff 5036 /// vscale_range.min == vscale_range.max or otherwise returns the value 5037 /// returned by the corresponding TTI method. 5038 static std::optional<unsigned> 5039 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 5040 const Function *Fn = L->getHeader()->getParent(); 5041 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 5042 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 5043 auto Min = Attr.getVScaleRangeMin(); 5044 auto Max = Attr.getVScaleRangeMax(); 5045 if (Max && Min == Max) 5046 return Max; 5047 } 5048 5049 return TTI.getVScaleForTuning(); 5050 } 5051 5052 bool LoopVectorizationPlanner::isMoreProfitable( 5053 const VectorizationFactor &A, const VectorizationFactor &B) const { 5054 InstructionCost CostA = A.Cost; 5055 InstructionCost CostB = B.Cost; 5056 5057 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); 5058 5059 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { 5060 // If the trip count is a known (possibly small) constant, the trip count 5061 // will be rounded up to an integer number of iterations under 5062 // FoldTailByMasking. The total cost in that case will be 5063 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 5064 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 5065 // some extra overheads, but for the purpose of comparing the costs of 5066 // different VFs we can use this to compare the total loop-body cost 5067 // expected after vectorization. 5068 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 5069 InstructionCost VectorCost, 5070 InstructionCost ScalarCost) { 5071 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) 5072 : VectorCost * (MaxTripCount / VF) + 5073 ScalarCost * (MaxTripCount % VF); 5074 }; 5075 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); 5076 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); 5077 5078 return RTCostA < RTCostB; 5079 } 5080 5081 // Improve estimate for the vector width if it is scalable. 5082 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5083 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5084 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 5085 if (A.Width.isScalable()) 5086 EstimatedWidthA *= *VScale; 5087 if (B.Width.isScalable()) 5088 EstimatedWidthB *= *VScale; 5089 } 5090 5091 // Assume vscale may be larger than 1 (or the value being tuned for), 5092 // so that scalable vectorization is slightly favorable over fixed-width 5093 // vectorization. 5094 if (A.Width.isScalable() && !B.Width.isScalable()) 5095 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5096 5097 // To avoid the need for FP division: 5098 // (CostA / A.Width) < (CostB / B.Width) 5099 // <=> (CostA * B.Width) < (CostB * A.Width) 5100 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5101 } 5102 5103 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, 5104 OptimizationRemarkEmitter *ORE, 5105 Loop *TheLoop) { 5106 if (InvalidCosts.empty()) 5107 return; 5108 5109 // Emit a report of VFs with invalid costs in the loop. 5110 5111 // Group the remarks per instruction, keeping the instruction order from 5112 // InvalidCosts. 5113 std::map<Instruction *, unsigned> Numbering; 5114 unsigned I = 0; 5115 for (auto &Pair : InvalidCosts) 5116 if (!Numbering.count(Pair.first)) 5117 Numbering[Pair.first] = I++; 5118 5119 // Sort the list, first on instruction(number) then on VF. 5120 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5121 if (Numbering[A.first] != Numbering[B.first]) 5122 return Numbering[A.first] < Numbering[B.first]; 5123 ElementCountComparator ECC; 5124 return ECC(A.second, B.second); 5125 }); 5126 5127 // For a list of ordered instruction-vf pairs: 5128 // [(load, vf1), (load, vf2), (store, vf1)] 5129 // Group the instructions together to emit separate remarks for: 5130 // load (vf1, vf2) 5131 // store (vf1) 5132 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5133 auto Subset = ArrayRef<InstructionVFPair>(); 5134 do { 5135 if (Subset.empty()) 5136 Subset = Tail.take_front(1); 5137 5138 Instruction *I = Subset.front().first; 5139 5140 // If the next instruction is different, or if there are no other pairs, 5141 // emit a remark for the collated subset. e.g. 5142 // [(load, vf1), (load, vf2))] 5143 // to emit: 5144 // remark: invalid costs for 'load' at VF=(vf, vf2) 5145 if (Subset == Tail || Tail[Subset.size()].first != I) { 5146 std::string OutString; 5147 raw_string_ostream OS(OutString); 5148 assert(!Subset.empty() && "Unexpected empty range"); 5149 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5150 for (const auto &Pair : Subset) 5151 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; 5152 OS << "):"; 5153 if (auto *CI = dyn_cast<CallInst>(I)) 5154 OS << " call to " << CI->getCalledFunction()->getName(); 5155 else 5156 OS << " " << I->getOpcodeName(); 5157 OS.flush(); 5158 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5159 Tail = Tail.drop_front(Subset.size()); 5160 Subset = {}; 5161 } else 5162 // Grow the subset by one element 5163 Subset = Tail.take_front(Subset.size() + 1); 5164 } while (!Tail.empty()); 5165 } 5166 5167 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( 5168 const ElementCountSet &VFCandidates) { 5169 InstructionCost ExpectedCost = 5170 CM.expectedCost(ElementCount::getFixed(1)).first; 5171 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5172 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5173 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5174 "Expected Scalar VF to be a candidate"); 5175 5176 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5177 ExpectedCost); 5178 VectorizationFactor ChosenFactor = ScalarCost; 5179 5180 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 5181 if (ForceVectorization && VFCandidates.size() > 1) { 5182 // Ignore scalar width, because the user explicitly wants vectorization. 5183 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5184 // evaluation. 5185 ChosenFactor.Cost = InstructionCost::getMax(); 5186 } 5187 5188 SmallVector<InstructionVFPair> InvalidCosts; 5189 for (const auto &i : VFCandidates) { 5190 // The cost for scalar VF=1 is already calculated, so ignore it. 5191 if (i.isScalar()) 5192 continue; 5193 5194 LoopVectorizationCostModel::VectorizationCostTy C = 5195 CM.expectedCost(i, &InvalidCosts); 5196 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5197 5198 #ifndef NDEBUG 5199 unsigned AssumedMinimumVscale = 1; 5200 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 5201 AssumedMinimumVscale = *VScale; 5202 unsigned Width = 5203 Candidate.Width.isScalable() 5204 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5205 : Candidate.Width.getFixedValue(); 5206 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5207 << " costs: " << (Candidate.Cost / Width)); 5208 if (i.isScalable()) 5209 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5210 << AssumedMinimumVscale << ")"); 5211 LLVM_DEBUG(dbgs() << ".\n"); 5212 #endif 5213 5214 if (!C.second && !ForceVectorization) { 5215 LLVM_DEBUG( 5216 dbgs() << "LV: Not considering vector loop of width " << i 5217 << " because it will not generate any vector instructions.\n"); 5218 continue; 5219 } 5220 5221 // If profitable add it to ProfitableVF list. 5222 if (isMoreProfitable(Candidate, ScalarCost)) 5223 ProfitableVFs.push_back(Candidate); 5224 5225 if (isMoreProfitable(Candidate, ChosenFactor)) 5226 ChosenFactor = Candidate; 5227 } 5228 5229 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); 5230 5231 if (!EnableCondStoresVectorization && CM.hasPredStores()) { 5232 reportVectorizationFailure( 5233 "There are conditional stores.", 5234 "store that is conditionally executed prevents vectorization", 5235 "ConditionalStore", ORE, OrigLoop); 5236 ChosenFactor = ScalarCost; 5237 } 5238 5239 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5240 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() 5241 << "LV: Vectorization seems to be not beneficial, " 5242 << "but was forced by a user.\n"); 5243 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5244 return ChosenFactor; 5245 } 5246 5247 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( 5248 ElementCount VF) const { 5249 // Cross iteration phis such as reductions need special handling and are 5250 // currently unsupported. 5251 if (any_of(OrigLoop->getHeader()->phis(), 5252 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) 5253 return false; 5254 5255 // Phis with uses outside of the loop require special handling and are 5256 // currently unsupported. 5257 for (const auto &Entry : Legal->getInductionVars()) { 5258 // Look for uses of the value of the induction at the last iteration. 5259 Value *PostInc = 5260 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 5261 for (User *U : PostInc->users()) 5262 if (!OrigLoop->contains(cast<Instruction>(U))) 5263 return false; 5264 // Look for uses of penultimate value of the induction. 5265 for (User *U : Entry.first->users()) 5266 if (!OrigLoop->contains(cast<Instruction>(U))) 5267 return false; 5268 } 5269 5270 // Epilogue vectorization code has not been auditted to ensure it handles 5271 // non-latch exits properly. It may be fine, but it needs auditted and 5272 // tested. 5273 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) 5274 return false; 5275 5276 return true; 5277 } 5278 5279 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5280 const ElementCount VF) const { 5281 // FIXME: We need a much better cost-model to take different parameters such 5282 // as register pressure, code size increase and cost of extra branches into 5283 // account. For now we apply a very crude heuristic and only consider loops 5284 // with vectorization factors larger than a certain value. 5285 5286 // Allow the target to opt out entirely. 5287 if (!TTI.preferEpilogueVectorization()) 5288 return false; 5289 5290 // We also consider epilogue vectorization unprofitable for targets that don't 5291 // consider interleaving beneficial (eg. MVE). 5292 if (TTI.getMaxInterleaveFactor(VF) <= 1) 5293 return false; 5294 5295 unsigned Multiplier = 1; 5296 if (VF.isScalable()) 5297 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); 5298 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) 5299 return true; 5300 return false; 5301 } 5302 5303 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( 5304 const ElementCount MainLoopVF, unsigned IC) { 5305 VectorizationFactor Result = VectorizationFactor::Disabled(); 5306 if (!EnableEpilogueVectorization) { 5307 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); 5308 return Result; 5309 } 5310 5311 if (!CM.isScalarEpilogueAllowed()) { 5312 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " 5313 "epilogue is allowed.\n"); 5314 return Result; 5315 } 5316 5317 // Not really a cost consideration, but check for unsupported cases here to 5318 // simplify the logic. 5319 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 5320 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 5321 "is not a supported candidate.\n"); 5322 return Result; 5323 } 5324 5325 if (EpilogueVectorizationForceVF > 1) { 5326 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 5327 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5328 if (hasPlanWithVF(ForcedEC)) 5329 return {ForcedEC, 0, 0}; 5330 else { 5331 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 5332 "viable.\n"); 5333 return Result; 5334 } 5335 } 5336 5337 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 5338 OrigLoop->getHeader()->getParent()->hasMinSize()) { 5339 LLVM_DEBUG( 5340 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 5341 return Result; 5342 } 5343 5344 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { 5345 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5346 "this loop\n"); 5347 return Result; 5348 } 5349 5350 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5351 // the main loop handles 8 lanes per iteration. We could still benefit from 5352 // vectorizing the epilogue loop with VF=4. 5353 ElementCount EstimatedRuntimeVF = MainLoopVF; 5354 if (MainLoopVF.isScalable()) { 5355 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5356 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 5357 EstimatedRuntimeVF *= *VScale; 5358 } 5359 5360 ScalarEvolution &SE = *PSE.getSE(); 5361 Type *TCType = Legal->getWidestInductionType(); 5362 const SCEV *RemainingIterations = nullptr; 5363 for (auto &NextVF : ProfitableVFs) { 5364 // Skip candidate VFs without a corresponding VPlan. 5365 if (!hasPlanWithVF(NextVF.Width)) 5366 continue; 5367 5368 // Skip candidate VFs with widths >= the estimate runtime VF (scalable 5369 // vectors) or the VF of the main loop (fixed vectors). 5370 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5371 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 5372 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) 5373 continue; 5374 5375 // If NextVF is greater than the number of remaining iterations, the 5376 // epilogue loop would be dead. Skip such factors. 5377 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 5378 // TODO: extend to support scalable VFs. 5379 if (!RemainingIterations) { 5380 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); 5381 RemainingIterations = SE.getURemExpr( 5382 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 5383 } 5384 if (SE.isKnownPredicate( 5385 CmpInst::ICMP_UGT, 5386 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 5387 RemainingIterations)) 5388 continue; 5389 } 5390 5391 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) 5392 Result = NextVF; 5393 } 5394 5395 if (Result != VectorizationFactor::Disabled()) 5396 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5397 << Result.Width << "\n"); 5398 return Result; 5399 } 5400 5401 std::pair<unsigned, unsigned> 5402 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5403 unsigned MinWidth = -1U; 5404 unsigned MaxWidth = 8; 5405 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5406 // For in-loop reductions, no element types are added to ElementTypesInLoop 5407 // if there are no loads/stores in the loop. In this case, check through the 5408 // reduction variables to determine the maximum width. 5409 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5410 // Reset MaxWidth so that we can find the smallest type used by recurrences 5411 // in the loop. 5412 MaxWidth = -1U; 5413 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 5414 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5415 // When finding the min width used by the recurrence we need to account 5416 // for casts on the input operands of the recurrence. 5417 MaxWidth = std::min<unsigned>( 5418 MaxWidth, std::min<unsigned>( 5419 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5420 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5421 } 5422 } else { 5423 for (Type *T : ElementTypesInLoop) { 5424 MinWidth = std::min<unsigned>( 5425 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5426 MaxWidth = std::max<unsigned>( 5427 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5428 } 5429 } 5430 return {MinWidth, MaxWidth}; 5431 } 5432 5433 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5434 ElementTypesInLoop.clear(); 5435 // For each block. 5436 for (BasicBlock *BB : TheLoop->blocks()) { 5437 // For each instruction in the loop. 5438 for (Instruction &I : BB->instructionsWithoutDebug()) { 5439 Type *T = I.getType(); 5440 5441 // Skip ignored values. 5442 if (ValuesToIgnore.count(&I)) 5443 continue; 5444 5445 // Only examine Loads, Stores and PHINodes. 5446 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5447 continue; 5448 5449 // Examine PHI nodes that are reduction variables. Update the type to 5450 // account for the recurrence type. 5451 if (auto *PN = dyn_cast<PHINode>(&I)) { 5452 if (!Legal->isReductionVariable(PN)) 5453 continue; 5454 const RecurrenceDescriptor &RdxDesc = 5455 Legal->getReductionVars().find(PN)->second; 5456 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5457 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5458 RdxDesc.getRecurrenceType(), 5459 TargetTransformInfo::ReductionFlags())) 5460 continue; 5461 T = RdxDesc.getRecurrenceType(); 5462 } 5463 5464 // Examine the stored values. 5465 if (auto *ST = dyn_cast<StoreInst>(&I)) 5466 T = ST->getValueOperand()->getType(); 5467 5468 assert(T->isSized() && 5469 "Expected the load/store/recurrence type to be sized"); 5470 5471 ElementTypesInLoop.insert(T); 5472 } 5473 } 5474 } 5475 5476 unsigned 5477 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5478 InstructionCost LoopCost) { 5479 // -- The interleave heuristics -- 5480 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5481 // There are many micro-architectural considerations that we can't predict 5482 // at this level. For example, frontend pressure (on decode or fetch) due to 5483 // code size, or the number and capabilities of the execution ports. 5484 // 5485 // We use the following heuristics to select the interleave count: 5486 // 1. If the code has reductions, then we interleave to break the cross 5487 // iteration dependency. 5488 // 2. If the loop is really small, then we interleave to reduce the loop 5489 // overhead. 5490 // 3. We don't interleave if we think that we will spill registers to memory 5491 // due to the increased register pressure. 5492 5493 if (!isScalarEpilogueAllowed()) 5494 return 1; 5495 5496 // We used the distance for the interleave count. 5497 if (!Legal->isSafeForAnyVectorWidth()) 5498 return 1; 5499 5500 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5501 const bool HasReductions = !Legal->getReductionVars().empty(); 5502 // Do not interleave loops with a relatively small known or estimated trip 5503 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5504 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5505 // because with the above conditions interleaving can expose ILP and break 5506 // cross iteration dependences for reductions. 5507 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5508 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5509 return 1; 5510 5511 // If we did not calculate the cost for VF (because the user selected the VF) 5512 // then we calculate the cost of VF here. 5513 if (LoopCost == 0) { 5514 LoopCost = expectedCost(VF).first; 5515 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 5516 5517 // Loop body is free and there is no need for interleaving. 5518 if (LoopCost == 0) 5519 return 1; 5520 } 5521 5522 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5523 // We divide by these constants so assume that we have at least one 5524 // instruction that uses at least one register. 5525 for (auto& pair : R.MaxLocalUsers) { 5526 pair.second = std::max(pair.second, 1U); 5527 } 5528 5529 // We calculate the interleave count using the following formula. 5530 // Subtract the number of loop invariants from the number of available 5531 // registers. These registers are used by all of the interleaved instances. 5532 // Next, divide the remaining registers by the number of registers that is 5533 // required by the loop, in order to estimate how many parallel instances 5534 // fit without causing spills. All of this is rounded down if necessary to be 5535 // a power of two. We want power of two interleave count to simplify any 5536 // addressing operations or alignment considerations. 5537 // We also want power of two interleave counts to ensure that the induction 5538 // variable of the vector loop wraps to zero, when tail is folded by masking; 5539 // this currently happens when OptForSize, in which case IC is set to 1 above. 5540 unsigned IC = UINT_MAX; 5541 5542 for (auto& pair : R.MaxLocalUsers) { 5543 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5544 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5545 << " registers of " 5546 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5547 if (VF.isScalar()) { 5548 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5549 TargetNumRegisters = ForceTargetNumScalarRegs; 5550 } else { 5551 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5552 TargetNumRegisters = ForceTargetNumVectorRegs; 5553 } 5554 unsigned MaxLocalUsers = pair.second; 5555 unsigned LoopInvariantRegs = 0; 5556 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5557 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5558 5559 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 5560 MaxLocalUsers); 5561 // Don't count the induction variable as interleaved. 5562 if (EnableIndVarRegisterHeur) { 5563 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5564 std::max(1U, (MaxLocalUsers - 1))); 5565 } 5566 5567 IC = std::min(IC, TmpIC); 5568 } 5569 5570 // Clamp the interleave ranges to reasonable counts. 5571 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5572 5573 // Check if the user has overridden the max. 5574 if (VF.isScalar()) { 5575 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5576 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5577 } else { 5578 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5579 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5580 } 5581 5582 // If trip count is known or estimated compile time constant, limit the 5583 // interleave count to be less than the trip count divided by VF, provided it 5584 // is at least 1. 5585 // 5586 // For scalable vectors we can't know if interleaving is beneficial. It may 5587 // not be beneficial for small loops if none of the lanes in the second vector 5588 // iterations is enabled. However, for larger loops, there is likely to be a 5589 // similar benefit as for fixed-width vectors. For now, we choose to leave 5590 // the InterleaveCount as if vscale is '1', although if some information about 5591 // the vector is known (e.g. min vector size), we can make a better decision. 5592 if (BestKnownTC) { 5593 MaxInterleaveCount = 5594 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5595 // Make sure MaxInterleaveCount is greater than 0. 5596 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5597 } 5598 5599 assert(MaxInterleaveCount > 0 && 5600 "Maximum interleave count must be greater than 0"); 5601 5602 // Clamp the calculated IC to be between the 1 and the max interleave count 5603 // that the target and trip count allows. 5604 if (IC > MaxInterleaveCount) 5605 IC = MaxInterleaveCount; 5606 else 5607 // Make sure IC is greater than 0. 5608 IC = std::max(1u, IC); 5609 5610 assert(IC > 0 && "Interleave count must be greater than 0."); 5611 5612 // Interleave if we vectorized this loop and there is a reduction that could 5613 // benefit from interleaving. 5614 if (VF.isVector() && HasReductions) { 5615 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5616 return IC; 5617 } 5618 5619 // For any scalar loop that either requires runtime checks or predication we 5620 // are better off leaving this to the unroller. Note that if we've already 5621 // vectorized the loop we will have done the runtime check and so interleaving 5622 // won't require further checks. 5623 bool ScalarInterleavingRequiresPredication = 5624 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5625 return Legal->blockNeedsPredication(BB); 5626 })); 5627 bool ScalarInterleavingRequiresRuntimePointerCheck = 5628 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5629 5630 // We want to interleave small loops in order to reduce the loop overhead and 5631 // potentially expose ILP opportunities. 5632 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5633 << "LV: IC is " << IC << '\n' 5634 << "LV: VF is " << VF << '\n'); 5635 const bool AggressivelyInterleaveReductions = 5636 TTI.enableAggressiveInterleaving(HasReductions); 5637 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5638 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5639 // We assume that the cost overhead is 1 and we use the cost model 5640 // to estimate the cost of the loop and interleave until the cost of the 5641 // loop overhead is about 5% of the cost of the loop. 5642 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5643 SmallLoopCost / *LoopCost.getValue())); 5644 5645 // Interleave until store/load ports (estimated by max interleave count) are 5646 // saturated. 5647 unsigned NumStores = Legal->getNumStores(); 5648 unsigned NumLoads = Legal->getNumLoads(); 5649 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5650 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5651 5652 // There is little point in interleaving for reductions containing selects 5653 // and compares when VF=1 since it may just create more overhead than it's 5654 // worth for loops with small trip counts. This is because we still have to 5655 // do the final reduction after the loop. 5656 bool HasSelectCmpReductions = 5657 HasReductions && 5658 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5659 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5660 return RecurrenceDescriptor::isAnyOfRecurrenceKind( 5661 RdxDesc.getRecurrenceKind()); 5662 }); 5663 if (HasSelectCmpReductions) { 5664 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5665 return 1; 5666 } 5667 5668 // If we have a scalar reduction (vector reductions are already dealt with 5669 // by this point), we can increase the critical path length if the loop 5670 // we're interleaving is inside another loop. For tree-wise reductions 5671 // set the limit to 2, and for ordered reductions it's best to disable 5672 // interleaving entirely. 5673 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5674 bool HasOrderedReductions = 5675 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5676 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5677 return RdxDesc.isOrdered(); 5678 }); 5679 if (HasOrderedReductions) { 5680 LLVM_DEBUG( 5681 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5682 return 1; 5683 } 5684 5685 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5686 SmallIC = std::min(SmallIC, F); 5687 StoresIC = std::min(StoresIC, F); 5688 LoadsIC = std::min(LoadsIC, F); 5689 } 5690 5691 if (EnableLoadStoreRuntimeInterleave && 5692 std::max(StoresIC, LoadsIC) > SmallIC) { 5693 LLVM_DEBUG( 5694 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5695 return std::max(StoresIC, LoadsIC); 5696 } 5697 5698 // If there are scalar reductions and TTI has enabled aggressive 5699 // interleaving for reductions, we will interleave to expose ILP. 5700 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5701 AggressivelyInterleaveReductions) { 5702 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5703 // Interleave no less than SmallIC but not as aggressive as the normal IC 5704 // to satisfy the rare situation when resources are too limited. 5705 return std::max(IC / 2, SmallIC); 5706 } else { 5707 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5708 return SmallIC; 5709 } 5710 } 5711 5712 // Interleave if this is a large loop (small loops are already dealt with by 5713 // this point) that could benefit from interleaving. 5714 if (AggressivelyInterleaveReductions) { 5715 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5716 return IC; 5717 } 5718 5719 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5720 return 1; 5721 } 5722 5723 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5724 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5725 // This function calculates the register usage by measuring the highest number 5726 // of values that are alive at a single location. Obviously, this is a very 5727 // rough estimation. We scan the loop in a topological order in order and 5728 // assign a number to each instruction. We use RPO to ensure that defs are 5729 // met before their users. We assume that each instruction that has in-loop 5730 // users starts an interval. We record every time that an in-loop value is 5731 // used, so we have a list of the first and last occurrences of each 5732 // instruction. Next, we transpose this data structure into a multi map that 5733 // holds the list of intervals that *end* at a specific location. This multi 5734 // map allows us to perform a linear search. We scan the instructions linearly 5735 // and record each time that a new interval starts, by placing it in a set. 5736 // If we find this value in the multi-map then we remove it from the set. 5737 // The max register usage is the maximum size of the set. 5738 // We also search for instructions that are defined outside the loop, but are 5739 // used inside the loop. We need this number separately from the max-interval 5740 // usage number because when we unroll, loop-invariant values do not take 5741 // more register. 5742 LoopBlocksDFS DFS(TheLoop); 5743 DFS.perform(LI); 5744 5745 RegisterUsage RU; 5746 5747 // Each 'key' in the map opens a new interval. The values 5748 // of the map are the index of the 'last seen' usage of the 5749 // instruction that is the key. 5750 using IntervalMap = DenseMap<Instruction *, unsigned>; 5751 5752 // Maps instruction to its index. 5753 SmallVector<Instruction *, 64> IdxToInstr; 5754 // Marks the end of each interval. 5755 IntervalMap EndPoint; 5756 // Saves the list of instruction indices that are used in the loop. 5757 SmallPtrSet<Instruction *, 8> Ends; 5758 // Saves the list of values that are used in the loop but are defined outside 5759 // the loop (not including non-instruction values such as arguments and 5760 // constants). 5761 SmallSetVector<Instruction *, 8> LoopInvariants; 5762 5763 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5764 for (Instruction &I : BB->instructionsWithoutDebug()) { 5765 IdxToInstr.push_back(&I); 5766 5767 // Save the end location of each USE. 5768 for (Value *U : I.operands()) { 5769 auto *Instr = dyn_cast<Instruction>(U); 5770 5771 // Ignore non-instruction values such as arguments, constants, etc. 5772 // FIXME: Might need some motivation why these values are ignored. If 5773 // for example an argument is used inside the loop it will increase the 5774 // register pressure (so shouldn't we add it to LoopInvariants). 5775 if (!Instr) 5776 continue; 5777 5778 // If this instruction is outside the loop then record it and continue. 5779 if (!TheLoop->contains(Instr)) { 5780 LoopInvariants.insert(Instr); 5781 continue; 5782 } 5783 5784 // Overwrite previous end points. 5785 EndPoint[Instr] = IdxToInstr.size(); 5786 Ends.insert(Instr); 5787 } 5788 } 5789 } 5790 5791 // Saves the list of intervals that end with the index in 'key'. 5792 using InstrList = SmallVector<Instruction *, 2>; 5793 DenseMap<unsigned, InstrList> TransposeEnds; 5794 5795 // Transpose the EndPoints to a list of values that end at each index. 5796 for (auto &Interval : EndPoint) 5797 TransposeEnds[Interval.second].push_back(Interval.first); 5798 5799 SmallPtrSet<Instruction *, 8> OpenIntervals; 5800 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5801 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5802 5803 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5804 5805 const auto &TTICapture = TTI; 5806 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5807 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5808 return 0; 5809 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5810 }; 5811 5812 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5813 Instruction *I = IdxToInstr[i]; 5814 5815 // Remove all of the instructions that end at this location. 5816 InstrList &List = TransposeEnds[i]; 5817 for (Instruction *ToRemove : List) 5818 OpenIntervals.erase(ToRemove); 5819 5820 // Ignore instructions that are never used within the loop. 5821 if (!Ends.count(I)) 5822 continue; 5823 5824 // Skip ignored values. 5825 if (ValuesToIgnore.count(I)) 5826 continue; 5827 5828 collectInLoopReductions(); 5829 5830 // For each VF find the maximum usage of registers. 5831 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5832 // Count the number of registers used, per register class, given all open 5833 // intervals. 5834 // Note that elements in this SmallMapVector will be default constructed 5835 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 5836 // there is no previous entry for ClassID. 5837 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5838 5839 if (VFs[j].isScalar()) { 5840 for (auto *Inst : OpenIntervals) { 5841 unsigned ClassID = 5842 TTI.getRegisterClassForType(false, Inst->getType()); 5843 // FIXME: The target might use more than one register for the type 5844 // even in the scalar case. 5845 RegUsage[ClassID] += 1; 5846 } 5847 } else { 5848 collectUniformsAndScalars(VFs[j]); 5849 for (auto *Inst : OpenIntervals) { 5850 // Skip ignored values for VF > 1. 5851 if (VecValuesToIgnore.count(Inst)) 5852 continue; 5853 if (isScalarAfterVectorization(Inst, VFs[j])) { 5854 unsigned ClassID = 5855 TTI.getRegisterClassForType(false, Inst->getType()); 5856 // FIXME: The target might use more than one register for the type 5857 // even in the scalar case. 5858 RegUsage[ClassID] += 1; 5859 } else { 5860 unsigned ClassID = 5861 TTI.getRegisterClassForType(true, Inst->getType()); 5862 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5863 } 5864 } 5865 } 5866 5867 for (auto& pair : RegUsage) { 5868 auto &Entry = MaxUsages[j][pair.first]; 5869 Entry = std::max(Entry, pair.second); 5870 } 5871 } 5872 5873 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5874 << OpenIntervals.size() << '\n'); 5875 5876 // Add the current instruction to the list of open intervals. 5877 OpenIntervals.insert(I); 5878 } 5879 5880 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5881 // Note that elements in this SmallMapVector will be default constructed 5882 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 5883 // there is no previous entry for ClassID. 5884 SmallMapVector<unsigned, unsigned, 4> Invariant; 5885 5886 for (auto *Inst : LoopInvariants) { 5887 // FIXME: The target might use more than one register for the type 5888 // even in the scalar case. 5889 bool IsScalar = all_of(Inst->users(), [&](User *U) { 5890 auto *I = cast<Instruction>(U); 5891 return TheLoop != LI->getLoopFor(I->getParent()) || 5892 isScalarAfterVectorization(I, VFs[i]); 5893 }); 5894 5895 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; 5896 unsigned ClassID = 5897 TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); 5898 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); 5899 } 5900 5901 LLVM_DEBUG({ 5902 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5903 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5904 << " item\n"; 5905 for (const auto &pair : MaxUsages[i]) { 5906 dbgs() << "LV(REG): RegisterClass: " 5907 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5908 << " registers\n"; 5909 } 5910 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5911 << " item\n"; 5912 for (const auto &pair : Invariant) { 5913 dbgs() << "LV(REG): RegisterClass: " 5914 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5915 << " registers\n"; 5916 } 5917 }); 5918 5919 RU.LoopInvariantRegs = Invariant; 5920 RU.MaxLocalUsers = MaxUsages[i]; 5921 RUs[i] = RU; 5922 } 5923 5924 return RUs; 5925 } 5926 5927 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 5928 ElementCount VF) { 5929 // TODO: Cost model for emulated masked load/store is completely 5930 // broken. This hack guides the cost model to use an artificially 5931 // high enough value to practically disable vectorization with such 5932 // operations, except where previously deployed legality hack allowed 5933 // using very low cost values. This is to avoid regressions coming simply 5934 // from moving "masked load/store" check from legality to cost model. 5935 // Masked Load/Gather emulation was previously never allowed. 5936 // Limited number of Masked Store/Scatter emulation was allowed. 5937 assert((isPredicatedInst(I)) && 5938 "Expecting a scalar emulated instruction"); 5939 return isa<LoadInst>(I) || 5940 (isa<StoreInst>(I) && 5941 NumPredStores > NumberOfStoresToPredicate); 5942 } 5943 5944 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5945 // If we aren't vectorizing the loop, or if we've already collected the 5946 // instructions to scalarize, there's nothing to do. Collection may already 5947 // have occurred if we have a user-selected VF and are now computing the 5948 // expected cost for interleaving. 5949 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) 5950 return; 5951 5952 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5953 // not profitable to scalarize any instructions, the presence of VF in the 5954 // map will indicate that we've analyzed it already. 5955 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5956 5957 PredicatedBBsAfterVectorization[VF].clear(); 5958 5959 // Find all the instructions that are scalar with predication in the loop and 5960 // determine if it would be better to not if-convert the blocks they are in. 5961 // If so, we also record the instructions to scalarize. 5962 for (BasicBlock *BB : TheLoop->blocks()) { 5963 if (!blockNeedsPredicationForAnyReason(BB)) 5964 continue; 5965 for (Instruction &I : *BB) 5966 if (isScalarWithPredication(&I, VF)) { 5967 ScalarCostsTy ScalarCosts; 5968 // Do not apply discount if scalable, because that would lead to 5969 // invalid scalarization costs. 5970 // Do not apply discount logic if hacked cost is needed 5971 // for emulated masked memrefs. 5972 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 5973 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5974 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5975 // Remember that BB will remain after vectorization. 5976 PredicatedBBsAfterVectorization[VF].insert(BB); 5977 } 5978 } 5979 } 5980 5981 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 5982 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 5983 assert(!isUniformAfterVectorization(PredInst, VF) && 5984 "Instruction marked uniform-after-vectorization will be predicated"); 5985 5986 // Initialize the discount to zero, meaning that the scalar version and the 5987 // vector version cost the same. 5988 InstructionCost Discount = 0; 5989 5990 // Holds instructions to analyze. The instructions we visit are mapped in 5991 // ScalarCosts. Those instructions are the ones that would be scalarized if 5992 // we find that the scalar version costs less. 5993 SmallVector<Instruction *, 8> Worklist; 5994 5995 // Returns true if the given instruction can be scalarized. 5996 auto canBeScalarized = [&](Instruction *I) -> bool { 5997 // We only attempt to scalarize instructions forming a single-use chain 5998 // from the original predicated block that would otherwise be vectorized. 5999 // Although not strictly necessary, we give up on instructions we know will 6000 // already be scalar to avoid traversing chains that are unlikely to be 6001 // beneficial. 6002 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6003 isScalarAfterVectorization(I, VF)) 6004 return false; 6005 6006 // If the instruction is scalar with predication, it will be analyzed 6007 // separately. We ignore it within the context of PredInst. 6008 if (isScalarWithPredication(I, VF)) 6009 return false; 6010 6011 // If any of the instruction's operands are uniform after vectorization, 6012 // the instruction cannot be scalarized. This prevents, for example, a 6013 // masked load from being scalarized. 6014 // 6015 // We assume we will only emit a value for lane zero of an instruction 6016 // marked uniform after vectorization, rather than VF identical values. 6017 // Thus, if we scalarize an instruction that uses a uniform, we would 6018 // create uses of values corresponding to the lanes we aren't emitting code 6019 // for. This behavior can be changed by allowing getScalarValue to clone 6020 // the lane zero values for uniforms rather than asserting. 6021 for (Use &U : I->operands()) 6022 if (auto *J = dyn_cast<Instruction>(U.get())) 6023 if (isUniformAfterVectorization(J, VF)) 6024 return false; 6025 6026 // Otherwise, we can scalarize the instruction. 6027 return true; 6028 }; 6029 6030 // Compute the expected cost discount from scalarizing the entire expression 6031 // feeding the predicated instruction. We currently only consider expressions 6032 // that are single-use instruction chains. 6033 Worklist.push_back(PredInst); 6034 while (!Worklist.empty()) { 6035 Instruction *I = Worklist.pop_back_val(); 6036 6037 // If we've already analyzed the instruction, there's nothing to do. 6038 if (ScalarCosts.contains(I)) 6039 continue; 6040 6041 // Compute the cost of the vector instruction. Note that this cost already 6042 // includes the scalarization overhead of the predicated instruction. 6043 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6044 6045 // Compute the cost of the scalarized instruction. This cost is the cost of 6046 // the instruction as if it wasn't if-converted and instead remained in the 6047 // predicated block. We will scale this cost by block probability after 6048 // computing the scalarization overhead. 6049 InstructionCost ScalarCost = 6050 VF.getFixedValue() * 6051 getInstructionCost(I, ElementCount::getFixed(1)).first; 6052 6053 // Compute the scalarization overhead of needed insertelement instructions 6054 // and phi nodes. 6055 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6056 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6057 ScalarCost += TTI.getScalarizationOverhead( 6058 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6059 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 6060 /*Extract*/ false, CostKind); 6061 ScalarCost += 6062 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6063 } 6064 6065 // Compute the scalarization overhead of needed extractelement 6066 // instructions. For each of the instruction's operands, if the operand can 6067 // be scalarized, add it to the worklist; otherwise, account for the 6068 // overhead. 6069 for (Use &U : I->operands()) 6070 if (auto *J = dyn_cast<Instruction>(U.get())) { 6071 assert(VectorType::isValidElementType(J->getType()) && 6072 "Instruction has non-scalar type"); 6073 if (canBeScalarized(J)) 6074 Worklist.push_back(J); 6075 else if (needsExtract(J, VF)) { 6076 ScalarCost += TTI.getScalarizationOverhead( 6077 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6078 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 6079 /*Extract*/ true, CostKind); 6080 } 6081 } 6082 6083 // Scale the total scalar cost by block probability. 6084 ScalarCost /= getReciprocalPredBlockProb(); 6085 6086 // Compute the discount. A non-negative discount means the vector version 6087 // of the instruction costs more, and scalarizing would be beneficial. 6088 Discount += VectorCost - ScalarCost; 6089 ScalarCosts[I] = ScalarCost; 6090 } 6091 6092 return Discount; 6093 } 6094 6095 LoopVectorizationCostModel::VectorizationCostTy 6096 LoopVectorizationCostModel::expectedCost( 6097 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6098 VectorizationCostTy Cost; 6099 6100 // For each block. 6101 for (BasicBlock *BB : TheLoop->blocks()) { 6102 VectorizationCostTy BlockCost; 6103 6104 // For each instruction in the old loop. 6105 for (Instruction &I : BB->instructionsWithoutDebug()) { 6106 // Skip ignored values. 6107 if (ValuesToIgnore.count(&I) || 6108 (VF.isVector() && VecValuesToIgnore.count(&I))) 6109 continue; 6110 6111 VectorizationCostTy C = getInstructionCost(&I, VF); 6112 6113 // Check if we should override the cost. 6114 if (C.first.isValid() && 6115 ForceTargetInstructionCost.getNumOccurrences() > 0) 6116 C.first = InstructionCost(ForceTargetInstructionCost); 6117 6118 // Keep a list of instructions with invalid costs. 6119 if (Invalid && !C.first.isValid()) 6120 Invalid->emplace_back(&I, VF); 6121 6122 BlockCost.first += C.first; 6123 BlockCost.second |= C.second; 6124 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6125 << " for VF " << VF << " For instruction: " << I 6126 << '\n'); 6127 } 6128 6129 // If we are vectorizing a predicated block, it will have been 6130 // if-converted. This means that the block's instructions (aside from 6131 // stores and instructions that may divide by zero) will now be 6132 // unconditionally executed. For the scalar case, we may not always execute 6133 // the predicated block, if it is an if-else block. Thus, scale the block's 6134 // cost by the probability of executing it. blockNeedsPredication from 6135 // Legal is used so as to not include all blocks in tail folded loops. 6136 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6137 BlockCost.first /= getReciprocalPredBlockProb(); 6138 6139 Cost.first += BlockCost.first; 6140 Cost.second |= BlockCost.second; 6141 } 6142 6143 return Cost; 6144 } 6145 6146 /// Gets Address Access SCEV after verifying that the access pattern 6147 /// is loop invariant except the induction variable dependence. 6148 /// 6149 /// This SCEV can be sent to the Target in order to estimate the address 6150 /// calculation cost. 6151 static const SCEV *getAddressAccessSCEV( 6152 Value *Ptr, 6153 LoopVectorizationLegality *Legal, 6154 PredicatedScalarEvolution &PSE, 6155 const Loop *TheLoop) { 6156 6157 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6158 if (!Gep) 6159 return nullptr; 6160 6161 // We are looking for a gep with all loop invariant indices except for one 6162 // which should be an induction variable. 6163 auto SE = PSE.getSE(); 6164 unsigned NumOperands = Gep->getNumOperands(); 6165 for (unsigned i = 1; i < NumOperands; ++i) { 6166 Value *Opd = Gep->getOperand(i); 6167 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6168 !Legal->isInductionVariable(Opd)) 6169 return nullptr; 6170 } 6171 6172 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6173 return PSE.getSCEV(Ptr); 6174 } 6175 6176 InstructionCost 6177 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6178 ElementCount VF) { 6179 assert(VF.isVector() && 6180 "Scalarization cost of instruction implies vectorization."); 6181 if (VF.isScalable()) 6182 return InstructionCost::getInvalid(); 6183 6184 Type *ValTy = getLoadStoreType(I); 6185 auto SE = PSE.getSE(); 6186 6187 unsigned AS = getLoadStoreAddressSpace(I); 6188 Value *Ptr = getLoadStorePointerOperand(I); 6189 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6190 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6191 // that it is being called from this specific place. 6192 6193 // Figure out whether the access is strided and get the stride value 6194 // if it's known in compile time 6195 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6196 6197 // Get the cost of the scalar memory instruction and address computation. 6198 InstructionCost Cost = 6199 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6200 6201 // Don't pass *I here, since it is scalar but will actually be part of a 6202 // vectorized loop where the user of it is a vectorized instruction. 6203 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6204 const Align Alignment = getLoadStoreAlignment(I); 6205 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 6206 ValTy->getScalarType(), 6207 Alignment, AS, CostKind); 6208 6209 // Get the overhead of the extractelement and insertelement instructions 6210 // we might create due to scalarization. 6211 Cost += getScalarizationOverhead(I, VF, CostKind); 6212 6213 // If we have a predicated load/store, it will need extra i1 extracts and 6214 // conditional branches, but may not be executed for each vector lane. Scale 6215 // the cost by the probability of executing the predicated block. 6216 if (isPredicatedInst(I)) { 6217 Cost /= getReciprocalPredBlockProb(); 6218 6219 // Add the cost of an i1 extract and a branch 6220 auto *Vec_i1Ty = 6221 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6222 Cost += TTI.getScalarizationOverhead( 6223 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6224 /*Insert=*/false, /*Extract=*/true, CostKind); 6225 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 6226 6227 if (useEmulatedMaskMemRefHack(I, VF)) 6228 // Artificially setting to a high enough value to practically disable 6229 // vectorization with such operations. 6230 Cost = 3000000; 6231 } 6232 6233 return Cost; 6234 } 6235 6236 InstructionCost 6237 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6238 ElementCount VF) { 6239 Type *ValTy = getLoadStoreType(I); 6240 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6241 Value *Ptr = getLoadStorePointerOperand(I); 6242 unsigned AS = getLoadStoreAddressSpace(I); 6243 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6244 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6245 6246 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6247 "Stride should be 1 or -1 for consecutive memory access"); 6248 const Align Alignment = getLoadStoreAlignment(I); 6249 InstructionCost Cost = 0; 6250 if (Legal->isMaskRequired(I)) { 6251 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6252 CostKind); 6253 } else { 6254 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6255 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6256 CostKind, OpInfo, I); 6257 } 6258 6259 bool Reverse = ConsecutiveStride < 0; 6260 if (Reverse) 6261 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6262 std::nullopt, CostKind, 0); 6263 return Cost; 6264 } 6265 6266 InstructionCost 6267 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6268 ElementCount VF) { 6269 assert(Legal->isUniformMemOp(*I, VF)); 6270 6271 Type *ValTy = getLoadStoreType(I); 6272 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6273 const Align Alignment = getLoadStoreAlignment(I); 6274 unsigned AS = getLoadStoreAddressSpace(I); 6275 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6276 if (isa<LoadInst>(I)) { 6277 return TTI.getAddressComputationCost(ValTy) + 6278 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6279 CostKind) + 6280 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6281 } 6282 StoreInst *SI = cast<StoreInst>(I); 6283 6284 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 6285 return TTI.getAddressComputationCost(ValTy) + 6286 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6287 CostKind) + 6288 (isLoopInvariantStoreValue 6289 ? 0 6290 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6291 CostKind, VF.getKnownMinValue() - 1)); 6292 } 6293 6294 InstructionCost 6295 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6296 ElementCount VF) { 6297 Type *ValTy = getLoadStoreType(I); 6298 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6299 const Align Alignment = getLoadStoreAlignment(I); 6300 const Value *Ptr = getLoadStorePointerOperand(I); 6301 6302 return TTI.getAddressComputationCost(VectorTy) + 6303 TTI.getGatherScatterOpCost( 6304 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6305 TargetTransformInfo::TCK_RecipThroughput, I); 6306 } 6307 6308 InstructionCost 6309 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6310 ElementCount VF) { 6311 Type *ValTy = getLoadStoreType(I); 6312 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6313 unsigned AS = getLoadStoreAddressSpace(I); 6314 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6315 6316 auto Group = getInterleavedAccessGroup(I); 6317 assert(Group && "Fail to get an interleaved access group."); 6318 6319 unsigned InterleaveFactor = Group->getFactor(); 6320 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6321 6322 // Holds the indices of existing members in the interleaved group. 6323 SmallVector<unsigned, 4> Indices; 6324 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6325 if (Group->getMember(IF)) 6326 Indices.push_back(IF); 6327 6328 // Calculate the cost of the whole interleaved group. 6329 bool UseMaskForGaps = 6330 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6331 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6332 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6333 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6334 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); 6335 6336 if (Group->isReverse()) { 6337 // TODO: Add support for reversed masked interleaved access. 6338 assert(!Legal->isMaskRequired(I) && 6339 "Reverse masked interleaved access not supported."); 6340 Cost += Group->getNumMembers() * 6341 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6342 std::nullopt, CostKind, 0); 6343 } 6344 return Cost; 6345 } 6346 6347 std::optional<InstructionCost> 6348 LoopVectorizationCostModel::getReductionPatternCost( 6349 Instruction *I, ElementCount VF, Type *Ty, 6350 TTI::TargetCostKind CostKind) const { 6351 using namespace llvm::PatternMatch; 6352 // Early exit for no inloop reductions 6353 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6354 return std::nullopt; 6355 auto *VectorTy = cast<VectorType>(Ty); 6356 6357 // We are looking for a pattern of, and finding the minimal acceptable cost: 6358 // reduce(mul(ext(A), ext(B))) or 6359 // reduce(mul(A, B)) or 6360 // reduce(ext(A)) or 6361 // reduce(A). 6362 // The basic idea is that we walk down the tree to do that, finding the root 6363 // reduction instruction in InLoopReductionImmediateChains. From there we find 6364 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6365 // of the components. If the reduction cost is lower then we return it for the 6366 // reduction instruction and 0 for the other instructions in the pattern. If 6367 // it is not we return an invalid cost specifying the orignal cost method 6368 // should be used. 6369 Instruction *RetI = I; 6370 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6371 if (!RetI->hasOneUser()) 6372 return std::nullopt; 6373 RetI = RetI->user_back(); 6374 } 6375 6376 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 6377 RetI->user_back()->getOpcode() == Instruction::Add) { 6378 RetI = RetI->user_back(); 6379 } 6380 6381 // Test if the found instruction is a reduction, and if not return an invalid 6382 // cost specifying the parent to use the original cost modelling. 6383 if (!InLoopReductionImmediateChains.count(RetI)) 6384 return std::nullopt; 6385 6386 // Find the reduction this chain is a part of and calculate the basic cost of 6387 // the reduction on its own. 6388 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); 6389 Instruction *ReductionPhi = LastChain; 6390 while (!isa<PHINode>(ReductionPhi)) 6391 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); 6392 6393 const RecurrenceDescriptor &RdxDesc = 6394 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6395 6396 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6397 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6398 6399 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6400 // normal fmul instruction to the cost of the fadd reduction. 6401 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6402 BaseCost += 6403 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6404 6405 // If we're using ordered reductions then we can just return the base cost 6406 // here, since getArithmeticReductionCost calculates the full ordered 6407 // reduction cost when FP reassociation is not allowed. 6408 if (useOrderedReductions(RdxDesc)) 6409 return BaseCost; 6410 6411 // Get the operand that was not the reduction chain and match it to one of the 6412 // patterns, returning the better cost if it is found. 6413 Instruction *RedOp = RetI->getOperand(1) == LastChain 6414 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6415 : dyn_cast<Instruction>(RetI->getOperand(1)); 6416 6417 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6418 6419 Instruction *Op0, *Op1; 6420 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6421 match(RedOp, 6422 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6423 match(Op0, m_ZExtOrSExt(m_Value())) && 6424 Op0->getOpcode() == Op1->getOpcode() && 6425 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6426 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6427 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6428 6429 // Matched reduce.add(ext(mul(ext(A), ext(B))) 6430 // Note that the extend opcodes need to all match, or if A==B they will have 6431 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6432 // which is equally fine. 6433 bool IsUnsigned = isa<ZExtInst>(Op0); 6434 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6435 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6436 6437 InstructionCost ExtCost = 6438 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6439 TTI::CastContextHint::None, CostKind, Op0); 6440 InstructionCost MulCost = 6441 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6442 InstructionCost Ext2Cost = 6443 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6444 TTI::CastContextHint::None, CostKind, RedOp); 6445 6446 InstructionCost RedCost = TTI.getMulAccReductionCost( 6447 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6448 6449 if (RedCost.isValid() && 6450 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6451 return I == RetI ? RedCost : 0; 6452 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6453 !TheLoop->isLoopInvariant(RedOp)) { 6454 // Matched reduce(ext(A)) 6455 bool IsUnsigned = isa<ZExtInst>(RedOp); 6456 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6457 InstructionCost RedCost = TTI.getExtendedReductionCost( 6458 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6459 RdxDesc.getFastMathFlags(), CostKind); 6460 6461 InstructionCost ExtCost = 6462 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6463 TTI::CastContextHint::None, CostKind, RedOp); 6464 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6465 return I == RetI ? RedCost : 0; 6466 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6467 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6468 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6469 Op0->getOpcode() == Op1->getOpcode() && 6470 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6471 bool IsUnsigned = isa<ZExtInst>(Op0); 6472 Type *Op0Ty = Op0->getOperand(0)->getType(); 6473 Type *Op1Ty = Op1->getOperand(0)->getType(); 6474 Type *LargestOpTy = 6475 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6476 : Op0Ty; 6477 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6478 6479 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 6480 // different sizes. We take the largest type as the ext to reduce, and add 6481 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6482 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6483 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6484 TTI::CastContextHint::None, CostKind, Op0); 6485 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6486 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6487 TTI::CastContextHint::None, CostKind, Op1); 6488 InstructionCost MulCost = 6489 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6490 6491 InstructionCost RedCost = TTI.getMulAccReductionCost( 6492 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6493 InstructionCost ExtraExtCost = 0; 6494 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6495 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6496 ExtraExtCost = TTI.getCastInstrCost( 6497 ExtraExtOp->getOpcode(), ExtType, 6498 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6499 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6500 } 6501 6502 if (RedCost.isValid() && 6503 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6504 return I == RetI ? RedCost : 0; 6505 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6506 // Matched reduce.add(mul()) 6507 InstructionCost MulCost = 6508 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6509 6510 InstructionCost RedCost = TTI.getMulAccReductionCost( 6511 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 6512 6513 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6514 return I == RetI ? RedCost : 0; 6515 } 6516 } 6517 6518 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 6519 } 6520 6521 InstructionCost 6522 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6523 ElementCount VF) { 6524 // Calculate scalar cost only. Vectorization cost should be ready at this 6525 // moment. 6526 if (VF.isScalar()) { 6527 Type *ValTy = getLoadStoreType(I); 6528 const Align Alignment = getLoadStoreAlignment(I); 6529 unsigned AS = getLoadStoreAddressSpace(I); 6530 6531 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6532 return TTI.getAddressComputationCost(ValTy) + 6533 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6534 TTI::TCK_RecipThroughput, OpInfo, I); 6535 } 6536 return getWideningCost(I, VF); 6537 } 6538 6539 LoopVectorizationCostModel::VectorizationCostTy 6540 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6541 ElementCount VF) { 6542 // If we know that this instruction will remain uniform, check the cost of 6543 // the scalar version. 6544 if (isUniformAfterVectorization(I, VF)) 6545 VF = ElementCount::getFixed(1); 6546 6547 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6548 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6549 6550 // Forced scalars do not have any scalarization overhead. 6551 auto ForcedScalar = ForcedScalars.find(VF); 6552 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6553 auto InstSet = ForcedScalar->second; 6554 if (InstSet.count(I)) 6555 return VectorizationCostTy( 6556 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6557 VF.getKnownMinValue()), 6558 false); 6559 } 6560 6561 Type *VectorTy; 6562 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6563 6564 bool TypeNotScalarized = false; 6565 if (VF.isVector() && VectorTy->isVectorTy()) { 6566 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6567 if (VF.isScalable()) 6568 // <vscale x 1 x iN> is assumed to be profitable over iN because 6569 // scalable registers are a distinct register class from scalar ones. 6570 // If we ever find a target which wants to lower scalable vectors 6571 // back to scalars, we'll need to update this code to explicitly 6572 // ask TTI about the register class uses for each part. 6573 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6574 else 6575 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6576 } else 6577 C = InstructionCost::getInvalid(); 6578 } 6579 return VectorizationCostTy(C, TypeNotScalarized); 6580 } 6581 6582 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 6583 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 6584 6585 // There is no mechanism yet to create a scalable scalarization loop, 6586 // so this is currently Invalid. 6587 if (VF.isScalable()) 6588 return InstructionCost::getInvalid(); 6589 6590 if (VF.isScalar()) 6591 return 0; 6592 6593 InstructionCost Cost = 0; 6594 Type *RetTy = ToVectorTy(I->getType(), VF); 6595 if (!RetTy->isVoidTy() && 6596 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6597 Cost += TTI.getScalarizationOverhead( 6598 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), 6599 /*Insert*/ true, 6600 /*Extract*/ false, CostKind); 6601 6602 // Some targets keep addresses scalar. 6603 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6604 return Cost; 6605 6606 // Some targets support efficient element stores. 6607 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6608 return Cost; 6609 6610 // Collect operands to consider. 6611 CallInst *CI = dyn_cast<CallInst>(I); 6612 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6613 6614 // Skip operands that do not require extraction/scalarization and do not incur 6615 // any overhead. 6616 SmallVector<Type *> Tys; 6617 for (auto *V : filterExtractingOperands(Ops, VF)) 6618 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6619 return Cost + TTI.getOperandsScalarizationOverhead( 6620 filterExtractingOperands(Ops, VF), Tys, CostKind); 6621 } 6622 6623 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6624 if (VF.isScalar()) 6625 return; 6626 NumPredStores = 0; 6627 for (BasicBlock *BB : TheLoop->blocks()) { 6628 // For each instruction in the old loop. 6629 for (Instruction &I : *BB) { 6630 Value *Ptr = getLoadStorePointerOperand(&I); 6631 if (!Ptr) 6632 continue; 6633 6634 // TODO: We should generate better code and update the cost model for 6635 // predicated uniform stores. Today they are treated as any other 6636 // predicated store (see added test cases in 6637 // invariant-store-vectorization.ll). 6638 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6639 NumPredStores++; 6640 6641 if (Legal->isUniformMemOp(I, VF)) { 6642 auto isLegalToScalarize = [&]() { 6643 if (!VF.isScalable()) 6644 // Scalarization of fixed length vectors "just works". 6645 return true; 6646 6647 // We have dedicated lowering for unpredicated uniform loads and 6648 // stores. Note that even with tail folding we know that at least 6649 // one lane is active (i.e. generalized predication is not possible 6650 // here), and the logic below depends on this fact. 6651 if (!foldTailByMasking()) 6652 return true; 6653 6654 // For scalable vectors, a uniform memop load is always 6655 // uniform-by-parts and we know how to scalarize that. 6656 if (isa<LoadInst>(I)) 6657 return true; 6658 6659 // A uniform store isn't neccessarily uniform-by-part 6660 // and we can't assume scalarization. 6661 auto &SI = cast<StoreInst>(I); 6662 return TheLoop->isLoopInvariant(SI.getValueOperand()); 6663 }; 6664 6665 const InstructionCost GatherScatterCost = 6666 isLegalGatherOrScatter(&I, VF) ? 6667 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); 6668 6669 // Load: Scalar load + broadcast 6670 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6671 // FIXME: This cost is a significant under-estimate for tail folded 6672 // memory ops. 6673 const InstructionCost ScalarizationCost = isLegalToScalarize() ? 6674 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid(); 6675 6676 // Choose better solution for the current VF, Note that Invalid 6677 // costs compare as maximumal large. If both are invalid, we get 6678 // scalable invalid which signals a failure and a vectorization abort. 6679 if (GatherScatterCost < ScalarizationCost) 6680 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 6681 else 6682 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 6683 continue; 6684 } 6685 6686 // We assume that widening is the best solution when possible. 6687 if (memoryInstructionCanBeWidened(&I, VF)) { 6688 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6689 int ConsecutiveStride = Legal->isConsecutivePtr( 6690 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6691 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6692 "Expected consecutive stride."); 6693 InstWidening Decision = 6694 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6695 setWideningDecision(&I, VF, Decision, Cost); 6696 continue; 6697 } 6698 6699 // Choose between Interleaving, Gather/Scatter or Scalarization. 6700 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6701 unsigned NumAccesses = 1; 6702 if (isAccessInterleaved(&I)) { 6703 auto Group = getInterleavedAccessGroup(&I); 6704 assert(Group && "Fail to get an interleaved access group."); 6705 6706 // Make one decision for the whole group. 6707 if (getWideningDecision(&I, VF) != CM_Unknown) 6708 continue; 6709 6710 NumAccesses = Group->getNumMembers(); 6711 if (interleavedAccessCanBeWidened(&I, VF)) 6712 InterleaveCost = getInterleaveGroupCost(&I, VF); 6713 } 6714 6715 InstructionCost GatherScatterCost = 6716 isLegalGatherOrScatter(&I, VF) 6717 ? getGatherScatterCost(&I, VF) * NumAccesses 6718 : InstructionCost::getInvalid(); 6719 6720 InstructionCost ScalarizationCost = 6721 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6722 6723 // Choose better solution for the current VF, 6724 // write down this decision and use it during vectorization. 6725 InstructionCost Cost; 6726 InstWidening Decision; 6727 if (InterleaveCost <= GatherScatterCost && 6728 InterleaveCost < ScalarizationCost) { 6729 Decision = CM_Interleave; 6730 Cost = InterleaveCost; 6731 } else if (GatherScatterCost < ScalarizationCost) { 6732 Decision = CM_GatherScatter; 6733 Cost = GatherScatterCost; 6734 } else { 6735 Decision = CM_Scalarize; 6736 Cost = ScalarizationCost; 6737 } 6738 // If the instructions belongs to an interleave group, the whole group 6739 // receives the same decision. The whole group receives the cost, but 6740 // the cost will actually be assigned to one instruction. 6741 if (auto Group = getInterleavedAccessGroup(&I)) 6742 setWideningDecision(Group, VF, Decision, Cost); 6743 else 6744 setWideningDecision(&I, VF, Decision, Cost); 6745 } 6746 } 6747 6748 // Make sure that any load of address and any other address computation 6749 // remains scalar unless there is gather/scatter support. This avoids 6750 // inevitable extracts into address registers, and also has the benefit of 6751 // activating LSR more, since that pass can't optimize vectorized 6752 // addresses. 6753 if (TTI.prefersVectorizedAddressing()) 6754 return; 6755 6756 // Start with all scalar pointer uses. 6757 SmallPtrSet<Instruction *, 8> AddrDefs; 6758 for (BasicBlock *BB : TheLoop->blocks()) 6759 for (Instruction &I : *BB) { 6760 Instruction *PtrDef = 6761 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6762 if (PtrDef && TheLoop->contains(PtrDef) && 6763 getWideningDecision(&I, VF) != CM_GatherScatter) 6764 AddrDefs.insert(PtrDef); 6765 } 6766 6767 // Add all instructions used to generate the addresses. 6768 SmallVector<Instruction *, 4> Worklist; 6769 append_range(Worklist, AddrDefs); 6770 while (!Worklist.empty()) { 6771 Instruction *I = Worklist.pop_back_val(); 6772 for (auto &Op : I->operands()) 6773 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6774 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6775 AddrDefs.insert(InstOp).second) 6776 Worklist.push_back(InstOp); 6777 } 6778 6779 for (auto *I : AddrDefs) { 6780 if (isa<LoadInst>(I)) { 6781 // Setting the desired widening decision should ideally be handled in 6782 // by cost functions, but since this involves the task of finding out 6783 // if the loaded register is involved in an address computation, it is 6784 // instead changed here when we know this is the case. 6785 InstWidening Decision = getWideningDecision(I, VF); 6786 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6787 // Scalarize a widened load of address. 6788 setWideningDecision( 6789 I, VF, CM_Scalarize, 6790 (VF.getKnownMinValue() * 6791 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6792 else if (auto Group = getInterleavedAccessGroup(I)) { 6793 // Scalarize an interleave group of address loads. 6794 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6795 if (Instruction *Member = Group->getMember(I)) 6796 setWideningDecision( 6797 Member, VF, CM_Scalarize, 6798 (VF.getKnownMinValue() * 6799 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6800 } 6801 } 6802 } else 6803 // Make sure I gets scalarized and a cost estimate without 6804 // scalarization overhead. 6805 ForcedScalars[VF].insert(I); 6806 } 6807 } 6808 6809 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { 6810 assert(!VF.isScalar() && 6811 "Trying to set a vectorization decision for a scalar VF"); 6812 6813 for (BasicBlock *BB : TheLoop->blocks()) { 6814 // For each instruction in the old loop. 6815 for (Instruction &I : *BB) { 6816 CallInst *CI = dyn_cast<CallInst>(&I); 6817 6818 if (!CI) 6819 continue; 6820 6821 InstructionCost ScalarCost = InstructionCost::getInvalid(); 6822 InstructionCost VectorCost = InstructionCost::getInvalid(); 6823 InstructionCost IntrinsicCost = InstructionCost::getInvalid(); 6824 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6825 6826 Function *ScalarFunc = CI->getCalledFunction(); 6827 Type *ScalarRetTy = CI->getType(); 6828 SmallVector<Type *, 4> Tys, ScalarTys; 6829 bool MaskRequired = Legal->isMaskRequired(CI); 6830 for (auto &ArgOp : CI->args()) 6831 ScalarTys.push_back(ArgOp->getType()); 6832 6833 // Compute corresponding vector type for return value and arguments. 6834 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 6835 for (Type *ScalarTy : ScalarTys) 6836 Tys.push_back(ToVectorTy(ScalarTy, VF)); 6837 6838 // An in-loop reduction using an fmuladd intrinsic is a special case; 6839 // we don't want the normal cost for that intrinsic. 6840 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 6841 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) { 6842 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, 6843 getVectorIntrinsicIDForCall(CI, TLI), 6844 std::nullopt, *RedCost); 6845 continue; 6846 } 6847 6848 // Estimate cost of scalarized vector call. The source operands are 6849 // assumed to be vectors, so we need to extract individual elements from 6850 // there, execute VF scalar calls, and then gather the result into the 6851 // vector return value. 6852 InstructionCost ScalarCallCost = 6853 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); 6854 6855 // Compute costs of unpacking argument values for the scalar calls and 6856 // packing the return values to a vector. 6857 InstructionCost ScalarizationCost = 6858 getScalarizationOverhead(CI, VF, CostKind); 6859 6860 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 6861 6862 // Find the cost of vectorizing the call, if we can find a suitable 6863 // vector variant of the function. 6864 bool UsesMask = false; 6865 VFInfo FuncInfo; 6866 Function *VecFunc = nullptr; 6867 // Search through any available variants for one we can use at this VF. 6868 for (VFInfo &Info : VFDatabase::getMappings(*CI)) { 6869 // Must match requested VF. 6870 if (Info.Shape.VF != VF) 6871 continue; 6872 6873 // Must take a mask argument if one is required 6874 if (MaskRequired && !Info.isMasked()) 6875 continue; 6876 6877 // Check that all parameter kinds are supported 6878 bool ParamsOk = true; 6879 for (VFParameter Param : Info.Shape.Parameters) { 6880 switch (Param.ParamKind) { 6881 case VFParamKind::Vector: 6882 break; 6883 case VFParamKind::OMP_Uniform: { 6884 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6885 // Make sure the scalar parameter in the loop is invariant. 6886 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), 6887 TheLoop)) 6888 ParamsOk = false; 6889 break; 6890 } 6891 case VFParamKind::OMP_Linear: { 6892 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6893 // Find the stride for the scalar parameter in this loop and see if 6894 // it matches the stride for the variant. 6895 // TODO: do we need to figure out the cost of an extract to get the 6896 // first lane? Or do we hope that it will be folded away? 6897 ScalarEvolution *SE = PSE.getSE(); 6898 const auto *SAR = 6899 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); 6900 6901 if (!SAR || SAR->getLoop() != TheLoop) { 6902 ParamsOk = false; 6903 break; 6904 } 6905 6906 const SCEVConstant *Step = 6907 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); 6908 6909 if (!Step || 6910 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) 6911 ParamsOk = false; 6912 6913 break; 6914 } 6915 case VFParamKind::GlobalPredicate: 6916 UsesMask = true; 6917 break; 6918 default: 6919 ParamsOk = false; 6920 break; 6921 } 6922 } 6923 6924 if (!ParamsOk) 6925 continue; 6926 6927 // Found a suitable candidate, stop here. 6928 VecFunc = CI->getModule()->getFunction(Info.VectorName); 6929 FuncInfo = Info; 6930 break; 6931 } 6932 6933 // Add in the cost of synthesizing a mask if one wasn't required. 6934 InstructionCost MaskCost = 0; 6935 if (VecFunc && UsesMask && !MaskRequired) 6936 MaskCost = TTI.getShuffleCost( 6937 TargetTransformInfo::SK_Broadcast, 6938 VectorType::get(IntegerType::getInt1Ty( 6939 VecFunc->getFunctionType()->getContext()), 6940 VF)); 6941 6942 if (TLI && VecFunc && !CI->isNoBuiltin()) 6943 VectorCost = 6944 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 6945 6946 // Find the cost of an intrinsic; some targets may have instructions that 6947 // perform the operation without needing an actual call. 6948 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); 6949 if (IID != Intrinsic::not_intrinsic) 6950 IntrinsicCost = getVectorIntrinsicCost(CI, VF); 6951 6952 InstructionCost Cost = ScalarCost; 6953 InstWidening Decision = CM_Scalarize; 6954 6955 if (VectorCost <= Cost) { 6956 Cost = VectorCost; 6957 Decision = CM_VectorCall; 6958 } 6959 6960 if (IntrinsicCost <= Cost) { 6961 Cost = IntrinsicCost; 6962 Decision = CM_IntrinsicCall; 6963 } 6964 6965 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, 6966 FuncInfo.getParamIndexForOptionalMask(), Cost); 6967 } 6968 } 6969 } 6970 6971 InstructionCost 6972 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 6973 Type *&VectorTy) { 6974 Type *RetTy = I->getType(); 6975 if (canTruncateToMinimalBitwidth(I, VF)) 6976 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6977 auto SE = PSE.getSE(); 6978 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6979 6980 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 6981 ElementCount VF) -> bool { 6982 if (VF.isScalar()) 6983 return true; 6984 6985 auto Scalarized = InstsToScalarize.find(VF); 6986 assert(Scalarized != InstsToScalarize.end() && 6987 "VF not yet analyzed for scalarization profitability"); 6988 return !Scalarized->second.count(I) && 6989 llvm::all_of(I->users(), [&](User *U) { 6990 auto *UI = cast<Instruction>(U); 6991 return !Scalarized->second.count(UI); 6992 }); 6993 }; 6994 (void) hasSingleCopyAfterVectorization; 6995 6996 if (isScalarAfterVectorization(I, VF)) { 6997 // With the exception of GEPs and PHIs, after scalarization there should 6998 // only be one copy of the instruction generated in the loop. This is 6999 // because the VF is either 1, or any instructions that need scalarizing 7000 // have already been dealt with by the time we get here. As a result, 7001 // it means we don't have to multiply the instruction cost by VF. 7002 assert(I->getOpcode() == Instruction::GetElementPtr || 7003 I->getOpcode() == Instruction::PHI || 7004 (I->getOpcode() == Instruction::BitCast && 7005 I->getType()->isPointerTy()) || 7006 hasSingleCopyAfterVectorization(I, VF)); 7007 VectorTy = RetTy; 7008 } else 7009 VectorTy = ToVectorTy(RetTy, VF); 7010 7011 // TODO: We need to estimate the cost of intrinsic calls. 7012 switch (I->getOpcode()) { 7013 case Instruction::GetElementPtr: 7014 // We mark this instruction as zero-cost because the cost of GEPs in 7015 // vectorized code depends on whether the corresponding memory instruction 7016 // is scalarized or not. Therefore, we handle GEPs with the memory 7017 // instruction cost. 7018 return 0; 7019 case Instruction::Br: { 7020 // In cases of scalarized and predicated instructions, there will be VF 7021 // predicated blocks in the vectorized loop. Each branch around these 7022 // blocks requires also an extract of its vector compare i1 element. 7023 bool ScalarPredicatedBB = false; 7024 BranchInst *BI = cast<BranchInst>(I); 7025 if (VF.isVector() && BI->isConditional() && 7026 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 7027 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) 7028 ScalarPredicatedBB = true; 7029 7030 if (ScalarPredicatedBB) { 7031 // Not possible to scalarize scalable vector with predicated instructions. 7032 if (VF.isScalable()) 7033 return InstructionCost::getInvalid(); 7034 // Return cost for branches around scalarized and predicated blocks. 7035 auto *Vec_i1Ty = 7036 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7037 return ( 7038 TTI.getScalarizationOverhead( 7039 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), 7040 /*Insert*/ false, /*Extract*/ true, CostKind) + 7041 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7042 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7043 // The back-edge branch will remain, as will all scalar branches. 7044 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7045 else 7046 // This branch will be eliminated by if-conversion. 7047 return 0; 7048 // Note: We currently assume zero cost for an unconditional branch inside 7049 // a predicated block since it will become a fall-through, although we 7050 // may decide in the future to call TTI for all branches. 7051 } 7052 case Instruction::PHI: { 7053 auto *Phi = cast<PHINode>(I); 7054 7055 // First-order recurrences are replaced by vector shuffles inside the loop. 7056 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 7057 SmallVector<int> Mask(VF.getKnownMinValue()); 7058 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 7059 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 7060 cast<VectorType>(VectorTy), Mask, CostKind, 7061 VF.getKnownMinValue() - 1); 7062 } 7063 7064 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7065 // converted into select instructions. We require N - 1 selects per phi 7066 // node, where N is the number of incoming values. 7067 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7068 return (Phi->getNumIncomingValues() - 1) * 7069 TTI.getCmpSelInstrCost( 7070 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7071 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7072 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7073 7074 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7075 } 7076 case Instruction::UDiv: 7077 case Instruction::SDiv: 7078 case Instruction::URem: 7079 case Instruction::SRem: 7080 if (VF.isVector() && isPredicatedInst(I)) { 7081 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 7082 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 7083 ScalarCost : SafeDivisorCost; 7084 } 7085 // We've proven all lanes safe to speculate, fall through. 7086 [[fallthrough]]; 7087 case Instruction::Add: 7088 case Instruction::FAdd: 7089 case Instruction::Sub: 7090 case Instruction::FSub: 7091 case Instruction::Mul: 7092 case Instruction::FMul: 7093 case Instruction::FDiv: 7094 case Instruction::FRem: 7095 case Instruction::Shl: 7096 case Instruction::LShr: 7097 case Instruction::AShr: 7098 case Instruction::And: 7099 case Instruction::Or: 7100 case Instruction::Xor: { 7101 // If we're speculating on the stride being 1, the multiplication may 7102 // fold away. We can generalize this for all operations using the notion 7103 // of neutral elements. (TODO) 7104 if (I->getOpcode() == Instruction::Mul && 7105 (PSE.getSCEV(I->getOperand(0))->isOne() || 7106 PSE.getSCEV(I->getOperand(1))->isOne())) 7107 return 0; 7108 7109 // Detect reduction patterns 7110 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7111 return *RedCost; 7112 7113 // Certain instructions can be cheaper to vectorize if they have a constant 7114 // second vector operand. One example of this are shifts on x86. 7115 Value *Op2 = I->getOperand(1); 7116 auto Op2Info = TTI.getOperandInfo(Op2); 7117 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 7118 Legal->isInvariant(Op2)) 7119 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 7120 7121 SmallVector<const Value *, 4> Operands(I->operand_values()); 7122 return TTI.getArithmeticInstrCost( 7123 I->getOpcode(), VectorTy, CostKind, 7124 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7125 Op2Info, Operands, I); 7126 } 7127 case Instruction::FNeg: { 7128 return TTI.getArithmeticInstrCost( 7129 I->getOpcode(), VectorTy, CostKind, 7130 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7131 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7132 I->getOperand(0), I); 7133 } 7134 case Instruction::Select: { 7135 SelectInst *SI = cast<SelectInst>(I); 7136 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7137 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7138 7139 const Value *Op0, *Op1; 7140 using namespace llvm::PatternMatch; 7141 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7142 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7143 // select x, y, false --> x & y 7144 // select x, true, y --> x | y 7145 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 7146 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 7147 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7148 Op1->getType()->getScalarSizeInBits() == 1); 7149 7150 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7151 return TTI.getArithmeticInstrCost( 7152 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy, 7153 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 7154 } 7155 7156 Type *CondTy = SI->getCondition()->getType(); 7157 if (!ScalarCond) 7158 CondTy = VectorType::get(CondTy, VF); 7159 7160 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7161 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7162 Pred = Cmp->getPredicate(); 7163 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7164 CostKind, I); 7165 } 7166 case Instruction::ICmp: 7167 case Instruction::FCmp: { 7168 Type *ValTy = I->getOperand(0)->getType(); 7169 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7170 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7171 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7172 VectorTy = ToVectorTy(ValTy, VF); 7173 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7174 cast<CmpInst>(I)->getPredicate(), CostKind, 7175 I); 7176 } 7177 case Instruction::Store: 7178 case Instruction::Load: { 7179 ElementCount Width = VF; 7180 if (Width.isVector()) { 7181 InstWidening Decision = getWideningDecision(I, Width); 7182 assert(Decision != CM_Unknown && 7183 "CM decision should be taken at this point"); 7184 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 7185 return InstructionCost::getInvalid(); 7186 if (Decision == CM_Scalarize) 7187 Width = ElementCount::getFixed(1); 7188 } 7189 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7190 return getMemoryInstructionCost(I, VF); 7191 } 7192 case Instruction::BitCast: 7193 if (I->getType()->isPointerTy()) 7194 return 0; 7195 [[fallthrough]]; 7196 case Instruction::ZExt: 7197 case Instruction::SExt: 7198 case Instruction::FPToUI: 7199 case Instruction::FPToSI: 7200 case Instruction::FPExt: 7201 case Instruction::PtrToInt: 7202 case Instruction::IntToPtr: 7203 case Instruction::SIToFP: 7204 case Instruction::UIToFP: 7205 case Instruction::Trunc: 7206 case Instruction::FPTrunc: { 7207 // Computes the CastContextHint from a Load/Store instruction. 7208 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7209 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7210 "Expected a load or a store!"); 7211 7212 if (VF.isScalar() || !TheLoop->contains(I)) 7213 return TTI::CastContextHint::Normal; 7214 7215 switch (getWideningDecision(I, VF)) { 7216 case LoopVectorizationCostModel::CM_GatherScatter: 7217 return TTI::CastContextHint::GatherScatter; 7218 case LoopVectorizationCostModel::CM_Interleave: 7219 return TTI::CastContextHint::Interleave; 7220 case LoopVectorizationCostModel::CM_Scalarize: 7221 case LoopVectorizationCostModel::CM_Widen: 7222 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7223 : TTI::CastContextHint::Normal; 7224 case LoopVectorizationCostModel::CM_Widen_Reverse: 7225 return TTI::CastContextHint::Reversed; 7226 case LoopVectorizationCostModel::CM_Unknown: 7227 llvm_unreachable("Instr did not go through cost modelling?"); 7228 case LoopVectorizationCostModel::CM_VectorCall: 7229 case LoopVectorizationCostModel::CM_IntrinsicCall: 7230 llvm_unreachable_internal("Instr has invalid widening decision"); 7231 } 7232 7233 llvm_unreachable("Unhandled case!"); 7234 }; 7235 7236 unsigned Opcode = I->getOpcode(); 7237 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7238 // For Trunc, the context is the only user, which must be a StoreInst. 7239 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7240 if (I->hasOneUse()) 7241 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7242 CCH = ComputeCCH(Store); 7243 } 7244 // For Z/Sext, the context is the operand, which must be a LoadInst. 7245 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7246 Opcode == Instruction::FPExt) { 7247 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7248 CCH = ComputeCCH(Load); 7249 } 7250 7251 // We optimize the truncation of induction variables having constant 7252 // integer steps. The cost of these truncations is the same as the scalar 7253 // operation. 7254 if (isOptimizableIVTruncate(I, VF)) { 7255 auto *Trunc = cast<TruncInst>(I); 7256 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7257 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7258 } 7259 7260 // Detect reduction patterns 7261 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7262 return *RedCost; 7263 7264 Type *SrcScalarTy = I->getOperand(0)->getType(); 7265 Type *SrcVecTy = 7266 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7267 if (canTruncateToMinimalBitwidth(I, VF)) { 7268 // This cast is going to be shrunk. This may remove the cast or it might 7269 // turn it into slightly different cast. For example, if MinBW == 16, 7270 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7271 // 7272 // Calculate the modified src and dest types. 7273 Type *MinVecTy = VectorTy; 7274 if (Opcode == Instruction::Trunc) { 7275 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7276 VectorTy = 7277 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7278 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7279 // Leave SrcVecTy unchanged - we only shrink the destination element 7280 // type. 7281 VectorTy = 7282 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7283 } 7284 } 7285 7286 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7287 } 7288 case Instruction::Call: 7289 return getVectorCallCost(cast<CallInst>(I), VF); 7290 case Instruction::ExtractValue: 7291 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7292 case Instruction::Alloca: 7293 // We cannot easily widen alloca to a scalable alloca, as 7294 // the result would need to be a vector of pointers. 7295 if (VF.isScalable()) 7296 return InstructionCost::getInvalid(); 7297 [[fallthrough]]; 7298 default: 7299 // This opcode is unknown. Assume that it is the same as 'mul'. 7300 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7301 } // end of switch. 7302 } 7303 7304 void LoopVectorizationCostModel::collectValuesToIgnore() { 7305 // Ignore ephemeral values. 7306 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7307 7308 // Find all stores to invariant variables. Since they are going to sink 7309 // outside the loop we do not need calculate cost for them. 7310 for (BasicBlock *BB : TheLoop->blocks()) 7311 for (Instruction &I : *BB) { 7312 StoreInst *SI; 7313 if ((SI = dyn_cast<StoreInst>(&I)) && 7314 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7315 ValuesToIgnore.insert(&I); 7316 } 7317 7318 // Ignore type-promoting instructions we identified during reduction 7319 // detection. 7320 for (const auto &Reduction : Legal->getReductionVars()) { 7321 const RecurrenceDescriptor &RedDes = Reduction.second; 7322 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7323 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7324 } 7325 // Ignore type-casting instructions we identified during induction 7326 // detection. 7327 for (const auto &Induction : Legal->getInductionVars()) { 7328 const InductionDescriptor &IndDes = Induction.second; 7329 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7330 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7331 } 7332 } 7333 7334 void LoopVectorizationCostModel::collectInLoopReductions() { 7335 for (const auto &Reduction : Legal->getReductionVars()) { 7336 PHINode *Phi = Reduction.first; 7337 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7338 7339 // We don't collect reductions that are type promoted (yet). 7340 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7341 continue; 7342 7343 // If the target would prefer this reduction to happen "in-loop", then we 7344 // want to record it as such. 7345 unsigned Opcode = RdxDesc.getOpcode(); 7346 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7347 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7348 TargetTransformInfo::ReductionFlags())) 7349 continue; 7350 7351 // Check that we can correctly put the reductions into the loop, by 7352 // finding the chain of operations that leads from the phi to the loop 7353 // exit value. 7354 SmallVector<Instruction *, 4> ReductionOperations = 7355 RdxDesc.getReductionOpChain(Phi, TheLoop); 7356 bool InLoop = !ReductionOperations.empty(); 7357 7358 if (InLoop) { 7359 InLoopReductions.insert(Phi); 7360 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7361 Instruction *LastChain = Phi; 7362 for (auto *I : ReductionOperations) { 7363 InLoopReductionImmediateChains[I] = LastChain; 7364 LastChain = I; 7365 } 7366 } 7367 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7368 << " reduction for phi: " << *Phi << "\n"); 7369 } 7370 } 7371 7372 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, 7373 DebugLoc DL, const Twine &Name) { 7374 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && 7375 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); 7376 return tryInsertInstruction( 7377 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name)); 7378 } 7379 7380 // This function will select a scalable VF if the target supports scalable 7381 // vectors and a fixed one otherwise. 7382 // TODO: we could return a pair of values that specify the max VF and 7383 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7384 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7385 // doesn't have a cost model that can choose which plan to execute if 7386 // more than one is generated. 7387 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, 7388 LoopVectorizationCostModel &CM) { 7389 unsigned WidestType; 7390 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7391 7392 TargetTransformInfo::RegisterKind RegKind = 7393 TTI.enableScalableVectorization() 7394 ? TargetTransformInfo::RGK_ScalableVector 7395 : TargetTransformInfo::RGK_FixedWidthVector; 7396 7397 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); 7398 unsigned N = RegSize.getKnownMinValue() / WidestType; 7399 return ElementCount::get(N, RegSize.isScalable()); 7400 } 7401 7402 VectorizationFactor 7403 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7404 ElementCount VF = UserVF; 7405 // Outer loop handling: They may require CFG and instruction level 7406 // transformations before even evaluating whether vectorization is profitable. 7407 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7408 // the vectorization pipeline. 7409 if (!OrigLoop->isInnermost()) { 7410 // If the user doesn't provide a vectorization factor, determine a 7411 // reasonable one. 7412 if (UserVF.isZero()) { 7413 VF = determineVPlanVF(TTI, CM); 7414 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7415 7416 // Make sure we have a VF > 1 for stress testing. 7417 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7418 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7419 << "overriding computed VF.\n"); 7420 VF = ElementCount::getFixed(4); 7421 } 7422 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && 7423 !ForceTargetSupportsScalableVectors) { 7424 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " 7425 << "not supported by the target.\n"); 7426 reportVectorizationFailure( 7427 "Scalable vectorization requested but not supported by the target", 7428 "the scalable user-specified vectorization width for outer-loop " 7429 "vectorization cannot be used because the target does not support " 7430 "scalable vectors.", 7431 "ScalableVFUnfeasible", ORE, OrigLoop); 7432 return VectorizationFactor::Disabled(); 7433 } 7434 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7435 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7436 "VF needs to be a power of two"); 7437 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7438 << "VF " << VF << " to build VPlans.\n"); 7439 buildVPlans(VF, VF); 7440 7441 // For VPlan build stress testing, we bail out after VPlan construction. 7442 if (VPlanBuildStressTest) 7443 return VectorizationFactor::Disabled(); 7444 7445 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7446 } 7447 7448 LLVM_DEBUG( 7449 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7450 "VPlan-native path.\n"); 7451 return VectorizationFactor::Disabled(); 7452 } 7453 7454 std::optional<VectorizationFactor> 7455 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7456 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7457 CM.collectValuesToIgnore(); 7458 CM.collectElementTypesForWidening(); 7459 7460 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7461 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7462 return std::nullopt; 7463 7464 // Invalidate interleave groups if all blocks of loop will be predicated. 7465 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7466 !useMaskedInterleavedAccesses(TTI)) { 7467 LLVM_DEBUG( 7468 dbgs() 7469 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7470 "which requires masked-interleaved support.\n"); 7471 if (CM.InterleaveInfo.invalidateGroups()) 7472 // Invalidating interleave groups also requires invalidating all decisions 7473 // based on them, which includes widening decisions and uniform and scalar 7474 // values. 7475 CM.invalidateCostModelingDecisions(); 7476 } 7477 7478 ElementCount MaxUserVF = 7479 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7480 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7481 if (!UserVF.isZero() && UserVFIsLegal) { 7482 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7483 "VF needs to be a power of two"); 7484 // Collect the instructions (and their associated costs) that will be more 7485 // profitable to scalarize. 7486 CM.collectInLoopReductions(); 7487 if (CM.selectUserVectorizationFactor(UserVF)) { 7488 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7489 buildVPlansWithVPRecipes(UserVF, UserVF); 7490 if (!hasPlanWithVF(UserVF)) { 7491 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF 7492 << ".\n"); 7493 return std::nullopt; 7494 } 7495 7496 LLVM_DEBUG(printPlans(dbgs())); 7497 return {{UserVF, 0, 0}}; 7498 } else 7499 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7500 "InvalidCost", ORE, OrigLoop); 7501 } 7502 7503 // Populate the set of Vectorization Factor Candidates. 7504 ElementCountSet VFCandidates; 7505 for (auto VF = ElementCount::getFixed(1); 7506 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7507 VFCandidates.insert(VF); 7508 for (auto VF = ElementCount::getScalable(1); 7509 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7510 VFCandidates.insert(VF); 7511 7512 CM.collectInLoopReductions(); 7513 for (const auto &VF : VFCandidates) { 7514 // Collect Uniform and Scalar instructions after vectorization with VF. 7515 CM.collectUniformsAndScalars(VF); 7516 7517 // Collect the instructions (and their associated costs) that will be more 7518 // profitable to scalarize. 7519 if (VF.isVector()) 7520 CM.collectInstsToScalarize(VF); 7521 } 7522 7523 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7524 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7525 7526 LLVM_DEBUG(printPlans(dbgs())); 7527 if (!MaxFactors.hasVector()) 7528 return VectorizationFactor::Disabled(); 7529 7530 // Select the optimal vectorization factor. 7531 VectorizationFactor VF = selectVectorizationFactor(VFCandidates); 7532 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7533 if (!hasPlanWithVF(VF.Width)) { 7534 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width 7535 << ".\n"); 7536 return std::nullopt; 7537 } 7538 return VF; 7539 } 7540 7541 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7542 assert(count_if(VPlans, 7543 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7544 1 && 7545 "Best VF has not a single VPlan."); 7546 7547 for (const VPlanPtr &Plan : VPlans) { 7548 if (Plan->hasVF(VF)) 7549 return *Plan.get(); 7550 } 7551 llvm_unreachable("No plan found!"); 7552 } 7553 7554 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7555 SmallVector<Metadata *, 4> MDs; 7556 // Reserve first location for self reference to the LoopID metadata node. 7557 MDs.push_back(nullptr); 7558 bool IsUnrollMetadata = false; 7559 MDNode *LoopID = L->getLoopID(); 7560 if (LoopID) { 7561 // First find existing loop unrolling disable metadata. 7562 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7563 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7564 if (MD) { 7565 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7566 IsUnrollMetadata = 7567 S && S->getString().starts_with("llvm.loop.unroll.disable"); 7568 } 7569 MDs.push_back(LoopID->getOperand(i)); 7570 } 7571 } 7572 7573 if (!IsUnrollMetadata) { 7574 // Add runtime unroll disable metadata. 7575 LLVMContext &Context = L->getHeader()->getContext(); 7576 SmallVector<Metadata *, 1> DisableOperands; 7577 DisableOperands.push_back( 7578 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7579 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7580 MDs.push_back(DisableNode); 7581 MDNode *NewLoopID = MDNode::get(Context, MDs); 7582 // Set operand 0 to refer to the loop id itself. 7583 NewLoopID->replaceOperandWith(0, NewLoopID); 7584 L->setLoopID(NewLoopID); 7585 } 7586 } 7587 7588 SCEV2ValueTy LoopVectorizationPlanner::executePlan( 7589 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7590 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, 7591 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7592 assert(BestVPlan.hasVF(BestVF) && 7593 "Trying to execute plan with unsupported VF"); 7594 assert(BestVPlan.hasUF(BestUF) && 7595 "Trying to execute plan with unsupported UF"); 7596 assert( 7597 (IsEpilogueVectorization || !ExpandedSCEVs) && 7598 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7599 7600 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7601 << '\n'); 7602 7603 if (!IsEpilogueVectorization) 7604 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7605 7606 // Perform the actual loop transformation. 7607 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, 7608 OrigLoop->getHeader()->getContext()); 7609 7610 // 0. Generate SCEV-dependent code into the preheader, including TripCount, 7611 // before making any changes to the CFG. 7612 if (!BestVPlan.getPreheader()->empty()) { 7613 State.CFG.PrevBB = OrigLoop->getLoopPreheader(); 7614 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); 7615 BestVPlan.getPreheader()->execute(&State); 7616 } 7617 if (!ILV.getTripCount()) 7618 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); 7619 else 7620 assert(IsEpilogueVectorization && "should only re-use the existing trip " 7621 "count during epilogue vectorization"); 7622 7623 // 1. Set up the skeleton for vectorization, including vector pre-header and 7624 // middle block. The vector loop is created during VPlan execution. 7625 Value *CanonicalIVStartValue; 7626 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7627 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs 7628 : State.ExpandedSCEVs); 7629 7630 // Only use noalias metadata when using memory checks guaranteeing no overlap 7631 // across all iterations. 7632 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7633 std::unique_ptr<LoopVersioning> LVer = nullptr; 7634 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7635 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7636 7637 // We currently don't use LoopVersioning for the actual loop cloning but we 7638 // still use it to add the noalias metadata. 7639 // TODO: Find a better way to re-use LoopVersioning functionality to add 7640 // metadata. 7641 LVer = std::make_unique<LoopVersioning>( 7642 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7643 PSE.getSE()); 7644 State.LVer = &*LVer; 7645 State.LVer->prepareNoAliasMetadata(); 7646 } 7647 7648 ILV.collectPoisonGeneratingRecipes(State); 7649 7650 ILV.printDebugTracesAtStart(); 7651 7652 //===------------------------------------------------===// 7653 // 7654 // Notice: any optimization or new instruction that go 7655 // into the code below should also be implemented in 7656 // the cost-model. 7657 // 7658 //===------------------------------------------------===// 7659 7660 // 2. Copy and widen instructions from the old loop into the new loop. 7661 BestVPlan.prepareToExecute(ILV.getTripCount(), 7662 ILV.getOrCreateVectorTripCount(nullptr), 7663 CanonicalIVStartValue, State); 7664 7665 BestVPlan.execute(&State); 7666 7667 // Keep all loop hints from the original loop on the vector loop (we'll 7668 // replace the vectorizer-specific hints below). 7669 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7670 7671 std::optional<MDNode *> VectorizedLoopID = 7672 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7673 LLVMLoopVectorizeFollowupVectorized}); 7674 7675 VPBasicBlock *HeaderVPBB = 7676 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7677 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7678 if (VectorizedLoopID) 7679 L->setLoopID(*VectorizedLoopID); 7680 else { 7681 // Keep all loop hints from the original loop on the vector loop (we'll 7682 // replace the vectorizer-specific hints below). 7683 if (MDNode *LID = OrigLoop->getLoopID()) 7684 L->setLoopID(LID); 7685 7686 LoopVectorizeHints Hints(L, true, *ORE); 7687 Hints.setAlreadyVectorized(); 7688 } 7689 TargetTransformInfo::UnrollingPreferences UP; 7690 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7691 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) 7692 AddRuntimeUnrollDisableMetaData(L); 7693 7694 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7695 // predication, updating analyses. 7696 ILV.fixVectorizedLoop(State, BestVPlan); 7697 7698 ILV.printDebugTracesAtEnd(); 7699 7700 return State.ExpandedSCEVs; 7701 } 7702 7703 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7704 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7705 for (const auto &Plan : VPlans) 7706 if (PrintVPlansInDotFormat) 7707 Plan->printDOT(O); 7708 else 7709 Plan->print(O); 7710 } 7711 #endif 7712 7713 //===--------------------------------------------------------------------===// 7714 // EpilogueVectorizerMainLoop 7715 //===--------------------------------------------------------------------===// 7716 7717 /// This function is partially responsible for generating the control flow 7718 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7719 std::pair<BasicBlock *, Value *> 7720 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7721 const SCEV2ValueTy &ExpandedSCEVs) { 7722 createVectorLoopSkeleton(""); 7723 7724 // Generate the code to check the minimum iteration count of the vector 7725 // epilogue (see below). 7726 EPI.EpilogueIterationCountCheck = 7727 emitIterationCountCheck(LoopScalarPreHeader, true); 7728 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7729 7730 // Generate the code to check any assumptions that we've made for SCEV 7731 // expressions. 7732 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7733 7734 // Generate the code that checks at runtime if arrays overlap. We put the 7735 // checks into a separate block to make the more common case of few elements 7736 // faster. 7737 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7738 7739 // Generate the iteration count check for the main loop, *after* the check 7740 // for the epilogue loop, so that the path-length is shorter for the case 7741 // that goes directly through the vector epilogue. The longer-path length for 7742 // the main loop is compensated for, by the gain from vectorizing the larger 7743 // trip count. Note: the branch will get updated later on when we vectorize 7744 // the epilogue. 7745 EPI.MainLoopIterationCountCheck = 7746 emitIterationCountCheck(LoopScalarPreHeader, false); 7747 7748 // Generate the induction variable. 7749 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7750 7751 // Skip induction resume value creation here because they will be created in 7752 // the second pass for the scalar loop. The induction resume values for the 7753 // inductions in the epilogue loop are created before executing the plan for 7754 // the epilogue loop. 7755 7756 return {completeLoopSkeleton(), nullptr}; 7757 } 7758 7759 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7760 LLVM_DEBUG({ 7761 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7762 << "Main Loop VF:" << EPI.MainLoopVF 7763 << ", Main Loop UF:" << EPI.MainLoopUF 7764 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7765 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7766 }); 7767 } 7768 7769 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7770 DEBUG_WITH_TYPE(VerboseDebug, { 7771 dbgs() << "intermediate fn:\n" 7772 << *OrigLoop->getHeader()->getParent() << "\n"; 7773 }); 7774 } 7775 7776 BasicBlock * 7777 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7778 bool ForEpilogue) { 7779 assert(Bypass && "Expected valid bypass basic block."); 7780 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7781 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7782 Value *Count = getTripCount(); 7783 // Reuse existing vector loop preheader for TC checks. 7784 // Note that new preheader block is generated for vector loop. 7785 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7786 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7787 7788 // Generate code to check if the loop's trip count is less than VF * UF of the 7789 // main vector loop. 7790 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7791 : VF.isVector()) 7792 ? ICmpInst::ICMP_ULE 7793 : ICmpInst::ICMP_ULT; 7794 7795 Value *CheckMinIters = Builder.CreateICmp( 7796 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7797 "min.iters.check"); 7798 7799 if (!ForEpilogue) 7800 TCCheckBlock->setName("vector.main.loop.iter.check"); 7801 7802 // Create new preheader for vector loop. 7803 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7804 DT, LI, nullptr, "vector.ph"); 7805 7806 if (ForEpilogue) { 7807 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7808 DT->getNode(Bypass)->getIDom()) && 7809 "TC check is expected to dominate Bypass"); 7810 7811 // Update dominator for Bypass & LoopExit. 7812 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7813 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7814 // For loops with multiple exits, there's no edge from the middle block 7815 // to exit blocks (as the epilogue must run) and thus no need to update 7816 // the immediate dominator of the exit blocks. 7817 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7818 7819 LoopBypassBlocks.push_back(TCCheckBlock); 7820 7821 // Save the trip count so we don't have to regenerate it in the 7822 // vec.epilog.iter.check. This is safe to do because the trip count 7823 // generated here dominates the vector epilog iter check. 7824 EPI.TripCount = Count; 7825 } 7826 7827 BranchInst &BI = 7828 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7829 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 7830 setBranchWeights(BI, MinItersBypassWeights); 7831 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 7832 7833 return TCCheckBlock; 7834 } 7835 7836 //===--------------------------------------------------------------------===// 7837 // EpilogueVectorizerEpilogueLoop 7838 //===--------------------------------------------------------------------===// 7839 7840 /// This function is partially responsible for generating the control flow 7841 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7842 std::pair<BasicBlock *, Value *> 7843 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( 7844 const SCEV2ValueTy &ExpandedSCEVs) { 7845 createVectorLoopSkeleton("vec.epilog."); 7846 7847 // Now, compare the remaining count and if there aren't enough iterations to 7848 // execute the vectorized epilogue skip to the scalar part. 7849 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7850 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7851 LoopVectorPreHeader = 7852 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7853 LI, nullptr, "vec.epilog.ph"); 7854 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7855 VecEpilogueIterationCountCheck); 7856 7857 // Adjust the control flow taking the state info from the main loop 7858 // vectorization into account. 7859 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7860 "expected this to be saved from the previous pass."); 7861 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7862 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7863 7864 DT->changeImmediateDominator(LoopVectorPreHeader, 7865 EPI.MainLoopIterationCountCheck); 7866 7867 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7868 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7869 7870 if (EPI.SCEVSafetyCheck) 7871 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7872 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7873 if (EPI.MemSafetyCheck) 7874 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7875 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7876 7877 DT->changeImmediateDominator( 7878 VecEpilogueIterationCountCheck, 7879 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7880 7881 DT->changeImmediateDominator(LoopScalarPreHeader, 7882 EPI.EpilogueIterationCountCheck); 7883 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7884 // If there is an epilogue which must run, there's no edge from the 7885 // middle block to exit blocks and thus no need to update the immediate 7886 // dominator of the exit blocks. 7887 DT->changeImmediateDominator(LoopExitBlock, 7888 EPI.EpilogueIterationCountCheck); 7889 7890 // Keep track of bypass blocks, as they feed start values to the induction and 7891 // reduction phis in the scalar loop preheader. 7892 if (EPI.SCEVSafetyCheck) 7893 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7894 if (EPI.MemSafetyCheck) 7895 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7896 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7897 7898 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 7899 // reductions which merge control-flow from the latch block and the middle 7900 // block. Update the incoming values here and move the Phi into the preheader. 7901 SmallVector<PHINode *, 4> PhisInBlock; 7902 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7903 PhisInBlock.push_back(&Phi); 7904 7905 for (PHINode *Phi : PhisInBlock) { 7906 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7907 Phi->replaceIncomingBlockWith( 7908 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7909 VecEpilogueIterationCountCheck); 7910 7911 // If the phi doesn't have an incoming value from the 7912 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 7913 // value and also those from other check blocks. This is needed for 7914 // reduction phis only. 7915 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { 7916 return EPI.EpilogueIterationCountCheck == IncB; 7917 })) 7918 continue; 7919 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7920 if (EPI.SCEVSafetyCheck) 7921 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7922 if (EPI.MemSafetyCheck) 7923 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7924 } 7925 7926 // Generate a resume induction for the vector epilogue and put it in the 7927 // vector epilogue preheader 7928 Type *IdxTy = Legal->getWidestInductionType(); 7929 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val"); 7930 EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt()); 7931 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7932 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7933 EPI.MainLoopIterationCountCheck); 7934 7935 // Generate induction resume values. These variables save the new starting 7936 // indexes for the scalar loop. They are used to test if there are any tail 7937 // iterations left once the vector loop has completed. 7938 // Note that when the vectorized epilogue is skipped due to iteration count 7939 // check, then the resume value for the induction variable comes from 7940 // the trip count of the main vector loop, hence passing the AdditionalBypass 7941 // argument. 7942 createInductionResumeValues(ExpandedSCEVs, 7943 {VecEpilogueIterationCountCheck, 7944 EPI.VectorTripCount} /* AdditionalBypass */); 7945 7946 return {completeLoopSkeleton(), EPResumeVal}; 7947 } 7948 7949 BasicBlock * 7950 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7951 BasicBlock *Bypass, BasicBlock *Insert) { 7952 7953 assert(EPI.TripCount && 7954 "Expected trip count to have been safed in the first pass."); 7955 assert( 7956 (!isa<Instruction>(EPI.TripCount) || 7957 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7958 "saved trip count does not dominate insertion point."); 7959 Value *TC = EPI.TripCount; 7960 IRBuilder<> Builder(Insert->getTerminator()); 7961 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7962 7963 // Generate code to check if the loop's trip count is less than VF * UF of the 7964 // vector epilogue loop. 7965 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) 7966 ? ICmpInst::ICMP_ULE 7967 : ICmpInst::ICMP_ULT; 7968 7969 Value *CheckMinIters = 7970 Builder.CreateICmp(P, Count, 7971 createStepForVF(Builder, Count->getType(), 7972 EPI.EpilogueVF, EPI.EpilogueUF), 7973 "min.epilog.iters.check"); 7974 7975 BranchInst &BI = 7976 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7977 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 7978 unsigned MainLoopStep = UF * VF.getKnownMinValue(); 7979 unsigned EpilogueLoopStep = 7980 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); 7981 // We assume the remaining `Count` is equally distributed in 7982 // [0, MainLoopStep) 7983 // So the probability for `Count < EpilogueLoopStep` should be 7984 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep 7985 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); 7986 const uint32_t Weights[] = {EstimatedSkipCount, 7987 MainLoopStep - EstimatedSkipCount}; 7988 setBranchWeights(BI, Weights); 7989 } 7990 ReplaceInstWithInst(Insert->getTerminator(), &BI); 7991 7992 LoopBypassBlocks.push_back(Insert); 7993 return Insert; 7994 } 7995 7996 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7997 LLVM_DEBUG({ 7998 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7999 << "Epilogue Loop VF:" << EPI.EpilogueVF 8000 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8001 }); 8002 } 8003 8004 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8005 DEBUG_WITH_TYPE(VerboseDebug, { 8006 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8007 }); 8008 } 8009 8010 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8011 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8012 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8013 bool PredicateAtRangeStart = Predicate(Range.Start); 8014 8015 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) 8016 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8017 Range.End = TmpVF; 8018 break; 8019 } 8020 8021 return PredicateAtRangeStart; 8022 } 8023 8024 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8025 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8026 /// of VF's starting at a given VF and extending it as much as possible. Each 8027 /// vectorization decision can potentially shorten this sub-range during 8028 /// buildVPlan(). 8029 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8030 ElementCount MaxVF) { 8031 auto MaxVFTimes2 = MaxVF * 2; 8032 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8033 VFRange SubRange = {VF, MaxVFTimes2}; 8034 VPlans.push_back(buildVPlan(SubRange)); 8035 VF = SubRange.End; 8036 } 8037 } 8038 8039 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8040 VPlan &Plan) { 8041 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8042 8043 // Look for cached value. 8044 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8045 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8046 if (ECEntryIt != EdgeMaskCache.end()) 8047 return ECEntryIt->second; 8048 8049 VPValue *SrcMask = createBlockInMask(Src, Plan); 8050 8051 // The terminator has to be a branch inst! 8052 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8053 assert(BI && "Unexpected terminator found"); 8054 8055 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8056 return EdgeMaskCache[Edge] = SrcMask; 8057 8058 // If source is an exiting block, we know the exit edge is dynamically dead 8059 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8060 // adding uses of an otherwise potentially dead instruction. 8061 if (OrigLoop->isLoopExiting(Src)) 8062 return EdgeMaskCache[Edge] = SrcMask; 8063 8064 VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); 8065 assert(EdgeMask && "No Edge Mask found for condition"); 8066 8067 if (BI->getSuccessor(0) != Dst) 8068 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8069 8070 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8071 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8072 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8073 // The select version does not introduce new UB if SrcMask is false and 8074 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8075 VPValue *False = Plan.getVPValueOrAddLiveIn( 8076 ConstantInt::getFalse(BI->getCondition()->getType())); 8077 EdgeMask = 8078 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8079 } 8080 8081 return EdgeMaskCache[Edge] = EdgeMask; 8082 } 8083 8084 void VPRecipeBuilder::createHeaderMask(VPlan &Plan) { 8085 BasicBlock *Header = OrigLoop->getHeader(); 8086 8087 // When not folding the tail, use nullptr to model all-true mask. 8088 if (!CM.foldTailByMasking()) { 8089 BlockMaskCache[Header] = nullptr; 8090 return; 8091 } 8092 8093 // Introduce the early-exit compare IV <= BTC to form header block mask. 8094 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8095 // constructing the desired canonical IV in the header block as its first 8096 // non-phi instructions. 8097 8098 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); 8099 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8100 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); 8101 HeaderVPBB->insert(IV, NewInsertionPoint); 8102 8103 VPBuilder::InsertPointGuard Guard(Builder); 8104 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8105 VPValue *BlockMask = nullptr; 8106 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 8107 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); 8108 BlockMaskCache[Header] = BlockMask; 8109 } 8110 8111 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { 8112 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8113 8114 // Look for cached value. 8115 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8116 if (BCEntryIt != BlockMaskCache.end()) 8117 return BCEntryIt->second; 8118 8119 assert(OrigLoop->getHeader() != BB && 8120 "Loop header must have cached block mask"); 8121 8122 // All-one mask is modelled as no-mask following the convention for masked 8123 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8124 VPValue *BlockMask = nullptr; 8125 // This is the block mask. We OR all incoming edges. 8126 for (auto *Predecessor : predecessors(BB)) { 8127 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8128 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8129 return BlockMaskCache[BB] = EdgeMask; 8130 8131 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8132 BlockMask = EdgeMask; 8133 continue; 8134 } 8135 8136 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8137 } 8138 8139 return BlockMaskCache[BB] = BlockMask; 8140 } 8141 8142 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8143 ArrayRef<VPValue *> Operands, 8144 VFRange &Range, 8145 VPlanPtr &Plan) { 8146 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8147 "Must be called with either a load or store"); 8148 8149 auto willWiden = [&](ElementCount VF) -> bool { 8150 LoopVectorizationCostModel::InstWidening Decision = 8151 CM.getWideningDecision(I, VF); 8152 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8153 "CM decision should be taken at this point."); 8154 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8155 return true; 8156 if (CM.isScalarAfterVectorization(I, VF) || 8157 CM.isProfitableToScalarize(I, VF)) 8158 return false; 8159 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8160 }; 8161 8162 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8163 return nullptr; 8164 8165 VPValue *Mask = nullptr; 8166 if (Legal->isMaskRequired(I)) 8167 Mask = createBlockInMask(I->getParent(), *Plan); 8168 8169 // Determine if the pointer operand of the access is either consecutive or 8170 // reverse consecutive. 8171 LoopVectorizationCostModel::InstWidening Decision = 8172 CM.getWideningDecision(I, Range.Start); 8173 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8174 bool Consecutive = 8175 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8176 8177 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8178 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8179 Consecutive, Reverse); 8180 8181 StoreInst *Store = cast<StoreInst>(I); 8182 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8183 Mask, Consecutive, Reverse); 8184 } 8185 8186 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also 8187 /// insert a recipe to expand the step for the induction recipe. 8188 static VPWidenIntOrFpInductionRecipe * 8189 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, 8190 VPValue *Start, const InductionDescriptor &IndDesc, 8191 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, 8192 VFRange &Range) { 8193 assert(IndDesc.getStartValue() == 8194 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8195 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8196 "step must be loop invariant"); 8197 8198 VPValue *Step = 8199 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8200 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8201 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); 8202 } 8203 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8204 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); 8205 } 8206 8207 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8208 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8209 8210 // Check if this is an integer or fp induction. If so, build the recipe that 8211 // produces its scalar and vector values. 8212 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8213 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, 8214 *PSE.getSE(), *OrigLoop, Range); 8215 8216 // Check if this is pointer induction. If so, build the recipe for it. 8217 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8218 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8219 *PSE.getSE()); 8220 return new VPWidenPointerInductionRecipe( 8221 Phi, Operands[0], Step, *II, 8222 LoopVectorizationPlanner::getDecisionAndClampRange( 8223 [&](ElementCount VF) { 8224 return CM.isScalarAfterVectorization(Phi, VF); 8225 }, 8226 Range)); 8227 } 8228 return nullptr; 8229 } 8230 8231 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8232 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8233 // Optimize the special case where the source is a constant integer 8234 // induction variable. Notice that we can only optimize the 'trunc' case 8235 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8236 // (c) other casts depend on pointer size. 8237 8238 // Determine whether \p K is a truncation based on an induction variable that 8239 // can be optimized. 8240 auto isOptimizableIVTruncate = 8241 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8242 return [=](ElementCount VF) -> bool { 8243 return CM.isOptimizableIVTruncate(K, VF); 8244 }; 8245 }; 8246 8247 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8248 isOptimizableIVTruncate(I), Range)) { 8249 8250 auto *Phi = cast<PHINode>(I->getOperand(0)); 8251 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8252 VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue()); 8253 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), 8254 *OrigLoop, Range); 8255 } 8256 return nullptr; 8257 } 8258 8259 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8260 ArrayRef<VPValue *> Operands, 8261 VPlanPtr &Plan) { 8262 // If all incoming values are equal, the incoming VPValue can be used directly 8263 // instead of creating a new VPBlendRecipe. 8264 if (llvm::all_equal(Operands)) 8265 return Operands[0]; 8266 8267 unsigned NumIncoming = Phi->getNumIncomingValues(); 8268 // For in-loop reductions, we do not need to create an additional select. 8269 VPValue *InLoopVal = nullptr; 8270 for (unsigned In = 0; In < NumIncoming; In++) { 8271 PHINode *PhiOp = 8272 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8273 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8274 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8275 InLoopVal = Operands[In]; 8276 } 8277 } 8278 8279 assert((!InLoopVal || NumIncoming == 2) && 8280 "Found an in-loop reduction for PHI with unexpected number of " 8281 "incoming values"); 8282 if (InLoopVal) 8283 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8284 8285 // We know that all PHIs in non-header blocks are converted into selects, so 8286 // we don't have to worry about the insertion order and we can just use the 8287 // builder. At this point we generate the predication tree. There may be 8288 // duplications since this is a simple recursive scan, but future 8289 // optimizations will clean it up. 8290 SmallVector<VPValue *, 2> OperandsWithMask; 8291 8292 for (unsigned In = 0; In < NumIncoming; In++) { 8293 VPValue *EdgeMask = 8294 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); 8295 assert((EdgeMask || NumIncoming == 1) && 8296 "Multiple predecessors with one having a full mask"); 8297 OperandsWithMask.push_back(Operands[In]); 8298 if (EdgeMask) 8299 OperandsWithMask.push_back(EdgeMask); 8300 } 8301 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8302 } 8303 8304 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8305 ArrayRef<VPValue *> Operands, 8306 VFRange &Range, 8307 VPlanPtr &Plan) { 8308 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8309 [this, CI](ElementCount VF) { 8310 return CM.isScalarWithPredication(CI, VF); 8311 }, 8312 Range); 8313 8314 if (IsPredicated) 8315 return nullptr; 8316 8317 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8318 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8319 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8320 ID == Intrinsic::pseudoprobe || 8321 ID == Intrinsic::experimental_noalias_scope_decl)) 8322 return nullptr; 8323 8324 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); 8325 8326 // Is it beneficial to perform intrinsic call compared to lib call? 8327 bool ShouldUseVectorIntrinsic = 8328 ID && LoopVectorizationPlanner::getDecisionAndClampRange( 8329 [&](ElementCount VF) -> bool { 8330 return CM.getCallWideningDecision(CI, VF).Kind == 8331 LoopVectorizationCostModel::CM_IntrinsicCall; 8332 }, 8333 Range); 8334 if (ShouldUseVectorIntrinsic) 8335 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); 8336 8337 Function *Variant = nullptr; 8338 std::optional<unsigned> MaskPos; 8339 // Is better to call a vectorized version of the function than to to scalarize 8340 // the call? 8341 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( 8342 [&](ElementCount VF) -> bool { 8343 // The following case may be scalarized depending on the VF. 8344 // The flag shows whether we can use a usual Call for vectorized 8345 // version of the instruction. 8346 8347 // If we've found a variant at a previous VF, then stop looking. A 8348 // vectorized variant of a function expects input in a certain shape 8349 // -- basically the number of input registers, the number of lanes 8350 // per register, and whether there's a mask required. 8351 // We store a pointer to the variant in the VPWidenCallRecipe, so 8352 // once we have an appropriate variant it's only valid for that VF. 8353 // This will force a different vplan to be generated for each VF that 8354 // finds a valid variant. 8355 if (Variant) 8356 return false; 8357 LoopVectorizationCostModel::CallWideningDecision Decision = 8358 CM.getCallWideningDecision(CI, VF); 8359 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) { 8360 Variant = Decision.Variant; 8361 MaskPos = Decision.MaskPos; 8362 return true; 8363 } 8364 8365 return false; 8366 }, 8367 Range); 8368 if (ShouldUseVectorCall) { 8369 if (MaskPos.has_value()) { 8370 // We have 2 cases that would require a mask: 8371 // 1) The block needs to be predicated, either due to a conditional 8372 // in the scalar loop or use of an active lane mask with 8373 // tail-folding, and we use the appropriate mask for the block. 8374 // 2) No mask is required for the block, but the only available 8375 // vector variant at this VF requires a mask, so we synthesize an 8376 // all-true mask. 8377 VPValue *Mask = nullptr; 8378 if (Legal->isMaskRequired(CI)) 8379 Mask = createBlockInMask(CI->getParent(), *Plan); 8380 else 8381 Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( 8382 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); 8383 8384 Ops.insert(Ops.begin() + *MaskPos, Mask); 8385 } 8386 8387 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), 8388 Intrinsic::not_intrinsic, Variant); 8389 } 8390 8391 return nullptr; 8392 } 8393 8394 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8395 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8396 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8397 // Instruction should be widened, unless it is scalar after vectorization, 8398 // scalarization is profitable or it is predicated. 8399 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8400 return CM.isScalarAfterVectorization(I, VF) || 8401 CM.isProfitableToScalarize(I, VF) || 8402 CM.isScalarWithPredication(I, VF); 8403 }; 8404 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8405 Range); 8406 } 8407 8408 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, 8409 ArrayRef<VPValue *> Operands, 8410 VPBasicBlock *VPBB, VPlanPtr &Plan) { 8411 switch (I->getOpcode()) { 8412 default: 8413 return nullptr; 8414 case Instruction::SDiv: 8415 case Instruction::UDiv: 8416 case Instruction::SRem: 8417 case Instruction::URem: { 8418 // If not provably safe, use a select to form a safe divisor before widening the 8419 // div/rem operation itself. Otherwise fall through to general handling below. 8420 if (CM.isPredicatedInst(I)) { 8421 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); 8422 VPValue *Mask = createBlockInMask(I->getParent(), *Plan); 8423 VPValue *One = Plan->getVPValueOrAddLiveIn( 8424 ConstantInt::get(I->getType(), 1u, false)); 8425 auto *SafeRHS = 8426 new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, 8427 I->getDebugLoc()); 8428 VPBB->appendRecipe(SafeRHS); 8429 Ops[1] = SafeRHS; 8430 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); 8431 } 8432 [[fallthrough]]; 8433 } 8434 case Instruction::Add: 8435 case Instruction::And: 8436 case Instruction::AShr: 8437 case Instruction::FAdd: 8438 case Instruction::FCmp: 8439 case Instruction::FDiv: 8440 case Instruction::FMul: 8441 case Instruction::FNeg: 8442 case Instruction::FRem: 8443 case Instruction::FSub: 8444 case Instruction::ICmp: 8445 case Instruction::LShr: 8446 case Instruction::Mul: 8447 case Instruction::Or: 8448 case Instruction::Select: 8449 case Instruction::Shl: 8450 case Instruction::Sub: 8451 case Instruction::Xor: 8452 case Instruction::Freeze: 8453 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8454 }; 8455 } 8456 8457 void VPRecipeBuilder::fixHeaderPhis() { 8458 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8459 for (VPHeaderPHIRecipe *R : PhisToFix) { 8460 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8461 VPRecipeBase *IncR = 8462 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8463 R->addOperand(IncR->getVPSingleValue()); 8464 } 8465 } 8466 8467 VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, 8468 VFRange &Range, 8469 VPlan &Plan) { 8470 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8471 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8472 Range); 8473 8474 bool IsPredicated = CM.isPredicatedInst(I); 8475 8476 // Even if the instruction is not marked as uniform, there are certain 8477 // intrinsic calls that can be effectively treated as such, so we check for 8478 // them here. Conservatively, we only do this for scalable vectors, since 8479 // for fixed-width VFs we can always fall back on full scalarization. 8480 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8481 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8482 case Intrinsic::assume: 8483 case Intrinsic::lifetime_start: 8484 case Intrinsic::lifetime_end: 8485 // For scalable vectors if one of the operands is variant then we still 8486 // want to mark as uniform, which will generate one instruction for just 8487 // the first lane of the vector. We can't scalarize the call in the same 8488 // way as for fixed-width vectors because we don't know how many lanes 8489 // there are. 8490 // 8491 // The reasons for doing it this way for scalable vectors are: 8492 // 1. For the assume intrinsic generating the instruction for the first 8493 // lane is still be better than not generating any at all. For 8494 // example, the input may be a splat across all lanes. 8495 // 2. For the lifetime start/end intrinsics the pointer operand only 8496 // does anything useful when the input comes from a stack object, 8497 // which suggests it should always be uniform. For non-stack objects 8498 // the effect is to poison the object, which still allows us to 8499 // remove the call. 8500 IsUniform = true; 8501 break; 8502 default: 8503 break; 8504 } 8505 } 8506 VPValue *BlockInMask = nullptr; 8507 if (!IsPredicated) { 8508 // Finalize the recipe for Instr, first if it is not predicated. 8509 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8510 } else { 8511 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8512 // Instructions marked for predication are replicated and a mask operand is 8513 // added initially. Masked replicate recipes will later be placed under an 8514 // if-then construct to prevent side-effects. Generate recipes to compute 8515 // the block mask for this region. 8516 BlockInMask = createBlockInMask(I->getParent(), Plan); 8517 } 8518 8519 auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), 8520 IsUniform, BlockInMask); 8521 return toVPRecipeResult(Recipe); 8522 } 8523 8524 VPRecipeOrVPValueTy 8525 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8526 ArrayRef<VPValue *> Operands, 8527 VFRange &Range, VPBasicBlock *VPBB, 8528 VPlanPtr &Plan) { 8529 // First, check for specific widening recipes that deal with inductions, Phi 8530 // nodes, calls and memory operations. 8531 VPRecipeBase *Recipe; 8532 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8533 if (Phi->getParent() != OrigLoop->getHeader()) 8534 return tryToBlend(Phi, Operands, Plan); 8535 8536 // Always record recipes for header phis. Later first-order recurrence phis 8537 // can have earlier phis as incoming values. 8538 recordRecipeOf(Phi); 8539 8540 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8541 return toVPRecipeResult(Recipe); 8542 8543 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8544 assert((Legal->isReductionVariable(Phi) || 8545 Legal->isFixedOrderRecurrence(Phi)) && 8546 "can only widen reductions and fixed-order recurrences here"); 8547 VPValue *StartV = Operands[0]; 8548 if (Legal->isReductionVariable(Phi)) { 8549 const RecurrenceDescriptor &RdxDesc = 8550 Legal->getReductionVars().find(Phi)->second; 8551 assert(RdxDesc.getRecurrenceStartValue() == 8552 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8553 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8554 CM.isInLoopReduction(Phi), 8555 CM.useOrderedReductions(RdxDesc)); 8556 } else { 8557 // TODO: Currently fixed-order recurrences are modeled as chains of 8558 // first-order recurrences. If there are no users of the intermediate 8559 // recurrences in the chain, the fixed order recurrence should be modeled 8560 // directly, enabling more efficient codegen. 8561 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8562 } 8563 8564 // Record the incoming value from the backedge, so we can add the incoming 8565 // value from the backedge after all recipes have been created. 8566 auto *Inc = cast<Instruction>( 8567 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 8568 auto RecipeIter = Ingredient2Recipe.find(Inc); 8569 if (RecipeIter == Ingredient2Recipe.end()) 8570 recordRecipeOf(Inc); 8571 8572 PhisToFix.push_back(PhiRecipe); 8573 return toVPRecipeResult(PhiRecipe); 8574 } 8575 8576 if (isa<TruncInst>(Instr) && 8577 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8578 Range, *Plan))) 8579 return toVPRecipeResult(Recipe); 8580 8581 // All widen recipes below deal only with VF > 1. 8582 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8583 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8584 return nullptr; 8585 8586 if (auto *CI = dyn_cast<CallInst>(Instr)) 8587 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); 8588 8589 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8590 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8591 8592 if (!shouldWiden(Instr, Range)) 8593 return nullptr; 8594 8595 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8596 return toVPRecipeResult(new VPWidenGEPRecipe( 8597 GEP, make_range(Operands.begin(), Operands.end()))); 8598 8599 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8600 return toVPRecipeResult(new VPWidenSelectRecipe( 8601 *SI, make_range(Operands.begin(), Operands.end()))); 8602 } 8603 8604 if (auto *CI = dyn_cast<CastInst>(Instr)) { 8605 return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0], 8606 CI->getType(), *CI)); 8607 } 8608 8609 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); 8610 } 8611 8612 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8613 ElementCount MaxVF) { 8614 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8615 8616 auto MaxVFTimes2 = MaxVF * 2; 8617 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8618 VFRange SubRange = {VF, MaxVFTimes2}; 8619 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { 8620 // Now optimize the initial VPlan. 8621 if (!Plan->hasVF(ElementCount::getFixed(1))) 8622 VPlanTransforms::truncateToMinimalBitwidths( 8623 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); 8624 VPlanTransforms::optimize(*Plan, *PSE.getSE()); 8625 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 8626 VPlans.push_back(std::move(Plan)); 8627 } 8628 VF = SubRange.End; 8629 } 8630 } 8631 8632 // Add the necessary canonical IV and branch recipes required to control the 8633 // loop. 8634 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, 8635 DebugLoc DL) { 8636 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8637 auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); 8638 8639 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8640 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8641 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8642 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8643 Header->insert(CanonicalIVPHI, Header->begin()); 8644 8645 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar 8646 // IV by VF * UF. 8647 auto *CanonicalIVIncrement = 8648 new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, 8649 {HasNUW, false}, DL, "index.next"); 8650 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8651 8652 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8653 EB->appendRecipe(CanonicalIVIncrement); 8654 8655 // Add the BranchOnCount VPInstruction to the latch. 8656 VPInstruction *BranchBack = 8657 new VPInstruction(VPInstruction::BranchOnCount, 8658 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8659 EB->appendRecipe(BranchBack); 8660 } 8661 8662 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8663 // original exit block. 8664 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, 8665 VPlan &Plan) { 8666 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8667 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8668 // Only handle single-exit loops with unique exit blocks for now. 8669 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8670 return; 8671 8672 // Introduce VPUsers modeling the exit values. 8673 for (PHINode &ExitPhi : ExitBB->phis()) { 8674 Value *IncomingValue = 8675 ExitPhi.getIncomingValueForBlock(ExitingBB); 8676 VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); 8677 Plan.addLiveOut(&ExitPhi, V); 8678 } 8679 } 8680 8681 VPlanPtr 8682 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { 8683 8684 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8685 8686 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8687 8688 // --------------------------------------------------------------------------- 8689 // Pre-construction: record ingredients whose recipes we'll need to further 8690 // process after constructing the initial VPlan. 8691 // --------------------------------------------------------------------------- 8692 8693 // For each interleave group which is relevant for this (possibly trimmed) 8694 // Range, add it to the set of groups to be later applied to the VPlan and add 8695 // placeholders for its members' Recipes which we'll be replacing with a 8696 // single VPInterleaveRecipe. 8697 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8698 auto applyIG = [IG, this](ElementCount VF) -> bool { 8699 bool Result = (VF.isVector() && // Query is illegal for VF == 1 8700 CM.getWideningDecision(IG->getInsertPos(), VF) == 8701 LoopVectorizationCostModel::CM_Interleave); 8702 // For scalable vectors, the only interleave factor currently supported 8703 // is 2 since we require the (de)interleave2 intrinsics instead of 8704 // shufflevectors. 8705 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && 8706 "Unsupported interleave factor for scalable vectors"); 8707 return Result; 8708 }; 8709 if (!getDecisionAndClampRange(applyIG, Range)) 8710 continue; 8711 InterleaveGroups.insert(IG); 8712 for (unsigned i = 0; i < IG->getFactor(); i++) 8713 if (Instruction *Member = IG->getMember(i)) 8714 RecipeBuilder.recordRecipeOf(Member); 8715 }; 8716 8717 // --------------------------------------------------------------------------- 8718 // Build initial VPlan: Scan the body of the loop in a topological order to 8719 // visit each basic block after having visited its predecessor basic blocks. 8720 // --------------------------------------------------------------------------- 8721 8722 // Create initial VPlan skeleton, having a basic block for the pre-header 8723 // which contains SCEV expansions that need to happen before the CFG is 8724 // modified; a basic block for the vector pre-header, followed by a region for 8725 // the vector loop, followed by the middle basic block. The skeleton vector 8726 // loop region contains a header and latch basic blocks. 8727 VPlanPtr Plan = VPlan::createInitialVPlan( 8728 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8729 *PSE.getSE()); 8730 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8731 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8732 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8733 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB); 8734 Plan->getVectorLoopRegion()->setExiting(LatchVPBB); 8735 8736 // Don't use getDecisionAndClampRange here, because we don't know the UF 8737 // so this function is better to be conservative, rather than to split 8738 // it up into different VPlans. 8739 // TODO: Consider using getDecisionAndClampRange here to split up VPlans. 8740 bool IVUpdateMayOverflow = false; 8741 for (ElementCount VF : Range) 8742 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); 8743 8744 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8745 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); 8746 // When not folding the tail, we know that the induction increment will not 8747 // overflow. 8748 bool HasNUW = Style == TailFoldingStyle::None; 8749 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); 8750 8751 // Proactively create header mask. Masks for other blocks are created on 8752 // demand. 8753 RecipeBuilder.createHeaderMask(*Plan); 8754 8755 // Scan the body of the loop in a topological order to visit each basic block 8756 // after having visited its predecessor basic blocks. 8757 LoopBlocksDFS DFS(OrigLoop); 8758 DFS.perform(LI); 8759 8760 VPBasicBlock *VPBB = HeaderVPBB; 8761 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8762 // Relevant instructions from basic block BB will be grouped into VPRecipe 8763 // ingredients and fill a new VPBasicBlock. 8764 if (VPBB != HeaderVPBB) 8765 VPBB->setName(BB->getName()); 8766 Builder.setInsertPoint(VPBB); 8767 8768 // Introduce each ingredient into VPlan. 8769 // TODO: Model and preserve debug intrinsics in VPlan. 8770 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { 8771 Instruction *Instr = &I; 8772 SmallVector<VPValue *, 4> Operands; 8773 auto *Phi = dyn_cast<PHINode>(Instr); 8774 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8775 Operands.push_back(Plan->getVPValueOrAddLiveIn( 8776 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8777 } else { 8778 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8779 Operands = {OpRange.begin(), OpRange.end()}; 8780 } 8781 8782 // Invariant stores inside loop will be deleted and a single store 8783 // with the final reduction value will be added to the exit block 8784 StoreInst *SI; 8785 if ((SI = dyn_cast<StoreInst>(&I)) && 8786 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8787 continue; 8788 8789 auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8790 Instr, Operands, Range, VPBB, Plan); 8791 if (!RecipeOrValue) 8792 RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); 8793 // If Instr can be simplified to an existing VPValue, use it. 8794 if (isa<VPValue *>(RecipeOrValue)) { 8795 auto *VPV = cast<VPValue *>(RecipeOrValue); 8796 Plan->addVPValue(Instr, VPV); 8797 // If the re-used value is a recipe, register the recipe for the 8798 // instruction, in case the recipe for Instr needs to be recorded. 8799 if (VPRecipeBase *R = VPV->getDefiningRecipe()) 8800 RecipeBuilder.setRecipe(Instr, R); 8801 continue; 8802 } 8803 // Otherwise, add the new recipe. 8804 VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue); 8805 for (auto *Def : Recipe->definedValues()) { 8806 auto *UV = Def->getUnderlyingValue(); 8807 Plan->addVPValue(UV, Def); 8808 } 8809 8810 RecipeBuilder.setRecipe(Instr, Recipe); 8811 if (isa<VPHeaderPHIRecipe>(Recipe)) { 8812 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In 8813 // the following cases, VPHeaderPHIRecipes may be created after non-phi 8814 // recipes and need to be moved to the phi section of HeaderVPBB: 8815 // * tail-folding (non-phi recipes computing the header mask are 8816 // introduced earlier than regular header phi recipes, and should appear 8817 // after them) 8818 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 8819 8820 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || 8821 CM.foldTailByMasking() || isa<TruncInst>(Instr)) && 8822 "unexpected recipe needs moving"); 8823 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8824 } else 8825 VPBB->appendRecipe(Recipe); 8826 } 8827 8828 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8829 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8830 } 8831 8832 // After here, VPBB should not be used. 8833 VPBB = nullptr; 8834 8835 if (CM.requiresScalarEpilogue(Range)) { 8836 // No edge from the middle block to the unique exit block has been inserted 8837 // and there is nothing to fix from vector loop; phis should have incoming 8838 // from scalar loop only. 8839 } else 8840 addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan); 8841 8842 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8843 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8844 "entry block must be set to a VPRegionBlock having a non-empty entry " 8845 "VPBasicBlock"); 8846 RecipeBuilder.fixHeaderPhis(); 8847 8848 // --------------------------------------------------------------------------- 8849 // Transform initial VPlan: Apply previously taken decisions, in order, to 8850 // bring the VPlan to its final state. 8851 // --------------------------------------------------------------------------- 8852 8853 // Adjust the recipes for any inloop reductions. 8854 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start); 8855 8856 // Interleave memory: for each Interleave Group we marked earlier as relevant 8857 // for this VPlan, replace the Recipes widening its memory instructions with a 8858 // single VPInterleaveRecipe at its insertion point. 8859 for (const auto *IG : InterleaveGroups) { 8860 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8861 RecipeBuilder.getRecipe(IG->getInsertPos())); 8862 SmallVector<VPValue *, 4> StoredValues; 8863 for (unsigned i = 0; i < IG->getFactor(); ++i) 8864 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8865 auto *StoreR = 8866 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 8867 StoredValues.push_back(StoreR->getStoredValue()); 8868 } 8869 8870 bool NeedsMaskForGaps = 8871 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); 8872 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8873 Recipe->getMask(), NeedsMaskForGaps); 8874 VPIG->insertBefore(Recipe); 8875 unsigned J = 0; 8876 for (unsigned i = 0; i < IG->getFactor(); ++i) 8877 if (Instruction *Member = IG->getMember(i)) { 8878 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); 8879 if (!Member->getType()->isVoidTy()) { 8880 VPValue *OriginalV = MemberR->getVPSingleValue(); 8881 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8882 J++; 8883 } 8884 MemberR->eraseFromParent(); 8885 } 8886 } 8887 8888 for (ElementCount VF : Range) 8889 Plan->addVF(VF); 8890 Plan->setName("Initial VPlan"); 8891 8892 // Replace VPValues for known constant strides guaranteed by predicate scalar 8893 // evolution. 8894 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 8895 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 8896 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 8897 // Only handle constant strides for now. 8898 if (!ScevStride) 8899 continue; 8900 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); 8901 8902 auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); 8903 // The versioned value may not be used in the loop directly, so just add a 8904 // new live-in in those cases. 8905 Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); 8906 } 8907 8908 // From this point onwards, VPlan-to-VPlan transformations may change the plan 8909 // in ways that accessing values using original IR values is incorrect. 8910 Plan->disableValue2VPValue(); 8911 8912 // Sink users of fixed-order recurrence past the recipe defining the previous 8913 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 8914 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) 8915 return nullptr; 8916 8917 if (useActiveLaneMask(Style)) { 8918 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once 8919 // TailFoldingStyle is visible there. 8920 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); 8921 bool WithoutRuntimeCheck = 8922 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 8923 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, 8924 WithoutRuntimeCheck); 8925 } 8926 return Plan; 8927 } 8928 8929 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8930 // Outer loop handling: They may require CFG and instruction level 8931 // transformations before even evaluating whether vectorization is profitable. 8932 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8933 // the vectorization pipeline. 8934 assert(!OrigLoop->isInnermost()); 8935 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8936 8937 // Create new empty VPlan 8938 auto Plan = VPlan::createInitialVPlan( 8939 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8940 *PSE.getSE()); 8941 8942 // Build hierarchical CFG 8943 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8944 HCFGBuilder.buildHierarchicalCFG(); 8945 8946 for (ElementCount VF : Range) 8947 Plan->addVF(VF); 8948 8949 VPlanTransforms::VPInstructionsToVPRecipes( 8950 Plan, 8951 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 8952 *PSE.getSE(), *TLI); 8953 8954 // Remove the existing terminator of the exiting block of the top-most region. 8955 // A BranchOnCount will be added instead when adding the canonical IV recipes. 8956 auto *Term = 8957 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); 8958 Term->eraseFromParent(); 8959 8960 // Tail folding is not supported for outer loops, so the induction increment 8961 // is guaranteed to not wrap. 8962 bool HasNUW = true; 8963 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, 8964 DebugLoc()); 8965 return Plan; 8966 } 8967 8968 // Adjust the recipes for reductions. For in-loop reductions the chain of 8969 // instructions leading from the loop exit instr to the phi need to be converted 8970 // to reductions, with one operand being vector and the other being the scalar 8971 // reduction chain. For other reductions, a select is introduced between the phi 8972 // and live-out recipes when folding the tail. 8973 void LoopVectorizationPlanner::adjustRecipesForReductions( 8974 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 8975 ElementCount MinVF) { 8976 VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8977 // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores 8978 // sank outside of the loop would keep the same order as they had in the 8979 // original loop. 8980 SmallVector<VPReductionPHIRecipe *> ReductionPHIList; 8981 for (VPRecipeBase &R : Header->phis()) { 8982 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 8983 ReductionPHIList.emplace_back(ReductionPhi); 8984 } 8985 bool HasIntermediateStore = false; 8986 stable_sort(ReductionPHIList, 8987 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, 8988 const VPReductionPHIRecipe *R2) { 8989 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; 8990 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; 8991 HasIntermediateStore |= IS1 || IS2; 8992 8993 // If neither of the recipes has an intermediate store, keep the 8994 // order the same. 8995 if (!IS1 && !IS2) 8996 return false; 8997 8998 // If only one of the recipes has an intermediate store, then 8999 // move it towards the beginning of the list. 9000 if (IS1 && !IS2) 9001 return true; 9002 9003 if (!IS1 && IS2) 9004 return false; 9005 9006 // If both recipes have an intermediate store, then the recipe 9007 // with the later store should be processed earlier. So it 9008 // should go to the beginning of the list. 9009 return DT->dominates(IS2, IS1); 9010 }); 9011 9012 if (HasIntermediateStore && ReductionPHIList.size() > 1) 9013 for (VPRecipeBase *R : ReductionPHIList) 9014 R->moveBefore(*Header, Header->getFirstNonPhi()); 9015 9016 SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis; 9017 for (VPRecipeBase &R : Header->phis()) { 9018 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9019 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) 9020 continue; 9021 InLoopReductionPhis.push_back(PhiR); 9022 } 9023 9024 for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) { 9025 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9026 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9027 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 9028 "AnyOf reductions are not allowed for in-loop reductions"); 9029 9030 // Collect the chain of "link" recipes for the reduction starting at PhiR. 9031 SetVector<VPRecipeBase *> Worklist; 9032 Worklist.insert(PhiR); 9033 for (unsigned I = 0; I != Worklist.size(); ++I) { 9034 VPRecipeBase *Cur = Worklist[I]; 9035 for (VPUser *U : Cur->getVPSingleValue()->users()) { 9036 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 9037 if (!UserRecipe) 9038 continue; 9039 assert(UserRecipe->getNumDefinedValues() == 1 && 9040 "recipes must define exactly one result value"); 9041 Worklist.insert(UserRecipe); 9042 } 9043 } 9044 9045 // Visit operation "Links" along the reduction chain top-down starting from 9046 // the phi until LoopExitValue. We keep track of the previous item 9047 // (PreviousLink) to tell which of the two operands of a Link will remain 9048 // scalar and which will be reduced. For minmax by select(cmp), Link will be 9049 // the select instructions. 9050 VPRecipeBase *PreviousLink = PhiR; // Aka Worklist[0]. 9051 for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) { 9052 VPValue *PreviousLinkV = PreviousLink->getVPSingleValue(); 9053 9054 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); 9055 9056 // Index of the first operand which holds a non-mask vector operand. 9057 unsigned IndexOfFirstOperand; 9058 // Recognize a call to the llvm.fmuladd intrinsic. 9059 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9060 VPValue *VecOp; 9061 VPBasicBlock *LinkVPBB = CurrentLink->getParent(); 9062 if (IsFMulAdd) { 9063 assert( 9064 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && 9065 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9066 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || 9067 isa<VPWidenCallRecipe>(CurrentLink)) && 9068 CurrentLink->getOperand(2) == PreviousLinkV && 9069 "expected a call where the previous link is the added operand"); 9070 9071 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9072 // need to create an fmul recipe (multiplying the first two operands of 9073 // the fmuladd together) to use as the vector operand for the fadd 9074 // reduction. 9075 VPInstruction *FMulRecipe = new VPInstruction( 9076 Instruction::FMul, 9077 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, 9078 CurrentLinkI->getFastMathFlags()); 9079 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); 9080 VecOp = FMulRecipe; 9081 } else { 9082 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9083 if (isa<VPWidenRecipe>(CurrentLink)) { 9084 assert(isa<CmpInst>(CurrentLinkI) && 9085 "need to have the compare of the select"); 9086 continue; 9087 } 9088 assert(isa<VPWidenSelectRecipe>(CurrentLink) && 9089 "must be a select recipe"); 9090 IndexOfFirstOperand = 1; 9091 } else { 9092 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && 9093 "Expected to replace a VPWidenSC"); 9094 IndexOfFirstOperand = 0; 9095 } 9096 // Note that for non-commutable operands (cmp-selects), the semantics of 9097 // the cmp-select are captured in the recurrence kind. 9098 unsigned VecOpId = 9099 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLinkV 9100 ? IndexOfFirstOperand + 1 9101 : IndexOfFirstOperand; 9102 VecOp = CurrentLink->getOperand(VecOpId); 9103 assert(VecOp != PreviousLinkV && 9104 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - 9105 (VecOpId - IndexOfFirstOperand)) == 9106 PreviousLinkV && 9107 "PreviousLinkV must be the operand other than VecOp"); 9108 } 9109 9110 BasicBlock *BB = CurrentLinkI->getParent(); 9111 VPValue *CondOp = nullptr; 9112 if (CM.blockNeedsPredicationForAnyReason(BB)) { 9113 VPBuilder::InsertPointGuard Guard(Builder); 9114 Builder.setInsertPoint(CurrentLink); 9115 CondOp = RecipeBuilder.createBlockInMask(BB, *Plan); 9116 } 9117 9118 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9119 RdxDesc, CurrentLinkI, PreviousLinkV, VecOp, CondOp); 9120 // Append the recipe to the end of the VPBasicBlock because we need to 9121 // ensure that it comes after all of it's inputs, including CondOp. 9122 // Note that this transformation may leave over dead recipes (including 9123 // CurrentLink), which will be cleaned by a later VPlan transform. 9124 LinkVPBB->appendRecipe(RedRecipe); 9125 CurrentLink->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9126 PreviousLink = RedRecipe; 9127 } 9128 } 9129 Builder.setInsertPoint(&*LatchVPBB->begin()); 9130 for (VPRecipeBase &R : 9131 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9132 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9133 if (!PhiR || PhiR->isInLoop()) 9134 continue; 9135 9136 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9137 auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe(); 9138 // If tail is folded by masking, introduce selects between the phi 9139 // and the live-out instruction of each reduction, at the beginning of the 9140 // dedicated latch block. 9141 if (CM.foldTailByMasking()) { 9142 VPValue *Cond = 9143 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); 9144 VPValue *Red = PhiR->getBackedgeValue(); 9145 assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && 9146 "reduction recipe must be defined before latch"); 9147 FastMathFlags FMFs = RdxDesc.getFastMathFlags(); 9148 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); 9149 Result = 9150 PhiTy->isFloatingPointTy() 9151 ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs) 9152 : new VPInstruction(Instruction::Select, {Cond, Red, PhiR}); 9153 Result->insertBefore(&*Builder.getInsertPoint()); 9154 Red->replaceUsesWithIf( 9155 Result->getVPSingleValue(), 9156 [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); }); 9157 if (PreferPredicatedReductionSelect || 9158 TTI.preferPredicatedReductionSelect( 9159 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, 9160 TargetTransformInfo::ReductionFlags())) 9161 PhiR->setOperand(1, Result->getVPSingleValue()); 9162 } 9163 // If the vector reduction can be performed in a smaller type, we truncate 9164 // then extend the loop exit value to enable InstCombine to evaluate the 9165 // entire expression in the smaller type. 9166 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); 9167 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 9168 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 9169 Type *RdxTy = RdxDesc.getRecurrenceType(); 9170 auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc, 9171 Result->getVPSingleValue(), RdxTy); 9172 auto *Extnd = 9173 RdxDesc.isSigned() 9174 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) 9175 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); 9176 9177 Trunc->insertAfter(Result); 9178 Extnd->insertAfter(Trunc); 9179 Result->getVPSingleValue()->replaceAllUsesWith(Extnd); 9180 Trunc->setOperand(0, Result->getVPSingleValue()); 9181 } 9182 } 9183 9184 VPlanTransforms::clearReductionWrapFlags(*Plan); 9185 } 9186 9187 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9188 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9189 VPSlotTracker &SlotTracker) const { 9190 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9191 IG->getInsertPos()->printAsOperand(O, false); 9192 O << ", "; 9193 getAddr()->printAsOperand(O, SlotTracker); 9194 VPValue *Mask = getMask(); 9195 if (Mask) { 9196 O << ", "; 9197 Mask->printAsOperand(O, SlotTracker); 9198 } 9199 9200 unsigned OpIdx = 0; 9201 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9202 if (!IG->getMember(i)) 9203 continue; 9204 if (getNumStoreOperands() > 0) { 9205 O << "\n" << Indent << " store "; 9206 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9207 O << " to index " << i; 9208 } else { 9209 O << "\n" << Indent << " "; 9210 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9211 O << " = load from index " << i; 9212 } 9213 ++OpIdx; 9214 } 9215 } 9216 #endif 9217 9218 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9219 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9220 "Not a pointer induction according to InductionDescriptor!"); 9221 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9222 "Unexpected type."); 9223 9224 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9225 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9226 9227 if (onlyScalarsGenerated(State.VF)) { 9228 // This is the normalized GEP that starts counting at zero. 9229 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9230 CanonicalIV, IndDesc.getStep()->getType()); 9231 // Determine the number of scalars we need to generate for each unroll 9232 // iteration. If the instruction is uniform, we only need to generate the 9233 // first lane. Otherwise, we generate all VF values. 9234 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9235 assert((IsUniform || !State.VF.isScalable()) && 9236 "Cannot scalarize a scalable VF"); 9237 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9238 9239 for (unsigned Part = 0; Part < State.UF; ++Part) { 9240 Value *PartStart = 9241 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9242 9243 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9244 Value *Idx = State.Builder.CreateAdd( 9245 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9246 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9247 9248 Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); 9249 Value *SclrGep = emitTransformedIndex( 9250 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, 9251 IndDesc.getKind(), IndDesc.getInductionBinOp()); 9252 SclrGep->setName("next.gep"); 9253 State.set(this, SclrGep, VPIteration(Part, Lane)); 9254 } 9255 } 9256 return; 9257 } 9258 9259 Type *PhiType = IndDesc.getStep()->getType(); 9260 9261 // Build a pointer phi 9262 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9263 Type *ScStValueType = ScalarStartValue->getType(); 9264 PHINode *NewPointerPhi = 9265 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9266 9267 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9268 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9269 9270 // A pointer induction, performed by using a gep 9271 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9272 9273 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 9274 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9275 Value *NumUnrolledElems = 9276 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9277 Value *InductionGEP = GetElementPtrInst::Create( 9278 State.Builder.getInt8Ty(), NewPointerPhi, 9279 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9280 InductionLoc); 9281 // Add induction update using an incorrect block temporarily. The phi node 9282 // will be fixed after VPlan execution. Note that at this point the latch 9283 // block cannot be used, as it does not exist yet. 9284 // TODO: Model increment value in VPlan, by turning the recipe into a 9285 // multi-def and a subclass of VPHeaderPHIRecipe. 9286 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9287 9288 // Create UF many actual address geps that use the pointer 9289 // phi as base and a vectorized version of the step value 9290 // (<step*0, ..., step*N>) as offset. 9291 for (unsigned Part = 0; Part < State.UF; ++Part) { 9292 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9293 Value *StartOffsetScalar = 9294 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9295 Value *StartOffset = 9296 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9297 // Create a vector of consecutive numbers from zero to VF. 9298 StartOffset = State.Builder.CreateAdd( 9299 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9300 9301 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && 9302 "scalar step must be the same across all parts"); 9303 Value *GEP = State.Builder.CreateGEP( 9304 State.Builder.getInt8Ty(), NewPointerPhi, 9305 State.Builder.CreateMul( 9306 StartOffset, 9307 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9308 "vector.gep")); 9309 State.set(this, GEP, Part); 9310 } 9311 } 9312 9313 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9314 assert(!State.Instance && "VPDerivedIVRecipe being replicated."); 9315 9316 // Fast-math-flags propagate from the original induction instruction. 9317 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9318 if (FPBinOp) 9319 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); 9320 9321 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9322 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9323 Value *DerivedIV = emitTransformedIndex( 9324 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, 9325 Kind, cast_if_present<BinaryOperator>(FPBinOp)); 9326 DerivedIV->setName("offset.idx"); 9327 if (TruncResultTy) { 9328 assert(TruncResultTy != DerivedIV->getType() && 9329 Step->getType()->isIntegerTy() && 9330 "Truncation requires an integer step"); 9331 DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy); 9332 } 9333 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); 9334 9335 State.set(this, DerivedIV, VPIteration(0, 0)); 9336 } 9337 9338 void VPInterleaveRecipe::execute(VPTransformState &State) { 9339 assert(!State.Instance && "Interleave group being replicated."); 9340 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9341 getStoredValues(), getMask(), 9342 NeedsMaskForGaps); 9343 } 9344 9345 void VPReductionRecipe::execute(VPTransformState &State) { 9346 assert(!State.Instance && "Reduction being replicated."); 9347 Value *PrevInChain = State.get(getChainOp(), 0); 9348 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9349 bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc); 9350 // Propagate the fast-math flags carried by the underlying instruction. 9351 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9352 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 9353 for (unsigned Part = 0; Part < State.UF; ++Part) { 9354 Value *NewVecOp = State.get(getVecOp(), Part); 9355 if (VPValue *Cond = getCondOp()) { 9356 Value *NewCond = State.VF.isVector() ? State.get(Cond, Part) 9357 : State.get(Cond, {Part, 0}); 9358 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 9359 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); 9360 Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, 9361 RdxDesc.getFastMathFlags()); 9362 if (State.VF.isVector()) { 9363 Iden = 9364 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9365 } 9366 9367 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden); 9368 NewVecOp = Select; 9369 } 9370 Value *NewRed; 9371 Value *NextInChain; 9372 if (IsOrdered) { 9373 if (State.VF.isVector()) 9374 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, 9375 PrevInChain); 9376 else 9377 NewRed = State.Builder.CreateBinOp( 9378 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, 9379 NewVecOp); 9380 PrevInChain = NewRed; 9381 } else { 9382 PrevInChain = State.get(getChainOp(), Part); 9383 NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); 9384 } 9385 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9386 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 9387 NewRed, PrevInChain); 9388 } else if (IsOrdered) 9389 NextInChain = NewRed; 9390 else 9391 NextInChain = State.Builder.CreateBinOp( 9392 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); 9393 State.set(this, NextInChain, Part); 9394 } 9395 } 9396 9397 void VPReplicateRecipe::execute(VPTransformState &State) { 9398 Instruction *UI = getUnderlyingInstr(); 9399 if (State.Instance) { // Generate a single instance. 9400 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9401 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); 9402 // Insert scalar instance packing it into a vector. 9403 if (State.VF.isVector() && shouldPack()) { 9404 // If we're constructing lane 0, initialize to start from poison. 9405 if (State.Instance->Lane.isFirstLane()) { 9406 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9407 Value *Poison = PoisonValue::get( 9408 VectorType::get(UI->getType(), State.VF)); 9409 State.set(this, Poison, State.Instance->Part); 9410 } 9411 State.packScalarIntoVectorValue(this, *State.Instance); 9412 } 9413 return; 9414 } 9415 9416 if (IsUniform) { 9417 // If the recipe is uniform across all parts (instead of just per VF), only 9418 // generate a single instance. 9419 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) && 9420 all_of(operands(), [](VPValue *Op) { 9421 return Op->isDefinedOutsideVectorRegions(); 9422 })) { 9423 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); 9424 if (user_begin() != user_end()) { 9425 for (unsigned Part = 1; Part < State.UF; ++Part) 9426 State.set(this, State.get(this, VPIteration(0, 0)), 9427 VPIteration(Part, 0)); 9428 } 9429 return; 9430 } 9431 9432 // Uniform within VL means we need to generate lane 0 only for each 9433 // unrolled copy. 9434 for (unsigned Part = 0; Part < State.UF; ++Part) 9435 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); 9436 return; 9437 } 9438 9439 // A store of a loop varying value to a uniform address only needs the last 9440 // copy of the store. 9441 if (isa<StoreInst>(UI) && 9442 vputils::isUniformAfterVectorization(getOperand(1))) { 9443 auto Lane = VPLane::getLastLaneForVF(State.VF); 9444 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), 9445 State); 9446 return; 9447 } 9448 9449 // Generate scalar instances for all VF lanes of all UF parts. 9450 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9451 const unsigned EndLane = State.VF.getKnownMinValue(); 9452 for (unsigned Part = 0; Part < State.UF; ++Part) 9453 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9454 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); 9455 } 9456 9457 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9458 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9459 9460 // Attempt to issue a wide load. 9461 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9462 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9463 9464 assert((LI || SI) && "Invalid Load/Store instruction"); 9465 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9466 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9467 9468 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9469 9470 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9471 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9472 bool CreateGatherScatter = !isConsecutive(); 9473 9474 auto &Builder = State.Builder; 9475 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9476 bool isMaskRequired = getMask(); 9477 if (isMaskRequired) { 9478 // Mask reversal is only neede for non-all-one (null) masks, as reverse of a 9479 // null all-one mask is a null mask. 9480 for (unsigned Part = 0; Part < State.UF; ++Part) { 9481 Value *Mask = State.get(getMask(), Part); 9482 if (isReverse()) 9483 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 9484 BlockInMaskParts[Part] = Mask; 9485 } 9486 } 9487 9488 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9489 // Calculate the pointer for the specific unroll-part. 9490 Value *PartPtr = nullptr; 9491 9492 // Use i32 for the gep index type when the value is constant, 9493 // or query DataLayout for a more suitable index type otherwise. 9494 const DataLayout &DL = 9495 Builder.GetInsertBlock()->getModule()->getDataLayout(); 9496 Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0) 9497 ? DL.getIndexType(PointerType::getUnqual( 9498 ScalarDataTy->getContext())) 9499 : Builder.getInt32Ty(); 9500 bool InBounds = false; 9501 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9502 InBounds = gep->isInBounds(); 9503 if (isReverse()) { 9504 // If the address is consecutive but reversed, then the 9505 // wide store needs to start at the last vector element. 9506 // RunTimeVF = VScale * VF.getKnownMinValue() 9507 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9508 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 9509 // NumElt = -Part * RunTimeVF 9510 Value *NumElt = 9511 Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 9512 // LastLane = 1 - RunTimeVF 9513 Value *LastLane = 9514 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 9515 PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds); 9516 PartPtr = 9517 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); 9518 } else { 9519 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 9520 PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); 9521 } 9522 9523 return PartPtr; 9524 }; 9525 9526 // Handle Stores: 9527 if (SI) { 9528 State.setDebugLocFrom(SI->getDebugLoc()); 9529 9530 for (unsigned Part = 0; Part < State.UF; ++Part) { 9531 Instruction *NewSI = nullptr; 9532 Value *StoredVal = State.get(StoredValue, Part); 9533 if (CreateGatherScatter) { 9534 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9535 Value *VectorGep = State.get(getAddr(), Part); 9536 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9537 MaskPart); 9538 } else { 9539 if (isReverse()) { 9540 // If we store to reverse consecutive memory locations, then we need 9541 // to reverse the order of elements in the stored value. 9542 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9543 // We don't want to update the value in the map as it might be used in 9544 // another expression. So don't call resetVectorValue(StoredVal). 9545 } 9546 auto *VecPtr = 9547 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9548 if (isMaskRequired) 9549 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9550 BlockInMaskParts[Part]); 9551 else 9552 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9553 } 9554 State.addMetadata(NewSI, SI); 9555 } 9556 return; 9557 } 9558 9559 // Handle loads. 9560 assert(LI && "Must have a load instruction"); 9561 State.setDebugLocFrom(LI->getDebugLoc()); 9562 for (unsigned Part = 0; Part < State.UF; ++Part) { 9563 Value *NewLI; 9564 if (CreateGatherScatter) { 9565 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9566 Value *VectorGep = State.get(getAddr(), Part); 9567 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9568 nullptr, "wide.masked.gather"); 9569 State.addMetadata(NewLI, LI); 9570 } else { 9571 auto *VecPtr = 9572 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9573 if (isMaskRequired) 9574 NewLI = Builder.CreateMaskedLoad( 9575 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9576 PoisonValue::get(DataTy), "wide.masked.load"); 9577 else 9578 NewLI = 9579 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9580 9581 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9582 State.addMetadata(NewLI, LI); 9583 if (Reverse) 9584 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9585 } 9586 9587 State.set(getVPSingleValue(), NewLI, Part); 9588 } 9589 } 9590 9591 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9592 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9593 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9594 // for predication. 9595 static ScalarEpilogueLowering getScalarEpilogueLowering( 9596 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9597 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9598 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9599 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9600 // don't look at hints or options, and don't request a scalar epilogue. 9601 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9602 // LoopAccessInfo (due to code dependency and not being able to reliably get 9603 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9604 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9605 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9606 // back to the old way and vectorize with versioning when forced. See D81345.) 9607 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9608 PGSOQueryType::IRPass) && 9609 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9610 return CM_ScalarEpilogueNotAllowedOptSize; 9611 9612 // 2) If set, obey the directives 9613 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9614 switch (PreferPredicateOverEpilogue) { 9615 case PreferPredicateTy::ScalarEpilogue: 9616 return CM_ScalarEpilogueAllowed; 9617 case PreferPredicateTy::PredicateElseScalarEpilogue: 9618 return CM_ScalarEpilogueNotNeededUsePredicate; 9619 case PreferPredicateTy::PredicateOrDontVectorize: 9620 return CM_ScalarEpilogueNotAllowedUsePredicate; 9621 }; 9622 } 9623 9624 // 3) If set, obey the hints 9625 switch (Hints.getPredicate()) { 9626 case LoopVectorizeHints::FK_Enabled: 9627 return CM_ScalarEpilogueNotNeededUsePredicate; 9628 case LoopVectorizeHints::FK_Disabled: 9629 return CM_ScalarEpilogueAllowed; 9630 }; 9631 9632 // 4) if the TTI hook indicates this is profitable, request predication. 9633 TailFoldingInfo TFI(TLI, &LVL, IAI); 9634 if (TTI->preferPredicateOverEpilogue(&TFI)) 9635 return CM_ScalarEpilogueNotNeededUsePredicate; 9636 9637 return CM_ScalarEpilogueAllowed; 9638 } 9639 9640 // Process the loop in the VPlan-native vectorization path. This path builds 9641 // VPlan upfront in the vectorization pipeline, which allows to apply 9642 // VPlan-to-VPlan transformations from the very beginning without modifying the 9643 // input LLVM IR. 9644 static bool processLoopInVPlanNativePath( 9645 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9646 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9647 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9648 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9649 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9650 LoopVectorizationRequirements &Requirements) { 9651 9652 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9653 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9654 return false; 9655 } 9656 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9657 Function *F = L->getHeader()->getParent(); 9658 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9659 9660 ScalarEpilogueLowering SEL = 9661 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 9662 9663 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9664 &Hints, IAI); 9665 // Use the planner for outer loop vectorization. 9666 // TODO: CM is not used at this point inside the planner. Turn CM into an 9667 // optional argument if we don't need it in the future. 9668 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, 9669 ORE); 9670 9671 // Get user vectorization factor. 9672 ElementCount UserVF = Hints.getWidth(); 9673 9674 CM.collectElementTypesForWidening(); 9675 9676 // Plan how to best vectorize, return the best VF and its cost. 9677 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9678 9679 // If we are stress testing VPlan builds, do not attempt to generate vector 9680 // code. Masked vector code generation support will follow soon. 9681 // Also, do not attempt to vectorize if no vector code will be produced. 9682 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 9683 return false; 9684 9685 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 9686 9687 { 9688 bool AddBranchWeights = 9689 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 9690 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 9691 F->getParent()->getDataLayout(), AddBranchWeights); 9692 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 9693 VF.Width, 1, LVL, &CM, BFI, PSI, Checks); 9694 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9695 << L->getHeader()->getParent()->getName() << "\"\n"); 9696 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 9697 } 9698 9699 reportVectorization(ORE, L, VF, 1); 9700 9701 // Mark the loop as already vectorized to avoid vectorizing again. 9702 Hints.setAlreadyVectorized(); 9703 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9704 return true; 9705 } 9706 9707 // Emit a remark if there are stores to floats that required a floating point 9708 // extension. If the vectorized loop was generated with floating point there 9709 // will be a performance penalty from the conversion overhead and the change in 9710 // the vector width. 9711 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9712 SmallVector<Instruction *, 4> Worklist; 9713 for (BasicBlock *BB : L->getBlocks()) { 9714 for (Instruction &Inst : *BB) { 9715 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9716 if (S->getValueOperand()->getType()->isFloatTy()) 9717 Worklist.push_back(S); 9718 } 9719 } 9720 } 9721 9722 // Traverse the floating point stores upwards searching, for floating point 9723 // conversions. 9724 SmallPtrSet<const Instruction *, 4> Visited; 9725 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9726 while (!Worklist.empty()) { 9727 auto *I = Worklist.pop_back_val(); 9728 if (!L->contains(I)) 9729 continue; 9730 if (!Visited.insert(I).second) 9731 continue; 9732 9733 // Emit a remark if the floating point store required a floating 9734 // point conversion. 9735 // TODO: More work could be done to identify the root cause such as a 9736 // constant or a function return type and point the user to it. 9737 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9738 ORE->emit([&]() { 9739 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9740 I->getDebugLoc(), L->getHeader()) 9741 << "floating point conversion changes vector width. " 9742 << "Mixed floating point precision requires an up/down " 9743 << "cast that will negatively impact performance."; 9744 }); 9745 9746 for (Use &Op : I->operands()) 9747 if (auto *OpI = dyn_cast<Instruction>(Op)) 9748 Worklist.push_back(OpI); 9749 } 9750 } 9751 9752 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 9753 VectorizationFactor &VF, 9754 std::optional<unsigned> VScale, Loop *L, 9755 ScalarEvolution &SE, 9756 ScalarEpilogueLowering SEL) { 9757 InstructionCost CheckCost = Checks.getCost(); 9758 if (!CheckCost.isValid()) 9759 return false; 9760 9761 // When interleaving only scalar and vector cost will be equal, which in turn 9762 // would lead to a divide by 0. Fall back to hard threshold. 9763 if (VF.Width.isScalar()) { 9764 if (CheckCost > VectorizeMemoryCheckThreshold) { 9765 LLVM_DEBUG( 9766 dbgs() 9767 << "LV: Interleaving only is not profitable due to runtime checks\n"); 9768 return false; 9769 } 9770 return true; 9771 } 9772 9773 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 9774 double ScalarC = *VF.ScalarCost.getValue(); 9775 if (ScalarC == 0) 9776 return true; 9777 9778 // First, compute the minimum iteration count required so that the vector 9779 // loop outperforms the scalar loop. 9780 // The total cost of the scalar loop is 9781 // ScalarC * TC 9782 // where 9783 // * TC is the actual trip count of the loop. 9784 // * ScalarC is the cost of a single scalar iteration. 9785 // 9786 // The total cost of the vector loop is 9787 // RtC + VecC * (TC / VF) + EpiC 9788 // where 9789 // * RtC is the cost of the generated runtime checks 9790 // * VecC is the cost of a single vector iteration. 9791 // * TC is the actual trip count of the loop 9792 // * VF is the vectorization factor 9793 // * EpiCost is the cost of the generated epilogue, including the cost 9794 // of the remaining scalar operations. 9795 // 9796 // Vectorization is profitable once the total vector cost is less than the 9797 // total scalar cost: 9798 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 9799 // 9800 // Now we can compute the minimum required trip count TC as 9801 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC 9802 // 9803 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 9804 // the computations are performed on doubles, not integers and the result 9805 // is rounded up, hence we get an upper estimate of the TC. 9806 unsigned IntVF = VF.Width.getKnownMinValue(); 9807 if (VF.Width.isScalable()) { 9808 unsigned AssumedMinimumVscale = 1; 9809 if (VScale) 9810 AssumedMinimumVscale = *VScale; 9811 IntVF *= AssumedMinimumVscale; 9812 } 9813 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; 9814 double RtC = *CheckCost.getValue(); 9815 double MinTC1 = RtC / (ScalarC - VecCOverVF); 9816 9817 // Second, compute a minimum iteration count so that the cost of the 9818 // runtime checks is only a fraction of the total scalar loop cost. This 9819 // adds a loop-dependent bound on the overhead incurred if the runtime 9820 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 9821 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 9822 // cost, compute 9823 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 9824 double MinTC2 = RtC * 10 / ScalarC; 9825 9826 // Now pick the larger minimum. If it is not a multiple of VF and a scalar 9827 // epilogue is allowed, choose the next closest multiple of VF. This should 9828 // partly compensate for ignoring the epilogue cost. 9829 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); 9830 if (SEL == CM_ScalarEpilogueAllowed) 9831 MinTC = alignTo(MinTC, IntVF); 9832 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); 9833 9834 LLVM_DEBUG( 9835 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 9836 << VF.MinProfitableTripCount << "\n"); 9837 9838 // Skip vectorization if the expected trip count is less than the minimum 9839 // required trip count. 9840 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { 9841 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 9842 VF.MinProfitableTripCount)) { 9843 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 9844 "trip count < minimum profitable VF (" 9845 << *ExpectedTC << " < " << VF.MinProfitableTripCount 9846 << ")\n"); 9847 9848 return false; 9849 } 9850 } 9851 return true; 9852 } 9853 9854 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9855 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9856 !EnableLoopInterleaving), 9857 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9858 !EnableLoopVectorization) {} 9859 9860 bool LoopVectorizePass::processLoop(Loop *L) { 9861 assert((EnableVPlanNativePath || L->isInnermost()) && 9862 "VPlan-native path is not enabled. Only process inner loops."); 9863 9864 #ifndef NDEBUG 9865 const std::string DebugLocStr = getDebugLocString(L); 9866 #endif /* NDEBUG */ 9867 9868 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 9869 << L->getHeader()->getParent()->getName() << "' from " 9870 << DebugLocStr << "\n"); 9871 9872 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 9873 9874 LLVM_DEBUG( 9875 dbgs() << "LV: Loop hints:" 9876 << " force=" 9877 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9878 ? "disabled" 9879 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9880 ? "enabled" 9881 : "?")) 9882 << " width=" << Hints.getWidth() 9883 << " interleave=" << Hints.getInterleave() << "\n"); 9884 9885 // Function containing loop 9886 Function *F = L->getHeader()->getParent(); 9887 9888 // Looking at the diagnostic output is the only way to determine if a loop 9889 // was vectorized (other than looking at the IR or machine code), so it 9890 // is important to generate an optimization remark for each loop. Most of 9891 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9892 // generated as OptimizationRemark and OptimizationRemarkMissed are 9893 // less verbose reporting vectorized loops and unvectorized loops that may 9894 // benefit from vectorization, respectively. 9895 9896 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9897 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9898 return false; 9899 } 9900 9901 PredicatedScalarEvolution PSE(*SE, *L); 9902 9903 // Check if it is legal to vectorize the loop. 9904 LoopVectorizationRequirements Requirements; 9905 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 9906 &Requirements, &Hints, DB, AC, BFI, PSI); 9907 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9908 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9909 Hints.emitRemarkWithHints(); 9910 return false; 9911 } 9912 9913 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9914 // here. They may require CFG and instruction level transformations before 9915 // even evaluating whether vectorization is profitable. Since we cannot modify 9916 // the incoming IR, we need to build VPlan upfront in the vectorization 9917 // pipeline. 9918 if (!L->isInnermost()) 9919 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9920 ORE, BFI, PSI, Hints, Requirements); 9921 9922 assert(L->isInnermost() && "Inner loop expected."); 9923 9924 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9925 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9926 9927 // If an override option has been passed in for interleaved accesses, use it. 9928 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9929 UseInterleaved = EnableInterleavedMemAccesses; 9930 9931 // Analyze interleaved memory accesses. 9932 if (UseInterleaved) 9933 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9934 9935 // Check the function attributes and profiles to find out if this function 9936 // should be optimized for size. 9937 ScalarEpilogueLowering SEL = 9938 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); 9939 9940 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9941 // count by optimizing for size, to minimize overheads. 9942 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9943 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9944 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9945 << "This loop is worth vectorizing only if no scalar " 9946 << "iteration overheads are incurred."); 9947 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9948 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9949 else { 9950 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { 9951 LLVM_DEBUG(dbgs() << "\n"); 9952 // Predicate tail-folded loops are efficient even when the loop 9953 // iteration count is low. However, setting the epilogue policy to 9954 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops 9955 // with runtime checks. It's more effective to let 9956 // `areRuntimeChecksProfitable` determine if vectorization is beneficial 9957 // for the loop. 9958 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) 9959 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9960 } else { 9961 LLVM_DEBUG(dbgs() << " But the target considers the trip count too " 9962 "small to consider vectorizing.\n"); 9963 reportVectorizationFailure( 9964 "The trip count is below the minial threshold value.", 9965 "loop trip count is too low, avoiding vectorization", 9966 "LowTripCount", ORE, L); 9967 Hints.emitRemarkWithHints(); 9968 return false; 9969 } 9970 } 9971 } 9972 9973 // Check the function attributes to see if implicit floats or vectors are 9974 // allowed. 9975 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9976 reportVectorizationFailure( 9977 "Can't vectorize when the NoImplicitFloat attribute is used", 9978 "loop not vectorized due to NoImplicitFloat attribute", 9979 "NoImplicitFloat", ORE, L); 9980 Hints.emitRemarkWithHints(); 9981 return false; 9982 } 9983 9984 // Check if the target supports potentially unsafe FP vectorization. 9985 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9986 // for the target we're vectorizing for, to make sure none of the 9987 // additional fp-math flags can help. 9988 if (Hints.isPotentiallyUnsafe() && 9989 TTI->isFPVectorizationPotentiallyUnsafe()) { 9990 reportVectorizationFailure( 9991 "Potentially unsafe FP op prevents vectorization", 9992 "loop not vectorized due to unsafe FP support.", 9993 "UnsafeFP", ORE, L); 9994 Hints.emitRemarkWithHints(); 9995 return false; 9996 } 9997 9998 bool AllowOrderedReductions; 9999 // If the flag is set, use that instead and override the TTI behaviour. 10000 if (ForceOrderedReductions.getNumOccurrences() > 0) 10001 AllowOrderedReductions = ForceOrderedReductions; 10002 else 10003 AllowOrderedReductions = TTI->enableOrderedReductions(); 10004 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10005 ORE->emit([&]() { 10006 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10007 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10008 ExactFPMathInst->getDebugLoc(), 10009 ExactFPMathInst->getParent()) 10010 << "loop not vectorized: cannot prove it is safe to reorder " 10011 "floating-point operations"; 10012 }); 10013 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10014 "reorder floating-point operations\n"); 10015 Hints.emitRemarkWithHints(); 10016 return false; 10017 } 10018 10019 // Use the cost model. 10020 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10021 F, &Hints, IAI); 10022 // Use the planner for vectorization. 10023 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, 10024 ORE); 10025 10026 // Get user vectorization factor and interleave count. 10027 ElementCount UserVF = Hints.getWidth(); 10028 unsigned UserIC = Hints.getInterleave(); 10029 10030 // Plan how to best vectorize, return the best VF and its cost. 10031 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10032 10033 VectorizationFactor VF = VectorizationFactor::Disabled(); 10034 unsigned IC = 1; 10035 10036 bool AddBranchWeights = 10037 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 10038 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10039 F->getParent()->getDataLayout(), AddBranchWeights); 10040 if (MaybeVF) { 10041 VF = *MaybeVF; 10042 // Select the interleave count. 10043 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 10044 10045 unsigned SelectedIC = std::max(IC, UserIC); 10046 // Optimistically generate runtime checks if they are needed. Drop them if 10047 // they turn out to not be profitable. 10048 if (VF.Width.isVector() || SelectedIC > 1) 10049 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10050 10051 // Check if it is profitable to vectorize with runtime checks. 10052 bool ForceVectorization = 10053 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10054 if (!ForceVectorization && 10055 !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, 10056 *PSE.getSE(), SEL)) { 10057 ORE->emit([&]() { 10058 return OptimizationRemarkAnalysisAliasing( 10059 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10060 L->getHeader()) 10061 << "loop not vectorized: cannot prove it is safe to reorder " 10062 "memory operations"; 10063 }); 10064 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10065 Hints.emitRemarkWithHints(); 10066 return false; 10067 } 10068 } 10069 10070 // Identify the diagnostic messages that should be produced. 10071 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10072 bool VectorizeLoop = true, InterleaveLoop = true; 10073 if (VF.Width.isScalar()) { 10074 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10075 VecDiagMsg = std::make_pair( 10076 "VectorizationNotBeneficial", 10077 "the cost-model indicates that vectorization is not beneficial"); 10078 VectorizeLoop = false; 10079 } 10080 10081 if (!MaybeVF && UserIC > 1) { 10082 // Tell the user interleaving was avoided up-front, despite being explicitly 10083 // requested. 10084 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10085 "interleaving should be avoided up front\n"); 10086 IntDiagMsg = std::make_pair( 10087 "InterleavingAvoided", 10088 "Ignoring UserIC, because interleaving was avoided up front"); 10089 InterleaveLoop = false; 10090 } else if (IC == 1 && UserIC <= 1) { 10091 // Tell the user interleaving is not beneficial. 10092 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10093 IntDiagMsg = std::make_pair( 10094 "InterleavingNotBeneficial", 10095 "the cost-model indicates that interleaving is not beneficial"); 10096 InterleaveLoop = false; 10097 if (UserIC == 1) { 10098 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10099 IntDiagMsg.second += 10100 " and is explicitly disabled or interleave count is set to 1"; 10101 } 10102 } else if (IC > 1 && UserIC == 1) { 10103 // Tell the user interleaving is beneficial, but it explicitly disabled. 10104 LLVM_DEBUG( 10105 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10106 IntDiagMsg = std::make_pair( 10107 "InterleavingBeneficialButDisabled", 10108 "the cost-model indicates that interleaving is beneficial " 10109 "but is explicitly disabled or interleave count is set to 1"); 10110 InterleaveLoop = false; 10111 } 10112 10113 // Override IC if user provided an interleave count. 10114 IC = UserIC > 0 ? UserIC : IC; 10115 10116 // Emit diagnostic messages, if any. 10117 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10118 if (!VectorizeLoop && !InterleaveLoop) { 10119 // Do not vectorize or interleaving the loop. 10120 ORE->emit([&]() { 10121 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10122 L->getStartLoc(), L->getHeader()) 10123 << VecDiagMsg.second; 10124 }); 10125 ORE->emit([&]() { 10126 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10127 L->getStartLoc(), L->getHeader()) 10128 << IntDiagMsg.second; 10129 }); 10130 return false; 10131 } else if (!VectorizeLoop && InterleaveLoop) { 10132 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10133 ORE->emit([&]() { 10134 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10135 L->getStartLoc(), L->getHeader()) 10136 << VecDiagMsg.second; 10137 }); 10138 } else if (VectorizeLoop && !InterleaveLoop) { 10139 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10140 << ") in " << DebugLocStr << '\n'); 10141 ORE->emit([&]() { 10142 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10143 L->getStartLoc(), L->getHeader()) 10144 << IntDiagMsg.second; 10145 }); 10146 } else if (VectorizeLoop && InterleaveLoop) { 10147 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10148 << ") in " << DebugLocStr << '\n'); 10149 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10150 } 10151 10152 bool DisableRuntimeUnroll = false; 10153 MDNode *OrigLoopID = L->getLoopID(); 10154 { 10155 using namespace ore; 10156 if (!VectorizeLoop) { 10157 assert(IC > 1 && "interleave count should not be 1 or 0"); 10158 // If we decided that it is not legal to vectorize the loop, then 10159 // interleave it. 10160 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10161 &CM, BFI, PSI, Checks); 10162 10163 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10164 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10165 10166 ORE->emit([&]() { 10167 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10168 L->getHeader()) 10169 << "interleaved loop (interleaved count: " 10170 << NV("InterleaveCount", IC) << ")"; 10171 }); 10172 } else { 10173 // If we decided that it is *legal* to vectorize the loop, then do it. 10174 10175 // Consider vectorizing the epilogue too if it's profitable. 10176 VectorizationFactor EpilogueVF = 10177 LVP.selectEpilogueVectorizationFactor(VF.Width, IC); 10178 if (EpilogueVF.Width.isVector()) { 10179 10180 // The first pass vectorizes the main loop and creates a scalar epilogue 10181 // to be vectorized by executing the plan (potentially with a different 10182 // factor) again shortly afterwards. 10183 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10184 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10185 EPI, &LVL, &CM, BFI, PSI, Checks); 10186 10187 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10188 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, 10189 BestMainPlan, MainILV, DT, true); 10190 ++LoopsVectorized; 10191 10192 // Second pass vectorizes the epilogue and adjusts the control flow 10193 // edges from the first pass. 10194 EPI.MainLoopVF = EPI.EpilogueVF; 10195 EPI.MainLoopUF = EPI.EpilogueUF; 10196 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10197 ORE, EPI, &LVL, &CM, BFI, PSI, 10198 Checks); 10199 10200 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10201 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10202 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10203 Header->setName("vec.epilog.vector.body"); 10204 10205 // Re-use the trip count and steps expanded for the main loop, as 10206 // skeleton creation needs it as a value that dominates both the scalar 10207 // and vector epilogue loops 10208 // TODO: This is a workaround needed for epilogue vectorization and it 10209 // should be removed once induction resume value creation is done 10210 // directly in VPlan. 10211 EpilogILV.setTripCount(MainILV.getTripCount()); 10212 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { 10213 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R); 10214 auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn( 10215 ExpandedSCEVs.find(ExpandR->getSCEV())->second); 10216 ExpandR->replaceAllUsesWith(ExpandedVal); 10217 ExpandR->eraseFromParent(); 10218 } 10219 10220 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, 10221 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated 10222 // before vectorizing the epilogue loop. 10223 for (VPRecipeBase &R : Header->phis()) { 10224 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10225 continue; 10226 10227 Value *ResumeV = nullptr; 10228 // TODO: Move setting of resume values to prepareToExecute. 10229 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10230 ResumeV = MainILV.getReductionResumeValue( 10231 ReductionPhi->getRecurrenceDescriptor()); 10232 } else { 10233 // Create induction resume values for both widened pointer and 10234 // integer/fp inductions and update the start value of the induction 10235 // recipes to use the resume value. 10236 PHINode *IndPhi = nullptr; 10237 const InductionDescriptor *ID; 10238 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) { 10239 IndPhi = cast<PHINode>(Ind->getUnderlyingValue()); 10240 ID = &Ind->getInductionDescriptor(); 10241 } else { 10242 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R); 10243 IndPhi = WidenInd->getPHINode(); 10244 ID = &WidenInd->getInductionDescriptor(); 10245 } 10246 10247 ResumeV = MainILV.createInductionResumeValue( 10248 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), 10249 {EPI.MainLoopIterationCountCheck}); 10250 } 10251 assert(ResumeV && "Must have a resume value"); 10252 VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV); 10253 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10254 } 10255 10256 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10257 DT, true, &ExpandedSCEVs); 10258 ++LoopsEpilogueVectorized; 10259 10260 if (!MainILV.areSafetyChecksAdded()) 10261 DisableRuntimeUnroll = true; 10262 } else { 10263 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10264 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10265 PSI, Checks); 10266 10267 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10268 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10269 ++LoopsVectorized; 10270 10271 // Add metadata to disable runtime unrolling a scalar loop when there 10272 // are no runtime checks about strides and memory. A scalar loop that is 10273 // rarely used is not worth unrolling. 10274 if (!LB.areSafetyChecksAdded()) 10275 DisableRuntimeUnroll = true; 10276 } 10277 // Report the vectorization decision. 10278 reportVectorization(ORE, L, VF, IC); 10279 } 10280 10281 if (ORE->allowExtraAnalysis(LV_NAME)) 10282 checkMixedPrecision(L, ORE); 10283 } 10284 10285 std::optional<MDNode *> RemainderLoopID = 10286 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10287 LLVMLoopVectorizeFollowupEpilogue}); 10288 if (RemainderLoopID) { 10289 L->setLoopID(*RemainderLoopID); 10290 } else { 10291 if (DisableRuntimeUnroll) 10292 AddRuntimeUnrollDisableMetaData(L); 10293 10294 // Mark the loop as already vectorized to avoid vectorizing again. 10295 Hints.setAlreadyVectorized(); 10296 } 10297 10298 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10299 return true; 10300 } 10301 10302 LoopVectorizeResult LoopVectorizePass::runImpl( 10303 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10304 DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, 10305 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, 10306 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10307 SE = &SE_; 10308 LI = &LI_; 10309 TTI = &TTI_; 10310 DT = &DT_; 10311 BFI = BFI_; 10312 TLI = TLI_; 10313 AC = &AC_; 10314 LAIs = &LAIs_; 10315 DB = &DB_; 10316 ORE = &ORE_; 10317 PSI = PSI_; 10318 10319 // Don't attempt if 10320 // 1. the target claims to have no vector registers, and 10321 // 2. interleaving won't help ILP. 10322 // 10323 // The second condition is necessary because, even if the target has no 10324 // vector registers, loop vectorization may still enable scalar 10325 // interleaving. 10326 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10327 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2) 10328 return LoopVectorizeResult(false, false); 10329 10330 bool Changed = false, CFGChanged = false; 10331 10332 // The vectorizer requires loops to be in simplified form. 10333 // Since simplification may add new inner loops, it has to run before the 10334 // legality and profitability checks. This means running the loop vectorizer 10335 // will simplify all loops, regardless of whether anything end up being 10336 // vectorized. 10337 for (const auto &L : *LI) 10338 Changed |= CFGChanged |= 10339 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10340 10341 // Build up a worklist of inner-loops to vectorize. This is necessary as 10342 // the act of vectorizing or partially unrolling a loop creates new loops 10343 // and can invalidate iterators across the loops. 10344 SmallVector<Loop *, 8> Worklist; 10345 10346 for (Loop *L : *LI) 10347 collectSupportedLoops(*L, LI, ORE, Worklist); 10348 10349 LoopsAnalyzed += Worklist.size(); 10350 10351 // Now walk the identified inner loops. 10352 while (!Worklist.empty()) { 10353 Loop *L = Worklist.pop_back_val(); 10354 10355 // For the inner loops we actually process, form LCSSA to simplify the 10356 // transform. 10357 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10358 10359 Changed |= CFGChanged |= processLoop(L); 10360 10361 if (Changed) { 10362 LAIs->clear(); 10363 10364 #ifndef NDEBUG 10365 if (VerifySCEV) 10366 SE->verify(); 10367 #endif 10368 } 10369 } 10370 10371 // Process each loop nest in the function. 10372 return LoopVectorizeResult(Changed, CFGChanged); 10373 } 10374 10375 PreservedAnalyses LoopVectorizePass::run(Function &F, 10376 FunctionAnalysisManager &AM) { 10377 auto &LI = AM.getResult<LoopAnalysis>(F); 10378 // There are no loops in the function. Return before computing other expensive 10379 // analyses. 10380 if (LI.empty()) 10381 return PreservedAnalyses::all(); 10382 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10383 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10384 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10385 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10386 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10387 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10388 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10389 10390 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F); 10391 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10392 ProfileSummaryInfo *PSI = 10393 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10394 BlockFrequencyInfo *BFI = nullptr; 10395 if (PSI && PSI->hasProfileSummary()) 10396 BFI = &AM.getResult<BlockFrequencyAnalysis>(F); 10397 LoopVectorizeResult Result = 10398 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI); 10399 if (!Result.MadeAnyChange) 10400 return PreservedAnalyses::all(); 10401 PreservedAnalyses PA; 10402 10403 if (isAssignmentTrackingEnabled(*F.getParent())) { 10404 for (auto &BB : F) 10405 RemoveRedundantDbgInstrs(&BB); 10406 } 10407 10408 // We currently do not preserve loopinfo/dominator analyses with outer loop 10409 // vectorization. Until this is addressed, mark these analyses as preserved 10410 // only for non-VPlan-native path. 10411 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10412 if (!EnableVPlanNativePath) { 10413 PA.preserve<LoopAnalysis>(); 10414 PA.preserve<DominatorTreeAnalysis>(); 10415 PA.preserve<ScalarEvolutionAnalysis>(); 10416 } 10417 10418 if (Result.MadeCFGChange) { 10419 // Making CFG changes likely means a loop got vectorized. Indicate that 10420 // extra simplification passes should be run. 10421 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only 10422 // be run if runtime checks have been added. 10423 AM.getResult<ShouldRunExtraVectorPasses>(F); 10424 PA.preserve<ShouldRunExtraVectorPasses>(); 10425 } else { 10426 PA.preserveSet<CFGAnalyses>(); 10427 } 10428 return PA; 10429 } 10430 10431 void LoopVectorizePass::printPipeline( 10432 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10433 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10434 OS, MapClassName2PassName); 10435 10436 OS << '<'; 10437 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10438 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10439 OS << '>'; 10440 } 10441