//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one. A small example of the
// transformation is sketched at the end of this header.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
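//
// As a rough illustration of the transformation described above (a
// hypothetical sketch, not code taken from or generated by this pass), with a
// vectorization factor of 4 a loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten into a wide loop whose single iteration covers
// four consecutive scalar iterations (emitted as <4 x ...> vector operations
// in the generated IR), followed by a scalar epilogue for the remainder:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4) {
//     a[i]     = b[i]     + c[i];      // these four adds become a single
//     a[i + 1] = b[i + 1] + c[i + 1];  // vector add in the widened loop
//     a[i + 2] = b[i + 2] + c[i + 2];
//     a[i + 3] = b[i + 3] + c[i + 3];
//   }
//   for (; i < n; ++i)                 // scalar epilogue
//     a[i] = b[i] + c[i];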
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
                   "data-and-control-without-rt-check",
                   "Similar to data-and-control, but remove the runtime check"),
        clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
                   "Use predicated EVL instructions for tail folding. If EVL "
                   "is unsupported, fallback to data-without-lane-mask.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
} // namespace llvm

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

static cl::opt<bool> EnableEarlyExitVectorization(
    "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
    cl::desc(
        "Enable vectorization of early exit loops with uncountable exits."));

// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips
// left after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned>
getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
                    bool CanUseConstantMax = true) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  if (!CanUseConstantMax)
    return std::nullopt;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
    return ExpectedTC;

  return std::nullopt;
}

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
                      VPlan &Plan)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth),
        MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks), Plan(Plan),
        VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop. In the case of epilogue vectorization, this function is overridden
  /// to handle the more complex control flow around the loops. \p ExpandedSCEVs
  /// is used to look up SCEV expansions for expressions needed during skeleton
  /// creation.
  virtual BasicBlock *
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);

  /// Fix the vectorized code, taking care of header phi's, and more.
  void fixVectorizedLoop(VPTransformState &State);

  /// Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe, const VPLane &Lane,
                            VPTransformState &State);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

  /// Retrieve the additional bypass value associated with an original
  /// induction header phi.
  Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
    return Induction2AdditionalBypassValue.at(OrigPhi);
  }

  /// Return the additional bypass block which targets the scalar loop by
  /// skipping the epilogue loop after completing the main loop.
  BasicBlock *getAdditionalBypassBlock() const {
    assert(AdditionalBypassBlock &&
           "Trying to access AdditionalBypassBlock but it has not been set");
    return AdditionalBypassBlock;
  }

protected:
  friend class LoopVectorizationPlanner;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                            Value *VectorTripCount, BasicBlock *MiddleBlock,
                            VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create and record the values for induction variables to resume coming
  /// from the additional bypass block.
  void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
                                             Value *MainVectorTripCount);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
  /// vector preheader and its predecessor, also connecting the new block to
  /// the scalar preheader.
  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  /// Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  /// Whether this loop should be optimized for size based on profile guided
  /// size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  /// Mapping of induction phis to their additional bypass values. They
  /// need to be added as operands to phi nodes in the scalar loop preheader
  /// after the epilogue skeleton has been created.
  DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;

  /// The additional bypass block which conditionally skips over the epilogue
  /// loop after executing the main loop. Needed to resume inductions and
  /// reductions during epilogue vectorization.
  BasicBlock *AdditionalBypassBlock = nullptr;

  VPlan &Plan;

  /// The vector preheader block of \p Plan, used as target for check blocks
  /// introduced during skeleton creation.
  VPBlockBase *VectorPHVPB;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;
  VPlan &EpiloguePlan;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF,
                                VPlan &EpiloguePlan)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
        EpiloguePlan(EpiloguePlan) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks, Plan),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check, Plan) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (ie the first pass of vplan execution).
  BasicBlock *
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;

  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, BasicBlock *MiddleBlock,
                    VPTransformState &State) override {};
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks, Plan) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  BasicBlock *
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its operands.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return DebugLoc();

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I->getDebugLoc();

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst->getDebugLoc();
  }

  return I->getDebugLoc();
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. If \p DL is passed, use it as debug location
/// for the remark. \return the remark object that can be streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I, DebugLoc DL = {}) {
  Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
  // If debug location is attached to the instruction, use it. Otherwise if DL
  // was not provided, use the loop's.
  if (I && I->getDebugLoc())
    DL = I->getDebugLoc();
  else if (!DL)
    DL = TheLoop->getStartLoc();

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr,
                                    DebugLoc DL = {}) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
                             I, DL)
            << Msg);
}

/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}

} // end namespace llvm

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

using InstructionVFPair = std::pair<Instruction *, ElementCount>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
  friend class LoopVectorizationPlanner;

public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(I))
      return false;

    if (VF.isScalar())
      return true;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    if (VF.isScalar())
      return true;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.contains(I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize,
    CM_VectorCall,
    CM_IntrinsicCall
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// When interleaving, the cost will only be assigned to one instruction,
    /// the insert position. For other cases, add the appropriate fraction of
    /// the total cost to each instruction. This ensures accurate costs are
    /// used, even if the insert position instruction is not used.
    InstructionCost InsertPosCost = Cost;
    InstructionCost OtherMemberCost = 0;
    if (W != CM_Interleave)
      OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
    for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
      if (auto *I = Grp->getMember(Idx)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] =
              std::make_pair(W, InsertPosCost);
        else
          WideningDecisions[std::make_pair(I, VF)] =
              std::make_pair(W, OtherMemberCost);
      }
    }
  }
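
  // A hypothetical worked example of the cost distribution performed by
  // setWideningDecision above (numbers invented for illustration, not taken
  // from any target): for a group with 4 members and a total cost of 12, a
  // CM_Interleave decision records cost 12 on the insert-position member and
  // cost 0 on the remaining members; for any other decision, every member,
  // including the insert position, is recorded with cost 12 / 4 = 3.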

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  struct CallWideningDecision {
    InstWidening Kind;
    Function *Variant;
    Intrinsic::ID IID;
    std::optional<unsigned> MaskPos;
    InstructionCost Cost;
  };

  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
                                                     MaskPos, Cost};
  }

  CallWideningDecision getCallWideningDecision(CallInst *CI,
                                               ElementCount VF) const {
    assert(!VF.isScalar() && "Expected vector VF");
    return CallWideningDecisions.at(std::make_pair(CI, VF));
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.contains(VF))
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    setVectorizedCallDecision(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    if (VF.isVector())
      Ty = VectorType::get(Ty, VF);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Given costs for both strategies, return true if the scalar predication
  /// lowering should be used for div/rem. This incorporates an override
  /// option so it is not simply a cost comparison.
  bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
                                     InstructionCost SafeDivisorCost) const {
    switch (ForceSafeDivisor) {
    case cl::BOU_UNSET:
      return ScalarCost < SafeDivisorCost;
    case cl::BOU_TRUE:
      return false;
    case cl::BOU_FALSE:
      return true;
    }
    llvm_unreachable("impossible case value");
  }

  /// Returns true if \p I is an instruction which requires predication and
  /// for which our chosen predication strategy is scalarization (i.e. we
  /// don't have an alternate strategy such as masking available).
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  bool isScalarWithPredication(Instruction *I, ElementCount VF) const;

  /// Returns true if \p I is an instruction that needs to be predicated
  /// at runtime. The result is independent of the predication mechanism.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) const;

  /// Return the costs for our two available strategies for lowering a
  /// div/rem operation which requires speculating at least one lane.
  /// First result is for scalarization (will be invalid for scalable
  /// vectors); second is for the safe-divisor strategy.
1346 std::pair<InstructionCost, InstructionCost> 1347 getDivRemSpeculationCost(Instruction *I, 1348 ElementCount VF) const; 1349 1350 /// Returns true if \p I is a memory instruction with consecutive memory 1351 /// access that can be widened. 1352 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1353 1354 /// Returns true if \p I is a memory instruction in an interleaved-group 1355 /// of memory accesses that can be vectorized with wide vector loads/stores 1356 /// and shuffles. 1357 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const; 1358 1359 /// Check if \p Instr belongs to any interleaved access group. 1360 bool isAccessInterleaved(Instruction *Instr) const { 1361 return InterleaveInfo.isInterleaved(Instr); 1362 } 1363 1364 /// Get the interleaved access group that \p Instr belongs to. 1365 const InterleaveGroup<Instruction> * 1366 getInterleavedAccessGroup(Instruction *Instr) const { 1367 return InterleaveInfo.getInterleaveGroup(Instr); 1368 } 1369 1370 /// Returns true if we're required to use a scalar epilogue for at least 1371 /// the final iteration of the original loop. 1372 bool requiresScalarEpilogue(bool IsVectorizing) const { 1373 if (!isScalarEpilogueAllowed()) { 1374 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); 1375 return false; 1376 } 1377 // If we might exit from anywhere but the latch and early exit vectorization 1378 // is disabled, we must run the exiting iteration in scalar form. 1379 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && 1380 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) { 1381 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting " 1382 "from latch block\n"); 1383 return true; 1384 } 1385 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { 1386 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " 1387 "interleaved group requires scalar epilogue\n"); 1388 return true; 1389 } 1390 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); 1391 return false; 1392 } 1393 1394 /// Returns true if we're required to use a scalar epilogue for at least 1395 /// the final iteration of the original loop for all VFs in \p Range. 1396 /// A scalar epilogue must either be required for all VFs in \p Range or for 1397 /// none. 1398 bool requiresScalarEpilogue(VFRange Range) const { 1399 auto RequiresScalarEpilogue = [this](ElementCount VF) { 1400 return requiresScalarEpilogue(VF.isVector()); 1401 }; 1402 bool IsRequired = all_of(Range, RequiresScalarEpilogue); 1403 assert( 1404 (IsRequired || none_of(Range, RequiresScalarEpilogue)) && 1405 "all VFs in range must agree on whether a scalar epilogue is required"); 1406 return IsRequired; 1407 } 1408 1409 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1410 /// loop hint annotation. 1411 bool isScalarEpilogueAllowed() const { 1412 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1413 } 1414 1415 /// Returns the TailFoldingStyle that is best for the current loop. 1416 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { 1417 if (!ChosenTailFoldingStyle) 1418 return TailFoldingStyle::None; 1419 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first 1420 : ChosenTailFoldingStyle->second; 1421 } 1422 1423 /// Selects and saves TailFoldingStyle for 2 options - if IV update may 1424 /// overflow or not. 1425 /// \param IsScalableVF true if scalable vector factors enabled. 
1426 /// \param UserIC User specific interleave count.
1427 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1428 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1429 if (!Legal->canFoldTailByMasking()) {
1430 ChosenTailFoldingStyle =
1431 std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
1432 return;
1433 }
1434
1435 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1436 ChosenTailFoldingStyle = std::make_pair(
1437 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1438 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1439 return;
1440 }
1441
1442 // Set styles when forced.
1443 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1444 ForceTailFoldingStyle.getValue());
1445 if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1446 return;
1447 // Override forced styles if needed.
1448 // FIXME: use actual opcode/data type for analysis here.
1449 // FIXME: Investigate opportunity for fixed vector factor.
1450 bool EVLIsLegal =
1451 UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1452 !EnableVPlanNativePath &&
1453 // FIXME: remove this once fixed-order recurrences are supported.
1454 Legal->getFixedOrderRecurrences().empty();
1455 if (!EVLIsLegal) {
1456 // If for some reason EVL mode is unsupported, fall back to
1457 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1458 // in a generic way.
1459 ChosenTailFoldingStyle =
1460 std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
1461 TailFoldingStyle::DataWithoutLaneMask);
1462 LLVM_DEBUG(
1463 dbgs()
1464 << "LV: Preference for VP intrinsics indicated. Will "
1465 "not try to generate VP Intrinsics "
1466 << (UserIC > 1
1467 ? "since interleave count specified is greater than 1.\n"
1468 : "due to non-interleaving reasons.\n"));
1469 }
1470 }
1471
1472 /// Returns true if all loop blocks should be masked to fold the tail loop.
1473 bool foldTailByMasking() const {
1474 // TODO: check if it is possible to check for None style independent of
1475 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1476 return getTailFoldingStyle() != TailFoldingStyle::None;
1477 }
1478
1479 /// Return maximum safe number of elements to be processed per vector
1480 /// iteration, which do not prevent store-load forwarding and are safe with
1481 /// regard to the memory dependencies. Required for EVL-based VPlans to
1482 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1483 /// MaxSafeElements).
1484 /// TODO: need to consider adjusting cost model to use this value as a
1485 /// vectorization factor for EVL-based vectorization.
1486 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1487
1488 /// Returns true if the instructions in this block require predication
1489 /// for any reason, e.g. because tail folding now requires a predicate
1490 /// or because the block in the original loop was predicated.
1491 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1492 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1493 }
1494
1495 /// Returns true if VP intrinsics with explicit vector length support should
1496 /// be generated in the tail folded loop.
1497 bool foldTailWithEVL() const {
1498 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1499 }
1500
1501 /// Returns true if the Phi is part of an inloop reduction.
1502 bool isInLoopReduction(PHINode *Phi) const {
1503 return InLoopReductions.contains(Phi);
1504 }
1505
1506 /// Returns true if the predicated reduction select should be used to set the
1507 /// incoming value for the reduction phi.
1508 bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1509 // Force to use predicated reduction select since the EVL of the
1510 // second-to-last iteration might not be VF*UF.
1511 if (foldTailWithEVL())
1512 return true;
1513 return PreferPredicatedReductionSelect ||
1514 TTI.preferPredicatedReductionSelect(
1515 Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1516 }
1517
1518 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1519 /// with factor VF. Return the cost of the instruction, including
1520 /// scalarization overhead if it's needed.
1521 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1522
1523 /// Estimate cost of a call instruction CI if it were vectorized with factor
1524 /// VF. Return the cost of the instruction, including scalarization overhead
1525 /// if it's needed.
1526 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1527
1528 /// Invalidates decisions already taken by the cost model.
1529 void invalidateCostModelingDecisions() {
1530 WideningDecisions.clear();
1531 CallWideningDecisions.clear();
1532 Uniforms.clear();
1533 Scalars.clear();
1534 }
1535
1536 /// Returns the expected execution cost. The unit of the cost does
1537 /// not matter because we use the 'cost' units to compare different
1538 /// vector widths. The cost that is returned is *not* normalized by
1539 /// the factor width.
1540 InstructionCost expectedCost(ElementCount VF);
1541
1542 bool hasPredStores() const { return NumPredStores > 0; }
1543
1544 /// Returns true if epilogue vectorization is considered profitable, and
1545 /// false otherwise.
1546 /// \p VF is the vectorization factor chosen for the original loop.
1547 /// \p IC is an additional scaling factor applied to VF before
1548 /// comparing to EpilogueVectorizationMinVF.
1549 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1550 const unsigned IC) const;
1551
1552 /// Returns the execution time cost of an instruction for a given vector
1553 /// width. Vector width of one means scalar.
1554 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1555
1556 /// Return the cost of instructions in an inloop reduction pattern, if I is
1557 /// part of that pattern.
1558 std::optional<InstructionCost>
1559 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1560 TTI::TargetCostKind CostKind) const;
1561
1562 /// Returns true if \p Op should be considered invariant and if it is
1563 /// trivially hoistable.
1564 bool shouldConsiderInvariant(Value *Op);
1565
1566 private:
1567 unsigned NumPredStores = 0;
1568
1569 /// \return An upper bound for the vectorization factors for both
1570 /// fixed and scalable vectorization, where the minimum-known number of
1571 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1572 /// disabled or unsupported, then the scalable part will be equal to
1573 /// ElementCount::getScalable(0).
1574 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1575 ElementCount UserVF,
1576 bool FoldTailByMasking);
1577
1578 /// \return the maximized element count based on the target's vector
1579 /// registers and the loop trip-count, but limited to a maximum safe VF.
1580 /// This is a helper function of computeFeasibleMaxVF.
1581 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1582 unsigned SmallestType,
1583 unsigned WidestType,
1584 ElementCount MaxSafeVF,
1585 bool FoldTailByMasking);
1586
1587 /// Checks if scalable vectorization is supported and enabled. Caches the
1588 /// result to avoid repeated debug dumps for repeated queries.
1589 bool isScalableVectorizationAllowed();
1590
1591 /// \return the maximum legal scalable VF, based on the safe max number
1592 /// of elements.
1593 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1594
1595 /// Calculate vectorization cost of memory instruction \p I.
1596 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1597
1598 /// The cost computation for a scalarized memory instruction.
1599 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1600
1601 /// The cost computation for an interleaving group of memory instructions.
1602 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1603
1604 /// The cost computation for a Gather/Scatter instruction.
1605 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1606
1607 /// The cost computation for widening instruction \p I with consecutive
1608 /// memory access.
1609 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1610
1611 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1612 /// Load: scalar load + broadcast.
1613 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1614 /// element)
1615 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1616
1617 /// Estimate the overhead of scalarizing an instruction. This is a
1618 /// convenience wrapper for the type-based getScalarizationOverhead API.
1619 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1620 TTI::TargetCostKind CostKind) const;
1621
1622 /// Returns true if an artificially high cost for emulated masked memrefs
1623 /// should be used.
1624 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1625
1626 /// Map of scalar integer values to the smallest bitwidth they can be legally
1627 /// represented as. The vector equivalents of these values should be truncated
1628 /// to this type.
1629 MapVector<Instruction *, uint64_t> MinBWs;
1630
1631 /// A type representing the costs for instructions if they were to be
1632 /// scalarized rather than vectorized. The entries are Instruction-Cost
1633 /// pairs.
1634 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1635
1636 /// A set containing all BasicBlocks that are known to be present after
1637 /// vectorization as a predicated block.
1638 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1639 PredicatedBBsAfterVectorization;
1640
1641 /// Records whether it is allowed to have the original scalar loop execute at
1642 /// least once. This may be needed as a fallback loop in case runtime
1643 /// aliasing/dependence checks fail, or to handle the tail/remainder
1644 /// iterations when the trip count is unknown or is not a multiple of the VF,
1645 /// or as a peel-loop to handle gaps in interleave-groups.
1646 /// Under optsize and when the trip count is very small we don't allow any
1647 /// iterations to execute in the scalar loop.
1648 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1649
1650 /// Controls the finally chosen tail folding style. The first element is used
1651 /// if the IV update may overflow, the second element if it does not.
1652 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1653 ChosenTailFoldingStyle;
1654
1655 /// True if scalable vectorization is supported and enabled.
1656 std::optional<bool> IsScalableVectorizationAllowed;
1657
1658 /// Maximum safe number of elements to be processed per vector iteration,
1659 /// which do not prevent store-load forwarding and are safe with regard to the
1660 /// memory dependencies. Required for EVL-based vectorization, where this
1661 /// value is used as the upper bound of the safe AVL.
1662 std::optional<unsigned> MaxSafeElements;
1663
1664 /// A map holding scalar costs for different vectorization factors. The
1665 /// presence of a cost for an instruction in the mapping indicates that the
1666 /// instruction will be scalarized when vectorizing with the associated
1667 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1668 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1669
1670 /// Holds the instructions known to be uniform after vectorization.
1671 /// The data is collected per VF.
1672 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1673
1674 /// Holds the instructions known to be scalar after vectorization.
1675 /// The data is collected per VF.
1676 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1677
1678 /// Holds the instructions (address computations) that are forced to be
1679 /// scalarized.
1680 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1681
1682 /// PHINodes of the reductions that should be expanded in-loop.
1683 SmallPtrSet<PHINode *, 4> InLoopReductions;
1684
1685 /// A Map of inloop reduction operations and their immediate chain operand.
1686 /// FIXME: This can be removed once reductions can be costed correctly in
1687 /// VPlan. This was added to allow quick lookup of the inloop operations.
1688 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1689
1690 /// Returns the expected difference in cost from scalarizing the expression
1691 /// feeding a predicated instruction \p PredInst. The instructions to
1692 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1693 /// non-negative return value implies the expression will be scalarized.
1694 /// Currently, only single-use chains are considered for scalarization.
1695 InstructionCost computePredInstDiscount(Instruction *PredInst,
1696 ScalarCostsTy &ScalarCosts,
1697 ElementCount VF);
1698
1699 /// Collect the instructions that are uniform after vectorization. An
1700 /// instruction is uniform if we represent it with a single scalar value in
1701 /// the vectorized loop corresponding to each vector iteration. Examples of
1702 /// uniform instructions include pointer operands of consecutive or
1703 /// interleaved memory accesses. Note that although uniformity implies an
1704 /// instruction will be scalar, the reverse is not true. In general, a
1705 /// scalarized instruction will be represented by VF scalar values in the
1706 /// vectorized loop, each corresponding to an iteration of the original
1707 /// scalar loop.
1708 void collectLoopUniforms(ElementCount VF);
1709
1710 /// Collect the instructions that are scalar after vectorization. An
1711 /// instruction is scalar if it is known to be uniform or will be scalarized
1712 /// during vectorization.
collectLoopScalars should only add non-uniform nodes 1713 /// to the list if they are used by a load/store instruction that is marked as 1714 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1715 /// VF values in the vectorized loop, each corresponding to an iteration of 1716 /// the original scalar loop. 1717 void collectLoopScalars(ElementCount VF); 1718 1719 /// Keeps cost model vectorization decision and cost for instructions. 1720 /// Right now it is used for memory instructions only. 1721 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1722 std::pair<InstWidening, InstructionCost>>; 1723 1724 DecisionList WideningDecisions; 1725 1726 using CallDecisionList = 1727 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; 1728 1729 CallDecisionList CallWideningDecisions; 1730 1731 /// Returns true if \p V is expected to be vectorized and it needs to be 1732 /// extracted. 1733 bool needsExtract(Value *V, ElementCount VF) const { 1734 Instruction *I = dyn_cast<Instruction>(V); 1735 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1736 TheLoop->isLoopInvariant(I) || 1737 getWideningDecision(I, VF) == CM_Scalarize) 1738 return false; 1739 1740 // Assume we can vectorize V (and hence we need extraction) if the 1741 // scalars are not computed yet. This can happen, because it is called 1742 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1743 // the scalars are collected. That should be a safe assumption in most 1744 // cases, because we check if the operands have vectorizable types 1745 // beforehand in LoopVectorizationLegality. 1746 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); 1747 }; 1748 1749 /// Returns a range containing only operands needing to be extracted. 1750 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1751 ElementCount VF) const { 1752 return SmallVector<Value *, 4>(make_filter_range( 1753 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1754 } 1755 1756 public: 1757 /// The loop that we evaluate. 1758 Loop *TheLoop; 1759 1760 /// Predicated scalar evolution analysis. 1761 PredicatedScalarEvolution &PSE; 1762 1763 /// Loop Info analysis. 1764 LoopInfo *LI; 1765 1766 /// Vectorization legality. 1767 LoopVectorizationLegality *Legal; 1768 1769 /// Vector target information. 1770 const TargetTransformInfo &TTI; 1771 1772 /// Target Library Info. 1773 const TargetLibraryInfo *TLI; 1774 1775 /// Demanded bits analysis. 1776 DemandedBits *DB; 1777 1778 /// Assumption cache. 1779 AssumptionCache *AC; 1780 1781 /// Interface to emit optimization remarks. 1782 OptimizationRemarkEmitter *ORE; 1783 1784 const Function *TheFunction; 1785 1786 /// Loop Vectorize Hint. 1787 const LoopVectorizeHints *Hints; 1788 1789 /// The interleave access information contains groups of interleaved accesses 1790 /// with the same stride and close to each other. 1791 InterleavedAccessInfo &InterleaveInfo; 1792 1793 /// Values to ignore in the cost model. 1794 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1795 1796 /// Values to ignore in the cost model when VF > 1. 1797 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1798 1799 /// All element types found in the loop. 1800 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1801 }; 1802 } // end namespace llvm 1803 1804 namespace { 1805 /// Helper struct to manage generating runtime checks for vectorization. 
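/// These comprise the SCEV predicate checks and the memory aliasing (overlap)
/// checks described below.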
1806 /// 1807 /// The runtime checks are created up-front in temporary blocks to allow better 1808 /// estimating the cost and un-linked from the existing IR. After deciding to 1809 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1810 /// temporary blocks are completely removed. 1811 class GeneratedRTChecks { 1812 /// Basic block which contains the generated SCEV checks, if any. 1813 BasicBlock *SCEVCheckBlock = nullptr; 1814 1815 /// The value representing the result of the generated SCEV checks. If it is 1816 /// nullptr, either no SCEV checks have been generated or they have been used. 1817 Value *SCEVCheckCond = nullptr; 1818 1819 /// Basic block which contains the generated memory runtime checks, if any. 1820 BasicBlock *MemCheckBlock = nullptr; 1821 1822 /// The value representing the result of the generated memory runtime checks. 1823 /// If it is nullptr, either no memory runtime checks have been generated or 1824 /// they have been used. 1825 Value *MemRuntimeCheckCond = nullptr; 1826 1827 DominatorTree *DT; 1828 LoopInfo *LI; 1829 TargetTransformInfo *TTI; 1830 1831 SCEVExpander SCEVExp; 1832 SCEVExpander MemCheckExp; 1833 1834 bool CostTooHigh = false; 1835 const bool AddBranchWeights; 1836 1837 Loop *OuterLoop = nullptr; 1838 1839 PredicatedScalarEvolution &PSE; 1840 1841 public: 1842 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, 1843 LoopInfo *LI, TargetTransformInfo *TTI, 1844 const DataLayout &DL, bool AddBranchWeights) 1845 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), 1846 MemCheckExp(*PSE.getSE(), DL, "scev.check"), 1847 AddBranchWeights(AddBranchWeights), PSE(PSE) {} 1848 1849 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1850 /// accurately estimate the cost of the runtime checks. The blocks are 1851 /// un-linked from the IR and are added back during vector code generation. If 1852 /// there is no vector code generation, the check blocks are removed 1853 /// completely. 1854 void create(Loop *L, const LoopAccessInfo &LAI, 1855 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1856 1857 // Hard cutoff to limit compile-time increase in case a very large number of 1858 // runtime checks needs to be generated. 1859 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1860 // profile info. 1861 CostTooHigh = 1862 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1863 if (CostTooHigh) 1864 return; 1865 1866 BasicBlock *LoopHeader = L->getHeader(); 1867 BasicBlock *Preheader = L->getLoopPreheader(); 1868 1869 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1870 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1871 // may be used by SCEVExpander. The blocks will be un-linked from their 1872 // predecessors and removed from LI & DT at the end of the function. 1873 if (!UnionPred.isAlwaysTrue()) { 1874 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1875 nullptr, "vector.scevcheck"); 1876 1877 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1878 &UnionPred, SCEVCheckBlock->getTerminator()); 1879 } 1880 1881 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1882 if (RtPtrChecking.Need) { 1883 auto *Pred = SCEVCheckBlock ? 
SCEVCheckBlock : Preheader; 1884 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1885 "vector.memcheck"); 1886 1887 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1888 if (DiffChecks) { 1889 Value *RuntimeVF = nullptr; 1890 MemRuntimeCheckCond = addDiffRuntimeChecks( 1891 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 1892 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 1893 if (!RuntimeVF) 1894 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 1895 return RuntimeVF; 1896 }, 1897 IC); 1898 } else { 1899 MemRuntimeCheckCond = addRuntimeChecks( 1900 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), 1901 MemCheckExp, VectorizerParams::HoistRuntimeChecks); 1902 } 1903 assert(MemRuntimeCheckCond && 1904 "no RT checks generated although RtPtrChecking " 1905 "claimed checks are required"); 1906 } 1907 1908 if (!MemCheckBlock && !SCEVCheckBlock) 1909 return; 1910 1911 // Unhook the temporary block with the checks, update various places 1912 // accordingly. 1913 if (SCEVCheckBlock) 1914 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1915 if (MemCheckBlock) 1916 MemCheckBlock->replaceAllUsesWith(Preheader); 1917 1918 if (SCEVCheckBlock) { 1919 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1920 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1921 Preheader->getTerminator()->eraseFromParent(); 1922 } 1923 if (MemCheckBlock) { 1924 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1925 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1926 Preheader->getTerminator()->eraseFromParent(); 1927 } 1928 1929 DT->changeImmediateDominator(LoopHeader, Preheader); 1930 if (MemCheckBlock) { 1931 DT->eraseNode(MemCheckBlock); 1932 LI->removeBlock(MemCheckBlock); 1933 } 1934 if (SCEVCheckBlock) { 1935 DT->eraseNode(SCEVCheckBlock); 1936 LI->removeBlock(SCEVCheckBlock); 1937 } 1938 1939 // Outer loop is used as part of the later cost calculations. 1940 OuterLoop = L->getParentLoop(); 1941 } 1942 1943 InstructionCost getCost() { 1944 if (SCEVCheckBlock || MemCheckBlock) 1945 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 1946 1947 if (CostTooHigh) { 1948 InstructionCost Cost; 1949 Cost.setInvalid(); 1950 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 1951 return Cost; 1952 } 1953 1954 InstructionCost RTCheckCost = 0; 1955 if (SCEVCheckBlock) 1956 for (Instruction &I : *SCEVCheckBlock) { 1957 if (SCEVCheckBlock->getTerminator() == &I) 1958 continue; 1959 InstructionCost C = 1960 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1961 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1962 RTCheckCost += C; 1963 } 1964 if (MemCheckBlock) { 1965 InstructionCost MemCheckCost = 0; 1966 for (Instruction &I : *MemCheckBlock) { 1967 if (MemCheckBlock->getTerminator() == &I) 1968 continue; 1969 InstructionCost C = 1970 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1971 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1972 MemCheckCost += C; 1973 } 1974 1975 // If the runtime memory checks are being created inside an outer loop 1976 // we should find out if these checks are outer loop invariant. If so, 1977 // the checks will likely be hoisted out and so the effective cost will 1978 // reduce according to the outer loop trip count. 
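// For example, if MemCheckCost is 20 and the outer loop's best known trip
// count is 10, the cost charged below becomes 20 / 10 = 2 (clamped to a
// minimum of 1).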
1979 if (OuterLoop) { 1980 ScalarEvolution *SE = MemCheckExp.getSE(); 1981 // TODO: If profitable, we could refine this further by analysing every 1982 // individual memory check, since there could be a mixture of loop 1983 // variant and invariant checks that mean the final condition is 1984 // variant. 1985 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond); 1986 if (SE->isLoopInvariant(Cond, OuterLoop)) { 1987 // It seems reasonable to assume that we can reduce the effective 1988 // cost of the checks even when we know nothing about the trip 1989 // count. Assume that the outer loop executes at least twice. 1990 unsigned BestTripCount = 2; 1991 1992 // Get the best known TC estimate. 1993 if (auto EstimatedTC = getSmallBestKnownTC( 1994 PSE, OuterLoop, /* CanUseConstantMax = */ false)) 1995 BestTripCount = *EstimatedTC; 1996 1997 BestTripCount = std::max(BestTripCount, 1U); 1998 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; 1999 2000 // Let's ensure the cost is always at least 1. 2001 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), 2002 (InstructionCost::CostType)1); 2003 2004 if (BestTripCount > 1) 2005 LLVM_DEBUG(dbgs() 2006 << "We expect runtime memory checks to be hoisted " 2007 << "out of the outer loop. Cost reduced from " 2008 << MemCheckCost << " to " << NewMemCheckCost << '\n'); 2009 2010 MemCheckCost = NewMemCheckCost; 2011 } 2012 } 2013 2014 RTCheckCost += MemCheckCost; 2015 } 2016 2017 if (SCEVCheckBlock || MemCheckBlock) 2018 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2019 << "\n"); 2020 2021 return RTCheckCost; 2022 } 2023 2024 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2025 /// unused. 2026 ~GeneratedRTChecks() { 2027 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2028 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2029 if (!SCEVCheckCond) 2030 SCEVCleaner.markResultUsed(); 2031 2032 if (!MemRuntimeCheckCond) 2033 MemCheckCleaner.markResultUsed(); 2034 2035 if (MemRuntimeCheckCond) { 2036 auto &SE = *MemCheckExp.getSE(); 2037 // Memory runtime check generation creates compares that use expanded 2038 // values. Remove them before running the SCEVExpanderCleaners. 2039 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2040 if (MemCheckExp.isInsertedInstruction(&I)) 2041 continue; 2042 SE.forgetValue(&I); 2043 I.eraseFromParent(); 2044 } 2045 } 2046 MemCheckCleaner.cleanup(); 2047 SCEVCleaner.cleanup(); 2048 2049 if (SCEVCheckCond) 2050 SCEVCheckBlock->eraseFromParent(); 2051 if (MemRuntimeCheckCond) 2052 MemCheckBlock->eraseFromParent(); 2053 } 2054 2055 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2056 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2057 /// depending on the generated condition. 2058 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2059 BasicBlock *LoopVectorPreHeader) { 2060 if (!SCEVCheckCond) 2061 return nullptr; 2062 2063 Value *Cond = SCEVCheckCond; 2064 // Mark the check as used, to prevent it from being removed during cleanup. 2065 SCEVCheckCond = nullptr; 2066 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2067 if (C->isZero()) 2068 return nullptr; 2069 2070 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2071 2072 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2073 // Create new preheader for vector loop. 
2074 if (OuterLoop) 2075 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2076 2077 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2078 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2079 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2080 SCEVCheckBlock); 2081 2082 DT->addNewBlock(SCEVCheckBlock, Pred); 2083 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2084 2085 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); 2086 if (AddBranchWeights) 2087 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false); 2088 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); 2089 return SCEVCheckBlock; 2090 } 2091 2092 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2093 /// the branches to branch to the vector preheader or \p Bypass, depending on 2094 /// the generated condition. 2095 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2096 BasicBlock *LoopVectorPreHeader) { 2097 // Check if we generated code that checks in runtime if arrays overlap. 2098 if (!MemRuntimeCheckCond) 2099 return nullptr; 2100 2101 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2102 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2103 MemCheckBlock); 2104 2105 DT->addNewBlock(MemCheckBlock, Pred); 2106 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2107 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2108 2109 if (OuterLoop) 2110 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); 2111 2112 BranchInst &BI = 2113 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); 2114 if (AddBranchWeights) { 2115 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false); 2116 } 2117 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); 2118 MemCheckBlock->getTerminator()->setDebugLoc( 2119 Pred->getTerminator()->getDebugLoc()); 2120 2121 // Mark the check as used, to prevent it from being removed during cleanup. 2122 MemRuntimeCheckCond = nullptr; 2123 return MemCheckBlock; 2124 } 2125 }; 2126 } // namespace 2127 2128 static bool useActiveLaneMask(TailFoldingStyle Style) { 2129 return Style == TailFoldingStyle::Data || 2130 Style == TailFoldingStyle::DataAndControlFlow || 2131 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2132 } 2133 2134 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2135 return Style == TailFoldingStyle::DataAndControlFlow || 2136 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2137 } 2138 2139 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2140 // vectorization. The loop needs to be annotated with #pragma omp simd 2141 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2142 // vector length information is not provided, vectorization is not considered 2143 // explicit. Interleave hints are not allowed either. These limitations will be 2144 // relaxed in the future. 2145 // Please, note that we are currently forced to abuse the pragma 'clang 2146 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2147 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2148 // provides *explicit vectorization hints* (LV can bypass legal checks and 2149 // assume that vectorization is legal). However, both hints are implemented 2150 // using the same metadata (llvm.loop.vectorize, processed by 2151 // LoopVectorizeHints). 
This will be fixed in the future when the native IR 2152 // representation for pragma 'omp simd' is introduced. 2153 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2154 OptimizationRemarkEmitter *ORE) { 2155 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2156 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2157 2158 // Only outer loops with an explicit vectorization hint are supported. 2159 // Unannotated outer loops are ignored. 2160 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2161 return false; 2162 2163 Function *Fn = OuterLp->getHeader()->getParent(); 2164 if (!Hints.allowVectorization(Fn, OuterLp, 2165 true /*VectorizeOnlyWhenForced*/)) { 2166 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2167 return false; 2168 } 2169 2170 if (Hints.getInterleave() > 1) { 2171 // TODO: Interleave support is future work. 2172 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2173 "outer loops.\n"); 2174 Hints.emitRemarkWithHints(); 2175 return false; 2176 } 2177 2178 return true; 2179 } 2180 2181 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2182 OptimizationRemarkEmitter *ORE, 2183 SmallVectorImpl<Loop *> &V) { 2184 // Collect inner loops and outer loops without irreducible control flow. For 2185 // now, only collect outer loops that have explicit vectorization hints. If we 2186 // are stress testing the VPlan H-CFG construction, we collect the outermost 2187 // loop of every loop nest. 2188 if (L.isInnermost() || VPlanBuildStressTest || 2189 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2190 LoopBlocksRPO RPOT(&L); 2191 RPOT.perform(LI); 2192 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2193 V.push_back(&L); 2194 // TODO: Collect inner loops inside marked outer loops in case 2195 // vectorization fails for the outer loop. Do not invoke 2196 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2197 // already known to be reducible. We can use an inherited attribute for 2198 // that. 2199 return; 2200 } 2201 } 2202 for (Loop *InnerL : L) 2203 collectSupportedLoops(*InnerL, LI, ORE, V); 2204 } 2205 2206 //===----------------------------------------------------------------------===// 2207 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2208 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2209 //===----------------------------------------------------------------------===// 2210 2211 /// Compute the transformed value of Index at offset StartValue using step 2212 /// StepValue. 2213 /// For integer induction, returns StartValue + Index * StepValue. 2214 /// For pointer induction, returns StartValue[Index * StepValue]. 2215 /// FIXME: The newly created binary instructions should contain nsw/nuw 2216 /// flags, which can be found from the original scalar operations. 2217 static Value * 2218 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, 2219 Value *Step, 2220 InductionDescriptor::InductionKind InductionKind, 2221 const BinaryOperator *InductionBinOp) { 2222 Type *StepTy = Step->getType(); 2223 Value *CastedIndex = StepTy->isIntegerTy() 2224 ? B.CreateSExtOrTrunc(Index, StepTy) 2225 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2226 if (CastedIndex != Index) { 2227 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2228 Index = CastedIndex; 2229 } 2230 2231 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2232 // SCEV and then expand it, hoping that SCEV's simplification will give us 2233 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2234 // lead to various SCEV crashes. So all we can do is to use builder and rely 2235 // on InstCombine for future simplifications. Here we handle some trivial 2236 // cases only. 2237 auto CreateAdd = [&B](Value *X, Value *Y) { 2238 assert(X->getType() == Y->getType() && "Types don't match!"); 2239 if (auto *CX = dyn_cast<ConstantInt>(X)) 2240 if (CX->isZero()) 2241 return Y; 2242 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2243 if (CY->isZero()) 2244 return X; 2245 return B.CreateAdd(X, Y); 2246 }; 2247 2248 // We allow X to be a vector type, in which case Y will potentially be 2249 // splatted into a vector with the same element count. 2250 auto CreateMul = [&B](Value *X, Value *Y) { 2251 assert(X->getType()->getScalarType() == Y->getType() && 2252 "Types don't match!"); 2253 if (auto *CX = dyn_cast<ConstantInt>(X)) 2254 if (CX->isOne()) 2255 return Y; 2256 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2257 if (CY->isOne()) 2258 return X; 2259 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2260 if (XVTy && !isa<VectorType>(Y->getType())) 2261 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2262 return B.CreateMul(X, Y); 2263 }; 2264 2265 switch (InductionKind) { 2266 case InductionDescriptor::IK_IntInduction: { 2267 assert(!isa<VectorType>(Index->getType()) && 2268 "Vector indices not supported for integer inductions yet"); 2269 assert(Index->getType() == StartValue->getType() && 2270 "Index type does not match StartValue type"); 2271 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2272 return B.CreateSub(StartValue, Index); 2273 auto *Offset = CreateMul(Index, Step); 2274 return CreateAdd(StartValue, Offset); 2275 } 2276 case InductionDescriptor::IK_PtrInduction: 2277 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step)); 2278 case InductionDescriptor::IK_FpInduction: { 2279 assert(!isa<VectorType>(Index->getType()) && 2280 "Vector indices not supported for FP inductions yet"); 2281 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2282 assert(InductionBinOp && 2283 (InductionBinOp->getOpcode() == Instruction::FAdd || 2284 InductionBinOp->getOpcode() == Instruction::FSub) && 2285 "Original bin op should be defined for FP induction"); 2286 2287 Value *MulExp = B.CreateFMul(Step, Index); 2288 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2289 "induction"); 2290 } 2291 case InductionDescriptor::IK_NoInduction: 2292 return nullptr; 2293 } 2294 llvm_unreachable("invalid enum"); 2295 } 2296 2297 std::optional<unsigned> getMaxVScale(const Function &F, 2298 const TargetTransformInfo &TTI) { 2299 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) 2300 return MaxVScale; 2301 2302 if (F.hasFnAttribute(Attribute::VScaleRange)) 2303 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2304 2305 return std::nullopt; 2306 } 2307 2308 /// For the given VF and UF and maximum trip count computed for the loop, return 2309 /// whether the induction variable might overflow in the vectorized loop. If not, 2310 /// then we know a runtime overflow check always evaluates to false and can be 2311 /// removed. 
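/// For example, with an i8 widest induction type, a known maximum trip count
/// of 200 and VF * UF = 32, the overflow check is known to be false (and can
/// be dropped) because 255 - 200 = 55 > 32; with VF * UF = 64 the check must
/// be kept.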
2312 static bool isIndvarOverflowCheckKnownFalse( 2313 const LoopVectorizationCostModel *Cost, 2314 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2315 // Always be conservative if we don't know the exact unroll factor. 2316 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); 2317 2318 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2319 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2320 2321 // We know the runtime overflow check is known false iff the (max) trip-count 2322 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2323 // the vector loop induction variable. 2324 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) { 2325 uint64_t MaxVF = VF.getKnownMinValue(); 2326 if (VF.isScalable()) { 2327 std::optional<unsigned> MaxVScale = 2328 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2329 if (!MaxVScale) 2330 return false; 2331 MaxVF *= *MaxVScale; 2332 } 2333 2334 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2335 } 2336 2337 return false; 2338 } 2339 2340 // Return whether we allow using masked interleave-groups (for dealing with 2341 // strided loads/stores that reside in predicated blocks, or for dealing 2342 // with gaps). 2343 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2344 // If an override option has been passed in for interleaved accesses, use it. 2345 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2346 return EnableMaskedInterleavedMemAccesses; 2347 2348 return TTI.enableMaskedInterleavedAccessVectorization(); 2349 } 2350 2351 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2352 VPReplicateRecipe *RepRecipe, 2353 const VPLane &Lane, 2354 VPTransformState &State) { 2355 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2356 2357 // Does this instruction return a value ? 2358 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2359 2360 Instruction *Cloned = Instr->clone(); 2361 if (!IsVoidRetTy) { 2362 Cloned->setName(Instr->getName() + ".cloned"); 2363 #if !defined(NDEBUG) 2364 // Verify that VPlan type inference results agree with the type of the 2365 // generated values. 2366 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && 2367 "inferred type and type from generated instructions do not match"); 2368 #endif 2369 } 2370 2371 RepRecipe->setFlags(Cloned); 2372 2373 if (auto DL = Instr->getDebugLoc()) 2374 State.setDebugLocFrom(DL); 2375 2376 // Replace the operands of the cloned instructions with their scalar 2377 // equivalents in the new loop. 2378 for (const auto &I : enumerate(RepRecipe->operands())) { 2379 auto InputLane = Lane; 2380 VPValue *Operand = I.value(); 2381 if (vputils::isUniformAfterVectorization(Operand)) 2382 InputLane = VPLane::getFirstLane(); 2383 Cloned->setOperand(I.index(), State.get(Operand, InputLane)); 2384 } 2385 State.addNewMetadata(Cloned, Instr); 2386 2387 // Place the cloned scalar in the new loop. 2388 State.Builder.Insert(Cloned); 2389 2390 State.set(RepRecipe, Cloned, Lane); 2391 2392 // If we just cloned a new assumption, add it the assumption cache. 2393 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2394 AC->registerAssumption(II); 2395 2396 // End if-block. 2397 VPRegionBlock *Parent = RepRecipe->getParent()->getParent(); 2398 bool IfPredicateInstr = Parent ? 
Parent->isReplicator() : false; 2399 assert( 2400 (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || 2401 all_of(RepRecipe->operands(), 2402 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && 2403 "Expected a recipe is either within a region or all of its operands " 2404 "are defined outside the vectorized region."); 2405 if (IfPredicateInstr) 2406 PredicatedInstructions.push_back(Cloned); 2407 } 2408 2409 Value * 2410 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2411 if (VectorTripCount) 2412 return VectorTripCount; 2413 2414 Value *TC = getTripCount(); 2415 IRBuilder<> Builder(InsertBlock->getTerminator()); 2416 2417 Type *Ty = TC->getType(); 2418 // This is where we can make the step a runtime constant. 2419 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2420 2421 // If the tail is to be folded by masking, round the number of iterations N 2422 // up to a multiple of Step instead of rounding down. This is done by first 2423 // adding Step-1 and then rounding down. Note that it's ok if this addition 2424 // overflows: the vector induction variable will eventually wrap to zero given 2425 // that it starts at zero and its Step is a power of two; the loop will then 2426 // exit, with the last early-exit vector comparison also producing all-true. 2427 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2428 // is accounted for in emitIterationCountCheck that adds an overflow check. 2429 if (Cost->foldTailByMasking()) { 2430 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2431 "VF*UF must be a power of 2 when folding tail by masking"); 2432 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)), 2433 "n.rnd.up"); 2434 } 2435 2436 // Now we need to generate the expression for the part of the loop that the 2437 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2438 // iterations are not required for correctness, or N - Step, otherwise. Step 2439 // is equal to the vectorization factor (number of SIMD elements) times the 2440 // unroll factor (number of SIMD instructions). 2441 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2442 2443 // There are cases where we *must* run at least one iteration in the remainder 2444 // loop. See the cost model for when this can happen. If the step evenly 2445 // divides the trip count, we set the remainder to be equal to the step. If 2446 // the step does not evenly divide the trip count, no adjustment is necessary 2447 // since there will already be scalar iterations. Note that the minimum 2448 // iterations check ensures that N >= Step. 
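// For example, with N = 16 and Step = 8 (e.g. VF = 4, UF = 2) the remainder R
// is 0; if a scalar epilogue is required, the select below bumps R to 8, so
// the vector trip count becomes 8 and the final 8 iterations run in the
// scalar loop.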
2449 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2450 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2451 R = Builder.CreateSelect(IsZero, Step, R); 2452 } 2453 2454 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2455 2456 return VectorTripCount; 2457 } 2458 2459 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { 2460 VPBlockBase *ScalarPH = Plan.getScalarPreheader(); 2461 VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor(); 2462 if (PreVectorPH->getNumSuccessors() != 1) { 2463 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); 2464 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && 2465 "Unexpected successor"); 2466 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB); 2467 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB); 2468 PreVectorPH = CheckVPIRBB; 2469 } 2470 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); 2471 PreVectorPH->swapSuccessors(); 2472 } 2473 2474 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2475 Value *Count = getTripCount(); 2476 // Reuse existing vector loop preheader for TC checks. 2477 // Note that new preheader block is generated for vector loop. 2478 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2479 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2480 2481 // Generate code to check if the loop's trip count is less than VF * UF, or 2482 // equal to it in case a scalar epilogue is required; this implies that the 2483 // vector trip count is zero. This check also covers the case where adding one 2484 // to the backedge-taken count overflowed leading to an incorrect trip count 2485 // of zero. In this case we will also jump to the scalar loop. 2486 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2487 : ICmpInst::ICMP_ULT; 2488 2489 // If tail is to be folded, vector loop takes care of all iterations. 2490 Type *CountTy = Count->getType(); 2491 Value *CheckMinIters = Builder.getFalse(); 2492 auto CreateStep = [&]() -> Value * { 2493 // Create step with max(MinProTripCount, UF * VF). 2494 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2495 return createStepForVF(Builder, CountTy, VF, UF); 2496 2497 Value *MinProfTC = 2498 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2499 if (!VF.isScalable()) 2500 return MinProfTC; 2501 return Builder.CreateBinaryIntrinsic( 2502 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2503 }; 2504 2505 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2506 if (Style == TailFoldingStyle::None) { 2507 Value *Step = CreateStep(); 2508 ScalarEvolution &SE = *PSE.getSE(); 2509 // TODO: Emit unconditional branch to vector preheader instead of 2510 // conditional branch with known condition. 2511 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop); 2512 // Check if the trip count is < the step. 2513 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) { 2514 // TODO: Ensure step is at most the trip count when determining max VF and 2515 // UF, w/o tail folding. 2516 CheckMinIters = Builder.getTrue(); 2517 } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P), 2518 TripCountSCEV, SE.getSCEV(Step))) { 2519 // Generate the minimum iteration check only if we cannot prove the 2520 // check is known to be true, or known to be false. 
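// The resulting IR is roughly:
//   %min.iters.check = icmp ult i64 %N, %step
// with ule instead of ult when a scalar epilogue is required.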
2521 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2522 } // else step known to be < trip count, use CheckMinIters preset to false. 2523 } else if (VF.isScalable() && 2524 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2525 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2526 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2527 // an overflow to zero when updating induction variables and so an 2528 // additional overflow check is required before entering the vector loop. 2529 2530 // Get the maximum unsigned value for the type. 2531 Value *MaxUIntTripCount = 2532 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2533 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2534 2535 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2536 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2537 } 2538 2539 // Create new preheader for vector loop. 2540 LoopVectorPreHeader = 2541 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2542 "vector.ph"); 2543 2544 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2545 DT->getNode(Bypass)->getIDom()) && 2546 "TC check is expected to dominate Bypass"); 2547 2548 BranchInst &BI = 2549 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 2550 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 2551 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); 2552 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 2553 LoopBypassBlocks.push_back(TCCheckBlock); 2554 2555 // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. 2556 introduceCheckBlockInVPlan(TCCheckBlock); 2557 } 2558 2559 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2560 BasicBlock *const SCEVCheckBlock = 2561 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader); 2562 if (!SCEVCheckBlock) 2563 return nullptr; 2564 2565 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2566 (OptForSizeBasedOnProfile && 2567 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2568 "Cannot SCEV check stride or overflow when optimizing for size"); 2569 assert(!LoopBypassBlocks.empty() && 2570 "Should already be a bypass block due to iteration count check"); 2571 LoopBypassBlocks.push_back(SCEVCheckBlock); 2572 AddedSafetyChecks = true; 2573 2574 introduceCheckBlockInVPlan(SCEVCheckBlock); 2575 return SCEVCheckBlock; 2576 } 2577 2578 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2579 // VPlan-native path does not do any analysis for runtime checks currently. 2580 if (EnableVPlanNativePath) 2581 return nullptr; 2582 2583 BasicBlock *const MemCheckBlock = 2584 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2585 2586 // Check if we generated code that checks in runtime if arrays overlap. We put 2587 // the checks into a separate block to make the more common case of few 2588 // elements faster. 
2589 if (!MemCheckBlock) 2590 return nullptr; 2591 2592 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2593 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2594 "Cannot emit memory checks when optimizing for size, unless forced " 2595 "to vectorize."); 2596 ORE->emit([&]() { 2597 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2598 OrigLoop->getStartLoc(), 2599 OrigLoop->getHeader()) 2600 << "Code-size may be reduced by not forcing " 2601 "vectorization, or by source-code modifications " 2602 "eliminating the need for runtime checks " 2603 "(e.g., adding 'restrict')."; 2604 }); 2605 } 2606 2607 LoopBypassBlocks.push_back(MemCheckBlock); 2608 2609 AddedSafetyChecks = true; 2610 2611 introduceCheckBlockInVPlan(MemCheckBlock); 2612 return MemCheckBlock; 2613 } 2614 2615 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p 2616 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must 2617 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All 2618 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. 2619 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { 2620 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); 2621 for (auto &R : make_early_inc_range(*VPBB)) { 2622 assert(!R.isPhi() && "Tried to move phi recipe to end of block"); 2623 R.moveBefore(*IRVPBB, IRVPBB->end()); 2624 } 2625 2626 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); 2627 // VPBB is now dead and will be cleaned up when the plan gets destroyed. 2628 } 2629 2630 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 2631 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2632 assert(LoopVectorPreHeader && "Invalid loop structure"); 2633 assert((OrigLoop->getUniqueLatchExitBlock() || 2634 Cost->requiresScalarEpilogue(VF.isVector())) && 2635 "loops not exiting via the latch without required epilogue?"); 2636 2637 LoopMiddleBlock = 2638 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2639 LI, nullptr, Twine(Prefix) + "middle.block"); 2640 replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock); 2641 LoopScalarPreHeader = 2642 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2643 nullptr, Twine(Prefix) + "scalar.ph"); 2644 replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); 2645 } 2646 2647 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 2648 /// expansion results. 2649 static Value *getExpandedStep(const InductionDescriptor &ID, 2650 const SCEV2ValueTy &ExpandedSCEVs) { 2651 const SCEV *Step = ID.getStep(); 2652 if (auto *C = dyn_cast<SCEVConstant>(Step)) 2653 return C->getValue(); 2654 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 2655 return U->getValue(); 2656 auto I = ExpandedSCEVs.find(Step); 2657 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 2658 return I->second; 2659 } 2660 2661 /// Knowing that loop \p L executes a single vector iteration, add instructions 2662 /// that will get simplified and thus should not have any cost to \p 2663 /// InstsToIgnore. 
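/// In practice this is the latch compare and any induction update whose only
/// remaining users are its phi and that compare.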
2664 static void addFullyUnrolledInstructionsToIgnore( 2665 Loop *L, const LoopVectorizationLegality::InductionList &IL, 2666 SmallPtrSetImpl<Instruction *> &InstsToIgnore) { 2667 auto *Cmp = L->getLatchCmpInst(); 2668 if (Cmp) 2669 InstsToIgnore.insert(Cmp); 2670 for (const auto &KV : IL) { 2671 // Extract the key by hand so that it can be used in the lambda below. Note 2672 // that captured structured bindings are a C++20 extension. 2673 const PHINode *IV = KV.first; 2674 2675 // Get next iteration value of the induction variable. 2676 Instruction *IVInst = 2677 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch())); 2678 if (all_of(IVInst->users(), 2679 [&](const User *U) { return U == IV || U == Cmp; })) 2680 InstsToIgnore.insert(IVInst); 2681 } 2682 } 2683 2684 void InnerLoopVectorizer::createInductionAdditionalBypassValues( 2685 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) { 2686 assert(MainVectorTripCount && "Must have bypass information"); 2687 2688 Instruction *OldInduction = Legal->getPrimaryInduction(); 2689 IRBuilder<> BypassBuilder(getAdditionalBypassBlock(), 2690 getAdditionalBypassBlock()->getFirstInsertionPt()); 2691 for (const auto &InductionEntry : Legal->getInductionVars()) { 2692 PHINode *OrigPhi = InductionEntry.first; 2693 const InductionDescriptor &II = InductionEntry.second; 2694 Value *Step = getExpandedStep(II, ExpandedSCEVs); 2695 // For the primary induction the additional bypass end value is known. 2696 // Otherwise it is computed. 2697 Value *EndValueFromAdditionalBypass = MainVectorTripCount; 2698 if (OrigPhi != OldInduction) { 2699 auto *BinOp = II.getInductionBinOp(); 2700 // Fast-math-flags propagate from the original induction instruction. 2701 if (isa_and_nonnull<FPMathOperator>(BinOp)) 2702 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags()); 2703 2704 // Compute the end value for the additional bypass. 2705 EndValueFromAdditionalBypass = 2706 emitTransformedIndex(BypassBuilder, MainVectorTripCount, 2707 II.getStartValue(), Step, II.getKind(), BinOp); 2708 EndValueFromAdditionalBypass->setName("ind.end"); 2709 } 2710 2711 // Store the bypass value here, as it needs to be added as operand to its 2712 // scalar preheader phi node after the epilogue skeleton has been created. 2713 // TODO: Directly add as extra operand to the VPResumePHI recipe. 2714 assert(!Induction2AdditionalBypassValue.contains(OrigPhi) && 2715 "entry for OrigPhi already exists"); 2716 Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass; 2717 } 2718 } 2719 2720 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton( 2721 const SCEV2ValueTy &ExpandedSCEVs) { 2722 /* 2723 In this function we generate a new loop. The new loop will contain 2724 the vectorized instructions while the old loop will continue to run the 2725 scalar remainder. 2726 2727 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 2728 / | preheader are expanded here. Eventually all required SCEV 2729 / | expansion should happen here. 2730 / v 2731 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2732 | / | 2733 | / v 2734 || [ ] <-- vector pre header. 2735 |/ | 2736 | v 2737 | [ ] \ 2738 | [ ]_| <-- vector loop (created during VPlan execution). 2739 | | 2740 | v 2741 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to 2742 | | successors created during VPlan execution) 2743 \/ | 2744 /\ v 2745 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2746 | | 2747 (opt) v <-- edge from middle to exit iff epilogue is not required. 2748 | [ ] \ 2749 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header 2750 | | wrapped in VPIRBasicBlock). 2751 \ | 2752 \ v 2753 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) 2754 ... 2755 */ 2756 2757 // Create an empty vector loop, and prepare basic blocks for the runtime 2758 // checks. 2759 createVectorLoopSkeleton(""); 2760 2761 // Now, compare the new count to zero. If it is zero skip the vector loop and 2762 // jump to the scalar loop. This check also covers the case where the 2763 // backedge-taken count is uint##_max: adding one to it will overflow leading 2764 // to an incorrect trip count of zero. In this (rare) case we will also jump 2765 // to the scalar loop. 2766 emitIterationCountCheck(LoopScalarPreHeader); 2767 2768 // Generate the code to check any assumptions that we've made for SCEV 2769 // expressions. 2770 emitSCEVChecks(LoopScalarPreHeader); 2771 2772 // Generate the code that checks in runtime if arrays overlap. We put the 2773 // checks into a separate block to make the more common case of few elements 2774 // faster. 2775 emitMemRuntimeChecks(LoopScalarPreHeader); 2776 2777 return LoopVectorPreHeader; 2778 } 2779 2780 // Fix up external users of the induction variable. At this point, we are 2781 // in LCSSA form, with all external PHIs that use the IV having one input value, 2782 // coming from the remainder loop. We need those PHIs to also have a correct 2783 // value for the IV when arriving directly from the middle block. 2784 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 2785 const InductionDescriptor &II, 2786 Value *VectorTripCount, 2787 BasicBlock *MiddleBlock, 2788 VPTransformState &State) { 2789 // There are two kinds of external IV usages - those that use the value 2790 // computed in the last iteration (the PHI) and those that use the penultimate 2791 // value (the value that feeds into the phi from the loop latch). 2792 // We allow both, but they, obviously, have different values. 2793 2794 DenseMap<Value *, Value *> MissingVals; 2795 2796 Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock( 2797 OrigLoop->getLoopPreheader())) 2798 ->getIncomingValueForBlock(MiddleBlock); 2799 2800 // An external user of the last iteration's value should see the value that 2801 // the remainder loop uses to initialize its own IV. 2802 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 2803 for (User *U : PostInc->users()) { 2804 Instruction *UI = cast<Instruction>(U); 2805 if (!OrigLoop->contains(UI)) { 2806 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 2807 MissingVals[UI] = EndValue; 2808 } 2809 } 2810 2811 // An external user of the penultimate value need to see EndValue - Step. 2812 // The simplest way to get this is to recompute it from the constituent SCEVs, 2813 // that is Start + (Step * (CRD - 1)). 2814 for (User *U : OrigPhi->users()) { 2815 auto *UI = cast<Instruction>(U); 2816 if (!OrigLoop->contains(UI)) { 2817 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 2818 IRBuilder<> B(MiddleBlock->getTerminator()); 2819 2820 // Fast-math-flags propagate from the original induction instruction. 
2821 if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp())) 2822 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 2823 2824 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); 2825 assert(StepVPV && "step must have been expanded during VPlan execution"); 2826 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() 2827 : State.get(StepVPV, VPLane(0)); 2828 Value *Escape = nullptr; 2829 if (EndValue->getType()->isIntegerTy()) 2830 Escape = B.CreateSub(EndValue, Step); 2831 else if (EndValue->getType()->isPointerTy()) 2832 Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step)); 2833 else { 2834 assert(EndValue->getType()->isFloatingPointTy() && 2835 "Unexpected induction type"); 2836 Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() == 2837 Instruction::FAdd 2838 ? Instruction::FSub 2839 : Instruction::FAdd, 2840 EndValue, Step); 2841 } 2842 Escape->setName("ind.escape"); 2843 MissingVals[UI] = Escape; 2844 } 2845 } 2846 2847 assert((MissingVals.empty() || 2848 all_of(MissingVals, 2849 [MiddleBlock, this](const std::pair<Value *, Value *> &P) { 2850 return all_of( 2851 predecessors(cast<Instruction>(P.first)->getParent()), 2852 [MiddleBlock, this](BasicBlock *Pred) { 2853 return Pred == MiddleBlock || 2854 Pred == OrigLoop->getLoopLatch(); 2855 }); 2856 })) && 2857 "Expected escaping values from latch/middle.block only"); 2858 2859 for (auto &I : MissingVals) { 2860 PHINode *PHI = cast<PHINode>(I.first); 2861 // One corner case we have to handle is two IVs "chasing" each-other, 2862 // that is %IV2 = phi [...], [ %IV1, %latch ] 2863 // In this case, if IV1 has an external use, we need to avoid adding both 2864 // "last value of IV1" and "penultimate value of IV2". So, verify that we 2865 // don't already have an incoming value for the middle block. 2866 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 2867 PHI->addIncoming(I.second, MiddleBlock); 2868 } 2869 } 2870 2871 namespace { 2872 2873 struct CSEDenseMapInfo { 2874 static bool canHandle(const Instruction *I) { 2875 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 2876 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 2877 } 2878 2879 static inline Instruction *getEmptyKey() { 2880 return DenseMapInfo<Instruction *>::getEmptyKey(); 2881 } 2882 2883 static inline Instruction *getTombstoneKey() { 2884 return DenseMapInfo<Instruction *>::getTombstoneKey(); 2885 } 2886 2887 static unsigned getHashValue(const Instruction *I) { 2888 assert(canHandle(I) && "Unknown instruction!"); 2889 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 2890 I->value_op_end())); 2891 } 2892 2893 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 2894 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 2895 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 2896 return LHS == RHS; 2897 return LHS->isIdenticalTo(RHS); 2898 } 2899 }; 2900 2901 } // end anonymous namespace 2902 2903 ///Perform cse of induction variable instructions. 2904 static void cse(BasicBlock *BB) { 2905 // Perform simple cse. 2906 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 2907 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 2908 if (!CSEDenseMapInfo::canHandle(&In)) 2909 continue; 2910 2911 // Check if we can replace this instruction with any of the 2912 // visited instructions. 
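    // For example (illustrative): two identical address computations such as
    //   %g1 = getelementptr inbounds i32, ptr %base, i64 %idx
    //   %g2 = getelementptr inbounds i32, ptr %base, i64 %idx
    // hash to the same key; isEqual() falls back to isIdenticalTo(), so the
    // second instruction is replaced by the first and erased below.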
2913 if (Instruction *V = CSEMap.lookup(&In)) { 2914 In.replaceAllUsesWith(V); 2915 In.eraseFromParent(); 2916 continue; 2917 } 2918 2919 CSEMap[&In] = &In; 2920 } 2921 } 2922 2923 InstructionCost 2924 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 2925 ElementCount VF) const { 2926 // We only need to calculate a cost if the VF is scalar; for actual vectors 2927 // we should already have a pre-calculated cost at each VF. 2928 if (!VF.isScalar()) 2929 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; 2930 2931 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2932 Type *RetTy = CI->getType(); 2933 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 2934 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) 2935 return *RedCost; 2936 2937 SmallVector<Type *, 4> Tys; 2938 for (auto &ArgOp : CI->args()) 2939 Tys.push_back(ArgOp->getType()); 2940 2941 InstructionCost ScalarCallCost = 2942 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); 2943 2944 // If this is an intrinsic we may have a lower cost for it. 2945 if (getVectorIntrinsicIDForCall(CI, TLI)) { 2946 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 2947 return std::min(ScalarCallCost, IntrinsicCost); 2948 } 2949 return ScalarCallCost; 2950 } 2951 2952 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) { 2953 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 2954 return Elt; 2955 return VectorType::get(Elt, VF); 2956 } 2957 2958 InstructionCost 2959 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 2960 ElementCount VF) const { 2961 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 2962 assert(ID && "Expected intrinsic call!"); 2963 Type *RetTy = maybeVectorizeType(CI->getType(), VF); 2964 FastMathFlags FMF; 2965 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 2966 FMF = FPMO->getFastMathFlags(); 2967 2968 SmallVector<const Value *> Arguments(CI->args()); 2969 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 2970 SmallVector<Type *> ParamTys; 2971 std::transform(FTy->param_begin(), FTy->param_end(), 2972 std::back_inserter(ParamTys), 2973 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); }); 2974 2975 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 2976 dyn_cast<IntrinsicInst>(CI)); 2977 return TTI.getIntrinsicInstrCost(CostAttrs, 2978 TargetTransformInfo::TCK_RecipThroughput); 2979 } 2980 2981 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 2982 // Fix widened non-induction PHIs by setting up the PHI operands. 2983 if (EnableVPlanNativePath) 2984 fixNonInductionPHIs(State); 2985 2986 // Forget the original basic block. 2987 PSE.getSE()->forgetLoop(OrigLoop); 2988 PSE.getSE()->forgetBlockAndLoopDispositions(); 2989 2990 // After vectorization, the exit blocks of the original loop will have 2991 // additional predecessors. Invalidate SCEVs for the exit phis in case SE 2992 // looked through single-entry phis. 2993 SmallVector<BasicBlock *> ExitBlocks; 2994 OrigLoop->getExitBlocks(ExitBlocks); 2995 for (BasicBlock *Exit : ExitBlocks) 2996 for (PHINode &PN : Exit->phis()) 2997 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); 2998 2999 if (Cost->requiresScalarEpilogue(VF.isVector())) { 3000 // No edge from the middle block to the unique exit block has been inserted 3001 // and there is nothing to fix from vector loop; phis should have incoming 3002 // from scalar loop only. 
3003 } else { 3004 // TODO: Check in VPlan to see if IV users need fixing instead of checking 3005 // the cost model. 3006 3007 // If we inserted an edge from the middle block to the unique exit block, 3008 // update uses outside the loop (phis) to account for the newly inserted 3009 // edge. 3010 3011 // Fix-up external users of the induction variables. 3012 for (const auto &Entry : Legal->getInductionVars()) 3013 fixupIVUsers(Entry.first, Entry.second, 3014 getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State); 3015 } 3016 3017 // Don't apply optimizations below when no vector region remains, as they all 3018 // require a vector loop at the moment. 3019 if (!State.Plan->getVectorLoopRegion()) 3020 return; 3021 3022 for (Instruction *PI : PredicatedInstructions) 3023 sinkScalarOperands(&*PI); 3024 3025 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); 3026 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); 3027 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; 3028 3029 // Remove redundant induction instructions. 3030 cse(HeaderBB); 3031 3032 // Set/update profile weights for the vector and remainder loops as original 3033 // loop iterations are now distributed among them. Note that original loop 3034 // becomes the scalar remainder loop after vectorization. 3035 // 3036 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 3037 // end up with slightly imprecise weights, but that should be OK since the 3038 // profile is not inherently precise anyway. Note also that a possible bypass of 3039 // vector code caused by legality checks is ignored, optimistically assigning 3040 // all the weight to the vector loop. 3041 // 3042 // For scalable vectorization we can't know at compile time how many 3043 // iterations of the loop are handled in one vector iteration, so instead 3044 // assume a pessimistic vscale of '1'. 3045 Loop *VectorLoop = LI->getLoopFor(HeaderBB); 3046 setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, 3047 VF.getKnownMinValue() * UF); 3048 } 3049 3050 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3051 // The basic block and loop containing the predicated instruction. 3052 auto *PredBB = PredInst->getParent(); 3053 auto *VectorLoop = LI->getLoopFor(PredBB); 3054 3055 // Initialize a worklist with the operands of the predicated instruction. 3056 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3057 3058 // Holds instructions that we need to analyze again. An instruction may be 3059 // reanalyzed if we don't yet know if we can sink it or not. 3060 SmallVector<Instruction *, 8> InstsToReanalyze; 3061 3062 // Returns true if a given use occurs in the predicated block. Phi nodes use 3063 // their operands in their corresponding predecessor blocks. 3064 auto IsBlockOfUsePredicated = [&](Use &U) -> bool { 3065 auto *I = cast<Instruction>(U.getUser()); 3066 BasicBlock *BB = I->getParent(); 3067 if (auto *Phi = dyn_cast<PHINode>(I)) 3068 BB = Phi->getIncomingBlock( 3069 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3070 return BB == PredBB; 3071 }; 3072 3073 // Iteratively sink the scalarized operands of the predicated instruction 3074 // into the block we created for it. When an instruction is sunk, its 3075 // operands are then added to the worklist. The algorithm ends once a pass 3076 // through the worklist fails to sink a single instruction.
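  // For illustration (hypothetical names): if the predicated block holds a
  // scalarized "store i32 %v, ptr %gep" and "%gep = getelementptr ..." is used
  // only by that store, the first pass sinks the GEP; the index computation
  // feeding the GEP may then become sinkable as well, which is why the loop
  // below iterates until nothing changes.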
3077 bool Changed; 3078 do { 3079 // Add the instructions that need to be reanalyzed to the worklist, and 3080 // reset the changed indicator. 3081 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3082 InstsToReanalyze.clear(); 3083 Changed = false; 3084 3085 while (!Worklist.empty()) { 3086 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3087 3088 // We can't sink an instruction if it is a phi node, is not in the loop, 3089 // may have side effects or may read from memory. 3090 // TODO: Could do more granular checking to allow sinking 3091 // a load past non-store instructions. 3092 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 3093 I->mayHaveSideEffects() || I->mayReadFromMemory()) 3094 continue; 3095 3096 // If the instruction is already in PredBB, check if we can sink its 3097 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 3098 // sinking the scalar instruction I, hence it appears in PredBB; but it 3099 // may have failed to sink I's operands (recursively), which we try 3100 // (again) here. 3101 if (I->getParent() == PredBB) { 3102 Worklist.insert(I->op_begin(), I->op_end()); 3103 continue; 3104 } 3105 3106 // It's legal to sink the instruction if all its uses occur in the 3107 // predicated block. Otherwise, there's nothing to do yet, and we may 3108 // need to reanalyze the instruction. 3109 if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) { 3110 InstsToReanalyze.push_back(I); 3111 continue; 3112 } 3113 3114 // Move the instruction to the beginning of the predicated block, and add 3115 // its operands to the worklist. 3116 I->moveBefore(&*PredBB->getFirstInsertionPt()); 3117 Worklist.insert(I->op_begin(), I->op_end()); 3118 3119 // The sinking may have enabled other instructions to be sunk, so we will 3120 // need to iterate. 3121 Changed = true; 3122 } 3123 } while (Changed); 3124 } 3125 3126 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 3127 auto Iter = vp_depth_first_deep(Plan.getEntry()); 3128 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 3129 for (VPRecipeBase &P : VPBB->phis()) { 3130 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 3131 if (!VPPhi) 3132 continue; 3133 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi)); 3134 // Make sure the builder has a valid insert point. 3135 Builder.SetInsertPoint(NewPhi); 3136 for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) { 3137 VPValue *Inc = VPPhi->getIncomingValue(Idx); 3138 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx); 3139 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]); 3140 } 3141 } 3142 } 3143 } 3144 3145 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 3146 // We should not collect Scalars more than once per VF. Right now, this 3147 // function is called from collectUniformsAndScalars(), which already does 3148 // this check. Collecting Scalars for VF=1 does not make any sense. 3149 assert(VF.isVector() && !Scalars.contains(VF) && 3150 "This function should not be visited twice for the same VF"); 3151 3152 // This avoids any chances of creating a REPLICATE recipe during planning 3153 // since that would result in generation of scalarized code during execution, 3154 // which is not supported for scalable vectors.
3155 if (VF.isScalable()) { 3156 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3157 return; 3158 } 3159 3160 SmallSetVector<Instruction *, 8> Worklist; 3161 3162 // These sets are used to seed the analysis with pointers used by memory 3163 // accesses that will remain scalar. 3164 SmallSetVector<Instruction *, 8> ScalarPtrs; 3165 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 3166 auto *Latch = TheLoop->getLoopLatch(); 3167 3168 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 3169 // The pointer operands of loads and stores will be scalar as long as the 3170 // memory access is not a gather or scatter operation. The value operand of a 3171 // store will remain scalar if the store is scalarized. 3172 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 3173 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 3174 assert(WideningDecision != CM_Unknown && 3175 "Widening decision should be ready at this moment"); 3176 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 3177 if (Ptr == Store->getValueOperand()) 3178 return WideningDecision == CM_Scalarize; 3179 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 3180 "Ptr is neither a value nor a pointer operand"); 3181 return WideningDecision != CM_GatherScatter; 3182 }; 3183 3184 // A helper that returns true if the given value is a getelementptr 3185 // instruction contained in the loop. 3186 auto IsLoopVaryingGEP = [&](Value *V) { 3187 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V); 3188 }; 3189 3190 // A helper that evaluates a memory access's use of a pointer. If the use will 3191 // be a scalar use and the pointer is only used by memory accesses, we place 3192 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 3193 // PossibleNonScalarPtrs. 3194 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 3195 // We only care about bitcast and getelementptr instructions contained in 3196 // the loop. 3197 if (!IsLoopVaryingGEP(Ptr)) 3198 return; 3199 3200 // If the pointer has already been identified as scalar (e.g., if it was 3201 // also identified as uniform), there's nothing to do. 3202 auto *I = cast<Instruction>(Ptr); 3203 if (Worklist.count(I)) 3204 return; 3205 3206 // If the use of the pointer will be a scalar use, and all users of the 3207 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 3208 // place the pointer in PossibleNonScalarPtrs. 3209 if (IsScalarUse(MemAccess, Ptr) && 3210 all_of(I->users(), IsaPred<LoadInst, StoreInst>)) 3211 ScalarPtrs.insert(I); 3212 else 3213 PossibleNonScalarPtrs.insert(I); 3214 }; 3215 3216 // We seed the scalars analysis with two classes of instructions: (1) 3217 // instructions marked uniform-after-vectorization and (2) bitcast, 3218 // getelementptr and (pointer) phi instructions used by memory accesses 3219 // requiring a scalar use. 3220 // 3221 // (1) Add to the worklist all instructions that have been identified as 3222 // uniform-after-vectorization. 3223 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3224 3225 // (2) Add to the worklist all bitcast and getelementptr instructions used by 3226 // memory accesses requiring a scalar use. The pointer operands of loads and 3227 // stores will be scalar unless the operation is a gather or scatter. 3228 // The value operand of a store will remain scalar if the store is scalarized.
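  // For example (illustrative): given "store i32 %v, ptr %gep" with widening
  // decision CM_Scalarize, EvaluatePtrUse records the loop-varying GEP %gep
  // (and %v, if it is itself a loop-varying GEP) as a candidate scalar
  // pointer, provided all of its users are loads or stores.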
3229 for (auto *BB : TheLoop->blocks()) 3230 for (auto &I : *BB) { 3231 if (auto *Load = dyn_cast<LoadInst>(&I)) { 3232 EvaluatePtrUse(Load, Load->getPointerOperand()); 3233 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 3234 EvaluatePtrUse(Store, Store->getPointerOperand()); 3235 EvaluatePtrUse(Store, Store->getValueOperand()); 3236 } 3237 } 3238 for (auto *I : ScalarPtrs) 3239 if (!PossibleNonScalarPtrs.count(I)) { 3240 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 3241 Worklist.insert(I); 3242 } 3243 3244 // Insert the forced scalars. 3245 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 3246 // induction variable when the PHI user is scalarized. 3247 auto ForcedScalar = ForcedScalars.find(VF); 3248 if (ForcedScalar != ForcedScalars.end()) 3249 for (auto *I : ForcedScalar->second) { 3250 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 3251 Worklist.insert(I); 3252 } 3253 3254 // Expand the worklist by looking through any bitcasts and getelementptr 3255 // instructions we've already identified as scalar. This is similar to the 3256 // expansion step in collectLoopUniforms(); however, here we're only 3257 // expanding to include additional bitcasts and getelementptr instructions. 3258 unsigned Idx = 0; 3259 while (Idx != Worklist.size()) { 3260 Instruction *Dst = Worklist[Idx++]; 3261 if (!IsLoopVaryingGEP(Dst->getOperand(0))) 3262 continue; 3263 auto *Src = cast<Instruction>(Dst->getOperand(0)); 3264 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 3265 auto *J = cast<Instruction>(U); 3266 return !TheLoop->contains(J) || Worklist.count(J) || 3267 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 3268 IsScalarUse(J, Src)); 3269 })) { 3270 Worklist.insert(Src); 3271 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 3272 } 3273 } 3274 3275 // An induction variable will remain scalar if all users of the induction 3276 // variable and induction variable update remain scalar. 3277 for (const auto &Induction : Legal->getInductionVars()) { 3278 auto *Ind = Induction.first; 3279 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3280 3281 // If tail-folding is applied, the primary induction variable will be used 3282 // to feed a vector compare. 3283 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 3284 continue; 3285 3286 // Returns true if \p Indvar is a pointer induction that is used directly by 3287 // load/store instruction \p I. 3288 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 3289 Instruction *I) { 3290 return Induction.second.getKind() == 3291 InductionDescriptor::IK_PtrInduction && 3292 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 3293 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar); 3294 }; 3295 3296 // Determine if all users of the induction variable are scalar after 3297 // vectorization. 3298 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { 3299 auto *I = cast<Instruction>(U); 3300 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3301 IsDirectLoadStoreFromPtrIndvar(Ind, I); 3302 }); 3303 if (!ScalarInd) 3304 continue; 3305 3306 // If the induction variable update is a fixed-order recurrence, neither the 3307 // induction variable nor its update should be marked scalar after 3308 // vectorization.
3309 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate); 3310 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi)) 3311 continue; 3312 3313 // Determine if all users of the induction variable update instruction are 3314 // scalar after vectorization. 3315 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { 3316 auto *I = cast<Instruction>(U); 3317 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 3318 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 3319 }); 3320 if (!ScalarIndUpdate) 3321 continue; 3322 3323 // The induction variable and its update instruction will remain scalar. 3324 Worklist.insert(Ind); 3325 Worklist.insert(IndUpdate); 3326 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 3327 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 3328 << "\n"); 3329 } 3330 3331 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 3332 } 3333 3334 bool LoopVectorizationCostModel::isScalarWithPredication( 3335 Instruction *I, ElementCount VF) const { 3336 if (!isPredicatedInst(I)) 3337 return false; 3338 3339 // Do we have a non-scalar lowering for this predicated 3340 // instruction? No - it is scalar with predication. 3341 switch(I->getOpcode()) { 3342 default: 3343 return true; 3344 case Instruction::Call: 3345 if (VF.isScalar()) 3346 return true; 3347 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF)) 3348 .Kind == CM_Scalarize; 3349 case Instruction::Load: 3350 case Instruction::Store: { 3351 auto *Ptr = getLoadStorePointerOperand(I); 3352 auto *Ty = getLoadStoreType(I); 3353 Type *VTy = Ty; 3354 if (VF.isVector()) 3355 VTy = VectorType::get(Ty, VF); 3356 const Align Alignment = getLoadStoreAlignment(I); 3357 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 3358 TTI.isLegalMaskedGather(VTy, Alignment)) 3359 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 3360 TTI.isLegalMaskedScatter(VTy, Alignment)); 3361 } 3362 case Instruction::UDiv: 3363 case Instruction::SDiv: 3364 case Instruction::SRem: 3365 case Instruction::URem: { 3366 // We have the option to use the safe-divisor idiom to avoid predication. 3367 // The cost based decision here will always select safe-divisor for 3368 // scalable vectors as scalarization isn't legal. 3369 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 3370 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); 3371 } 3372 } 3373 } 3374 3375 // TODO: Fold into LoopVectorizationLegality::isMaskRequired. 3376 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { 3377 // If predication is not needed, avoid it. 3378 // TODO: We can use the loop-preheader as context point here and get 3379 // context sensitive reasoning for isSafeToSpeculativelyExecute. 3380 if (!blockNeedsPredicationForAnyReason(I->getParent()) || 3381 isSafeToSpeculativelyExecute(I) || 3382 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) || 3383 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I)) 3384 return false; 3385 3386 // If the instruction was executed conditionally in the original scalar loop, 3387 // predication is needed with a mask whose lanes are all possibly inactive. 3388 if (Legal->blockNeedsPredication(I->getParent())) 3389 return true; 3390 3391 // All that remain are instructions with side-effects originally executed in 3392 // the loop unconditionally, but now execute under a tail-fold mask (only) 3393 // having at least one active lane (the first). 
If the side-effects of the 3394 // instruction are invariant, executing it w/o (the tail-folding) mask is safe 3395 // - it will cause the same side-effects as when masked. 3396 switch(I->getOpcode()) { 3397 default: 3398 llvm_unreachable( 3399 "instruction should have been considered by earlier checks"); 3400 case Instruction::Call: 3401 // Side-effects of a Call are assumed to be non-invariant, needing a 3402 // (fold-tail) mask. 3403 assert(Legal->isMaskRequired(I) && 3404 "should have returned earlier for calls not needing a mask"); 3405 return true; 3406 case Instruction::Load: 3407 // If the address is loop invariant no predication is needed. 3408 return !Legal->isInvariant(getLoadStorePointerOperand(I)); 3409 case Instruction::Store: { 3410 // For stores, we need to prove both speculation safety (which follows from 3411 // the same argument as loads) and that the value being stored 3412 // is correct. The easiest form of the latter is to require that all values 3413 // stored are the same. 3414 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) && 3415 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand())); 3416 } 3417 case Instruction::UDiv: 3418 case Instruction::SDiv: 3419 case Instruction::SRem: 3420 case Instruction::URem: 3421 // If the divisor is loop-invariant no predication is needed. 3422 return !TheLoop->isLoopInvariant(I->getOperand(1)); 3423 } 3424 } 3425 3426 std::pair<InstructionCost, InstructionCost> 3427 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 3428 ElementCount VF) const { 3429 assert(I->getOpcode() == Instruction::UDiv || 3430 I->getOpcode() == Instruction::SDiv || 3431 I->getOpcode() == Instruction::SRem || 3432 I->getOpcode() == Instruction::URem); 3433 assert(!isSafeToSpeculativelyExecute(I)); 3434 3435 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3436 3437 // Scalarization isn't legal for scalable vector types. 3438 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 3439 if (!VF.isScalable()) { 3440 // Get the scalarization cost and scale this amount by the probability of 3441 // executing the predicated block. If the instruction is not predicated, 3442 // we fall through to the next case. 3443 ScalarizationCost = 0; 3444 3445 // These instructions have a non-void type, so account for the phi nodes 3446 // that we will create. This cost is likely to be zero. The phi node 3447 // cost, if any, should be scaled by the block probability because it 3448 // models a copy at the end of each predicated block. 3449 ScalarizationCost += VF.getKnownMinValue() * 3450 TTI.getCFInstrCost(Instruction::PHI, CostKind); 3451 3452 // The cost of the non-predicated instruction. 3453 ScalarizationCost += VF.getKnownMinValue() * 3454 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 3455 3456 // The cost of insertelement and extractelement instructions needed for 3457 // scalarization. 3458 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); 3459 3460 // Scale the cost by the probability of executing the predicated blocks. 3461 // This assumes the predicated block for each vector lane is equally 3462 // likely. 3463 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 3464 } 3465 InstructionCost SafeDivisorCost = 0; 3466 3467 auto *VecTy = toVectorTy(I->getType(), VF); 3468 3469 // The cost of the select guard to ensure all lanes are well defined 3470 // after we speculate above any internal control flow.
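  // For illustration (not the exact IR emitted here): the safe-divisor idiom
  // rewrites a predicated division roughly as
  //   %d.safe = select <VF x i1> %mask, <VF x i32> %d, <VF x i32> splat(1)
  //   %q = udiv <VF x i32> %x, %d.safe
  // so its cost is the select guard accounted below plus the unpredicated
  // divide itself.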
3471 SafeDivisorCost += 3472 TTI.getCmpSelInstrCost(Instruction::Select, VecTy, 3473 toVectorTy(Type::getInt1Ty(I->getContext()), VF), 3474 CmpInst::BAD_ICMP_PREDICATE, CostKind); 3475 3476 // Certain instructions can be cheaper to vectorize if they have a constant 3477 // second vector operand. One example of this are shifts on x86. 3478 Value *Op2 = I->getOperand(1); 3479 auto Op2Info = TTI.getOperandInfo(Op2); 3480 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 3481 Legal->isInvariant(Op2)) 3482 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 3483 3484 SmallVector<const Value *, 4> Operands(I->operand_values()); 3485 SafeDivisorCost += TTI.getArithmeticInstrCost( 3486 I->getOpcode(), VecTy, CostKind, 3487 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 3488 Op2Info, Operands, I); 3489 return {ScalarizationCost, SafeDivisorCost}; 3490 } 3491 3492 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 3493 Instruction *I, ElementCount VF) const { 3494 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 3495 assert(getWideningDecision(I, VF) == CM_Unknown && 3496 "Decision should not be set yet."); 3497 auto *Group = getInterleavedAccessGroup(I); 3498 assert(Group && "Must have a group."); 3499 unsigned InterleaveFactor = Group->getFactor(); 3500 3501 // If the instruction's allocated size doesn't equal its type size, it 3502 // requires padding and will be scalarized. 3503 auto &DL = I->getDataLayout(); 3504 auto *ScalarTy = getLoadStoreType(I); 3505 if (hasIrregularType(ScalarTy, DL)) 3506 return false; 3507 3508 // We currently only know how to emit interleave/deinterleave with 3509 // Factor=2 for scalable vectors. This is purely an implementation 3510 // limit. 3511 if (VF.isScalable() && InterleaveFactor != 2) 3512 return false; 3513 3514 // If the group involves a non-integral pointer, we may not be able to 3515 // losslessly cast all values to a common type. 3516 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 3517 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) { 3518 Instruction *Member = Group->getMember(Idx); 3519 if (!Member) 3520 continue; 3521 auto *MemberTy = getLoadStoreType(Member); 3522 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 3523 // Don't coerce non-integral pointers to integers or vice versa. 3524 if (MemberNI != ScalarNI) 3525 // TODO: Consider adding special nullptr value case here 3526 return false; 3527 if (MemberNI && ScalarNI && 3528 ScalarTy->getPointerAddressSpace() != 3529 MemberTy->getPointerAddressSpace()) 3530 return false; 3531 } 3532 3533 // Check if masking is required. 3534 // A Group may need masking for one of two reasons: it resides in a block that 3535 // needs predication, or it was decided to use masking to deal with gaps 3536 // (either a gap at the end of a load-access that may result in a speculative 3537 // load, or any gaps in a store-access). 
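  // For example (illustrative): a factor-3 group accessing only A[3*i] and
  // A[3*i+1] has a gap at the end; a wide store of that group must mask the
  // missing member, and a wide load may read past the end of A unless a
  // scalar epilogue executes the final iterations or the load is masked.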
3538 bool PredicatedAccessRequiresMasking = 3539 blockNeedsPredicationForAnyReason(I->getParent()) && 3540 Legal->isMaskRequired(I); 3541 bool LoadAccessWithGapsRequiresEpilogMasking = 3542 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 3543 !isScalarEpilogueAllowed(); 3544 bool StoreAccessWithGapsRequiresMasking = 3545 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 3546 if (!PredicatedAccessRequiresMasking && 3547 !LoadAccessWithGapsRequiresEpilogMasking && 3548 !StoreAccessWithGapsRequiresMasking) 3549 return true; 3550 3551 // If masked interleaving is required, we expect that the user/target had 3552 // enabled it, because otherwise it either wouldn't have been created or 3553 // it should have been invalidated by the CostModel. 3554 assert(useMaskedInterleavedAccesses(TTI) && 3555 "Masked interleave-groups for predicated accesses are not enabled."); 3556 3557 if (Group->isReverse()) 3558 return false; 3559 3560 auto *Ty = getLoadStoreType(I); 3561 const Align Alignment = getLoadStoreAlignment(I); 3562 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 3563 : TTI.isLegalMaskedStore(Ty, Alignment); 3564 } 3565 3566 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 3567 Instruction *I, ElementCount VF) { 3568 // Get and ensure we have a valid memory instruction. 3569 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 3570 3571 auto *Ptr = getLoadStorePointerOperand(I); 3572 auto *ScalarTy = getLoadStoreType(I); 3573 3574 // In order to be widened, the pointer should be consecutive, first of all. 3575 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 3576 return false; 3577 3578 // If the instruction is a store located in a predicated block, it will be 3579 // scalarized. 3580 if (isScalarWithPredication(I, VF)) 3581 return false; 3582 3583 // If the instruction's allocated size doesn't equal its type size, it 3584 // requires padding and will be scalarized. 3585 auto &DL = I->getDataLayout(); 3586 if (hasIrregularType(ScalarTy, DL)) 3587 return false; 3588 3589 return true; 3590 } 3591 3592 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 3593 // We should not collect Uniforms more than once per VF. Right now, 3594 // this function is called from collectUniformsAndScalars(), which 3595 // already does this check. Collecting Uniforms for VF=1 does not make any 3596 // sense. 3597 3598 assert(VF.isVector() && !Uniforms.contains(VF) && 3599 "This function should not be visited twice for the same VF"); 3600 3601 // Create the entry for VF up front: even if we find no uniform values, we 3602 // won't analyze this VF again, since Uniforms.count(VF) will return 1. 3603 Uniforms[VF].clear(); 3604 3605 // Now we know that the loop is vectorizable! 3606 // Collect instructions inside the loop that will remain uniform after 3607 // vectorization. 3608 3609 // Global values, params and instructions outside of current loop are out of 3610 // scope. 3611 auto IsOutOfScope = [&](Value *V) -> bool { 3612 Instruction *I = dyn_cast<Instruction>(V); 3613 return (!I || !TheLoop->contains(I)); 3614 }; 3615 3616 // Worklist containing uniform instructions demanding lane 0. 3617 SetVector<Instruction *> Worklist; 3618 3619 // Add uniform instructions demanding lane 0 to the worklist. Instructions 3620 // that require predication must not be considered uniform after 3621 // vectorization, because that would create an erroneous replicating region 3622 // where only a single instance out of VF should be formed.
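  // For example (illustrative): a store to a loop-invariant address, or the
  // address computation of a consecutive (widened) load, only needs the value
  // for lane 0 of each unrolled iteration, whereas a predicated instruction
  // must produce a result per active lane and is rejected below.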
3623 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void { 3624 if (IsOutOfScope(I)) { 3625 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 3626 << *I << "\n"); 3627 return; 3628 } 3629 if (isPredicatedInst(I)) { 3630 LLVM_DEBUG( 3631 dbgs() << "LV: Found not uniform due to requiring predication: " << *I 3632 << "\n"); 3633 return; 3634 } 3635 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 3636 Worklist.insert(I); 3637 }; 3638 3639 // Start with the conditional branches exiting the loop. If the branch 3640 // condition is an instruction contained in the loop that is only used by the 3641 // branch, it is uniform. Note conditions from uncountable early exits are not 3642 // uniform. 3643 SmallVector<BasicBlock *> Exiting; 3644 TheLoop->getExitingBlocks(Exiting); 3645 for (BasicBlock *E : Exiting) { 3646 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E) 3647 continue; 3648 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0)); 3649 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 3650 AddToWorklistIfAllowed(Cmp); 3651 } 3652 3653 auto PrevVF = VF.divideCoefficientBy(2); 3654 // Return true if all lanes perform the same memory operation, and we can 3655 // thus choose to execute only one. 3656 auto IsUniformMemOpUse = [&](Instruction *I) { 3657 // If the value was already known to not be uniform for the previous 3658 // (smaller VF), it cannot be uniform for the larger VF. 3659 if (PrevVF.isVector()) { 3660 auto Iter = Uniforms.find(PrevVF); 3661 if (Iter != Uniforms.end() && !Iter->second.contains(I)) 3662 return false; 3663 } 3664 if (!Legal->isUniformMemOp(*I, VF)) 3665 return false; 3666 if (isa<LoadInst>(I)) 3667 // Loading the same address always produces the same result - at least 3668 // assuming aliasing and ordering which have already been checked. 3669 return true; 3670 // Storing the same value on every iteration. 3671 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 3672 }; 3673 3674 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) { 3675 InstWidening WideningDecision = getWideningDecision(I, VF); 3676 assert(WideningDecision != CM_Unknown && 3677 "Widening decision should be ready at this moment"); 3678 3679 if (IsUniformMemOpUse(I)) 3680 return true; 3681 3682 return (WideningDecision == CM_Widen || 3683 WideningDecision == CM_Widen_Reverse || 3684 WideningDecision == CM_Interleave); 3685 }; 3686 3687 // Returns true if Ptr is the pointer operand of a memory access instruction 3688 // I, I is known to not require scalarization, and the pointer is not also 3689 // stored. 3690 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 3691 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) 3692 return false; 3693 return getLoadStorePointerOperand(I) == Ptr && 3694 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr)); 3695 }; 3696 3697 // Holds a list of values which are known to have at least one uniform use. 3698 // Note that there may be other uses which aren't uniform. A "uniform use" 3699 // here is something which only demands lane 0 of the unrolled iterations; 3700 // it does not imply that all lanes produce the same value (e.g. this is not 3701 // the usual meaning of uniform) 3702 SetVector<Value *> HasUniformUse; 3703 3704 // Scan the loop for instructions which are either a) known to have only 3705 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 
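  // For example (illustrative): "store i32 %inv, ptr %p" where %p and %inv do
  // not change across iterations is a uniform memory op (every lane would
  // perform the same store), so the store itself (case a) and any in-loop
  // computation of %p (case b) only demand lane 0.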
3706 for (auto *BB : TheLoop->blocks()) 3707 for (auto &I : *BB) { 3708 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 3709 switch (II->getIntrinsicID()) { 3710 case Intrinsic::sideeffect: 3711 case Intrinsic::experimental_noalias_scope_decl: 3712 case Intrinsic::assume: 3713 case Intrinsic::lifetime_start: 3714 case Intrinsic::lifetime_end: 3715 if (TheLoop->hasLoopInvariantOperands(&I)) 3716 AddToWorklistIfAllowed(&I); 3717 break; 3718 default: 3719 break; 3720 } 3721 } 3722 3723 // ExtractValue instructions must be uniform, because the operands are 3724 // known to be loop-invariant. 3725 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 3726 assert(IsOutOfScope(EVI->getAggregateOperand()) && 3727 "Expected aggregate value to be loop invariant"); 3728 AddToWorklistIfAllowed(EVI); 3729 continue; 3730 } 3731 3732 // If there's no pointer operand, there's nothing to do. 3733 auto *Ptr = getLoadStorePointerOperand(&I); 3734 if (!Ptr) 3735 continue; 3736 3737 if (IsUniformMemOpUse(&I)) 3738 AddToWorklistIfAllowed(&I); 3739 3740 if (IsVectorizedMemAccessUse(&I, Ptr)) 3741 HasUniformUse.insert(Ptr); 3742 } 3743 3744 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 3745 // demanding) users. Since loops are assumed to be in LCSSA form, this 3746 // disallows uses outside the loop as well. 3747 for (auto *V : HasUniformUse) { 3748 if (IsOutOfScope(V)) 3749 continue; 3750 auto *I = cast<Instruction>(V); 3751 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool { 3752 auto *UI = cast<Instruction>(U); 3753 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V); 3754 }); 3755 if (UsersAreMemAccesses) 3756 AddToWorklistIfAllowed(I); 3757 } 3758 3759 // Expand Worklist in topological order: whenever a new instruction 3760 // is added , its users should be already inside Worklist. It ensures 3761 // a uniform instruction will only be used by uniform instructions. 3762 unsigned Idx = 0; 3763 while (Idx != Worklist.size()) { 3764 Instruction *I = Worklist[Idx++]; 3765 3766 for (auto *OV : I->operand_values()) { 3767 // isOutOfScope operands cannot be uniform instructions. 3768 if (IsOutOfScope(OV)) 3769 continue; 3770 // First order recurrence Phi's should typically be considered 3771 // non-uniform. 3772 auto *OP = dyn_cast<PHINode>(OV); 3773 if (OP && Legal->isFixedOrderRecurrence(OP)) 3774 continue; 3775 // If all the users of the operand are uniform, then add the 3776 // operand into the uniform worklist. 3777 auto *OI = cast<Instruction>(OV); 3778 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 3779 auto *J = cast<Instruction>(U); 3780 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI); 3781 })) 3782 AddToWorklistIfAllowed(OI); 3783 } 3784 } 3785 3786 // For an instruction to be added into Worklist above, all its users inside 3787 // the loop should also be in Worklist. However, this condition cannot be 3788 // true for phi nodes that form a cyclic dependence. We must process phi 3789 // nodes separately. An induction variable will remain uniform if all users 3790 // of the induction variable and induction variable update remain uniform. 3791 // The code below handles both pointer and non-pointer induction variables. 
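  // For example (illustrative): with
  //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  //   %iv.next = add nuw i64 %iv, 1
  // the phi and its update use each other, so neither can be reached by the
  // backwards expansion above; the loop below breaks the cycle by inspecting
  // their remaining users directly.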
3792 BasicBlock *Latch = TheLoop->getLoopLatch(); 3793 for (const auto &Induction : Legal->getInductionVars()) { 3794 auto *Ind = Induction.first; 3795 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3796 3797 // Determine if all users of the induction variable are uniform after 3798 // vectorization. 3799 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool { 3800 auto *I = cast<Instruction>(U); 3801 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3802 IsVectorizedMemAccessUse(I, Ind); 3803 }); 3804 if (!UniformInd) 3805 continue; 3806 3807 // Determine if all users of the induction variable update instruction are 3808 // uniform after vectorization. 3809 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { 3810 auto *I = cast<Instruction>(U); 3811 return I == Ind || Worklist.count(I) || 3812 IsVectorizedMemAccessUse(I, IndUpdate); 3813 }); 3814 if (!UniformIndUpdate) 3815 continue; 3816 3817 // The induction variable and its update instruction will remain uniform. 3818 AddToWorklistIfAllowed(Ind); 3819 AddToWorklistIfAllowed(IndUpdate); 3820 } 3821 3822 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 3823 } 3824 3825 bool LoopVectorizationCostModel::runtimeChecksRequired() { 3826 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 3827 3828 if (Legal->getRuntimePointerChecking()->Need) { 3829 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 3830 "runtime pointer checks needed. Enable vectorization of this " 3831 "loop with '#pragma clang loop vectorize(enable)' when " 3832 "compiling with -Os/-Oz", 3833 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3834 return true; 3835 } 3836 3837 if (!PSE.getPredicate().isAlwaysTrue()) { 3838 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 3839 "runtime SCEV checks needed. Enable vectorization of this " 3840 "loop with '#pragma clang loop vectorize(enable)' when " 3841 "compiling with -Os/-Oz", 3842 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3843 return true; 3844 } 3845 3846 // FIXME: Avoid specializing for stride==1 instead of bailing out. 3847 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 3848 reportVectorizationFailure("Runtime stride check for small trip count", 3849 "runtime stride == 1 checks needed. Enable vectorization of " 3850 "this loop without such check by compiling with -Os/-Oz", 3851 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3852 return true; 3853 } 3854 3855 return false; 3856 } 3857 3858 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { 3859 if (IsScalableVectorizationAllowed) 3860 return *IsScalableVectorizationAllowed; 3861 3862 IsScalableVectorizationAllowed = false; 3863 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 3864 return false; 3865 3866 if (Hints->isScalableVectorizationDisabled()) { 3867 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 3868 "ScalableVectorizationDisabled", ORE, TheLoop); 3869 return false; 3870 } 3871 3872 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 3873 3874 auto MaxScalableVF = ElementCount::getScalable( 3875 std::numeric_limits<ElementCount::ScalarTy>::max()); 3876 3877 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 
3878 // FIXME: While for scalable vectors this is currently sufficient, this should 3879 // be replaced by a more detailed mechanism that filters out specific VFs, 3880 // instead of invalidating vectorization for a whole set of VFs based on the 3881 // MaxVF. 3882 3883 // Disable scalable vectorization if the loop contains unsupported reductions. 3884 if (!canVectorizeReductions(MaxScalableVF)) { 3885 reportVectorizationInfo( 3886 "Scalable vectorization not supported for the reduction " 3887 "operations found in this loop.", 3888 "ScalableVFUnfeasible", ORE, TheLoop); 3889 return false; 3890 } 3891 3892 // Disable scalable vectorization if the loop contains any instructions 3893 // with element types not supported for scalable vectors. 3894 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 3895 return !Ty->isVoidTy() && 3896 !this->TTI.isElementTypeLegalForScalableVector(Ty); 3897 })) { 3898 reportVectorizationInfo("Scalable vectorization is not supported " 3899 "for all element types found in this loop.", 3900 "ScalableVFUnfeasible", ORE, TheLoop); 3901 return false; 3902 } 3903 3904 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) { 3905 reportVectorizationInfo("The target does not provide maximum vscale value " 3906 "for safe distance analysis.", 3907 "ScalableVFUnfeasible", ORE, TheLoop); 3908 return false; 3909 } 3910 3911 IsScalableVectorizationAllowed = true; 3912 return true; 3913 } 3914 3915 ElementCount 3916 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 3917 if (!isScalableVectorizationAllowed()) 3918 return ElementCount::getScalable(0); 3919 3920 auto MaxScalableVF = ElementCount::getScalable( 3921 std::numeric_limits<ElementCount::ScalarTy>::max()); 3922 if (Legal->isSafeForAnyVectorWidth()) 3923 return MaxScalableVF; 3924 3925 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 3926 // Limit MaxScalableVF by the maximum safe dependence distance. 3927 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); 3928 3929 if (!MaxScalableVF) 3930 reportVectorizationInfo( 3931 "Max legal vector width too small, scalable vectorization " 3932 "unfeasible.", 3933 "ScalableVFUnfeasible", ORE, TheLoop); 3934 3935 return MaxScalableVF; 3936 } 3937 3938 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 3939 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { 3940 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 3941 unsigned SmallestType, WidestType; 3942 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 3943 3944 // Get the maximum safe dependence distance in bits computed by LAA. 3945 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 3946 // the memory accesses that is most restrictive (involved in the smallest 3947 // dependence distance). 3948 unsigned MaxSafeElements = 3949 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 3950 3951 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 3952 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 3953 if (!Legal->isSafeForAnyVectorWidth()) 3954 this->MaxSafeElements = MaxSafeElements; 3955 3956 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 3957 << ".\n"); 3958 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 3959 << ".\n"); 3960 3961 // First analyze the UserVF, fall back if the UserVF should be ignored. 
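  // For example (illustrative): with a widest loop type of 32 bits and a
  // maximum safe dependence distance of 256 bits, MaxSafeElements is 8; a
  // fixed user hint such as vectorize_width(16) is then clamped to VF=8,
  // while an unsafe scalable hint is ignored entirely (see below).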
3962 if (UserVF) { 3963 auto MaxSafeUserVF = 3964 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 3965 3966 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 3967 // If `VF=vscale x N` is safe, then so is `VF=N` 3968 if (UserVF.isScalable()) 3969 return FixedScalableVFPair( 3970 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 3971 3972 return UserVF; 3973 } 3974 3975 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 3976 3977 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 3978 // is better to ignore the hint and let the compiler choose a suitable VF. 3979 if (!UserVF.isScalable()) { 3980 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3981 << " is unsafe, clamping to max safe VF=" 3982 << MaxSafeFixedVF << ".\n"); 3983 ORE->emit([&]() { 3984 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3985 TheLoop->getStartLoc(), 3986 TheLoop->getHeader()) 3987 << "User-specified vectorization factor " 3988 << ore::NV("UserVectorizationFactor", UserVF) 3989 << " is unsafe, clamping to maximum safe vectorization factor " 3990 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 3991 }); 3992 return MaxSafeFixedVF; 3993 } 3994 3995 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 3996 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3997 << " is ignored because scalable vectors are not " 3998 "available.\n"); 3999 ORE->emit([&]() { 4000 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4001 TheLoop->getStartLoc(), 4002 TheLoop->getHeader()) 4003 << "User-specified vectorization factor " 4004 << ore::NV("UserVectorizationFactor", UserVF) 4005 << " is ignored because the target does not support scalable " 4006 "vectors. The compiler will pick a more suitable value."; 4007 }); 4008 } else { 4009 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4010 << " is unsafe. Ignoring scalable UserVF.\n"); 4011 ORE->emit([&]() { 4012 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4013 TheLoop->getStartLoc(), 4014 TheLoop->getHeader()) 4015 << "User-specified vectorization factor " 4016 << ore::NV("UserVectorizationFactor", UserVF) 4017 << " is unsafe. Ignoring the hint to let the compiler pick a " 4018 "more suitable value."; 4019 }); 4020 } 4021 } 4022 4023 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4024 << " / " << WidestType << " bits.\n"); 4025 4026 FixedScalableVFPair Result(ElementCount::getFixed(1), 4027 ElementCount::getScalable(0)); 4028 if (auto MaxVF = 4029 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 4030 MaxSafeFixedVF, FoldTailByMasking)) 4031 Result.FixedVF = MaxVF; 4032 4033 if (auto MaxVF = 4034 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 4035 MaxSafeScalableVF, FoldTailByMasking)) 4036 if (MaxVF.isScalable()) { 4037 Result.ScalableVF = MaxVF; 4038 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 4039 << "\n"); 4040 } 4041 4042 return Result; 4043 } 4044 4045 FixedScalableVFPair 4046 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4047 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4048 // TODO: It may be useful to do since it's still likely to be dynamically 4049 // uniform if the target can skip. 4050 reportVectorizationFailure( 4051 "Not inserting runtime ptr check for divergent target", 4052 "runtime pointer checks needed. 
Not enabled for divergent target", 4053 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4054 return FixedScalableVFPair::getNone(); 4055 } 4056 4057 ScalarEvolution *SE = PSE.getSE(); 4058 unsigned TC = SE->getSmallConstantTripCount(TheLoop); 4059 unsigned MaxTC = PSE.getSmallConstantMaxTripCount(); 4060 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4061 if (TC != MaxTC) 4062 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n'); 4063 if (TC == 1) { 4064 reportVectorizationFailure("Single iteration (non) loop", 4065 "loop trip count is one, irrelevant for vectorization", 4066 "SingleIterationLoop", ORE, TheLoop); 4067 return FixedScalableVFPair::getNone(); 4068 } 4069 4070 // If BTC matches the widest induction type and is -1 then the trip count 4071 // computation will wrap to 0 and the vector trip count will be 0. Do not try 4072 // to vectorize. 4073 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop); 4074 if (!isa<SCEVCouldNotCompute>(BTC) && 4075 BTC->getType()->getScalarSizeInBits() >= 4076 Legal->getWidestInductionType()->getScalarSizeInBits() && 4077 SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC, 4078 SE->getMinusOne(BTC->getType()))) { 4079 reportVectorizationFailure( 4080 "Trip count computation wrapped", 4081 "backedge-taken count is -1, loop trip count wrapped to 0", 4082 "TripCountWrapped", ORE, TheLoop); 4083 return FixedScalableVFPair::getNone(); 4084 } 4085 4086 switch (ScalarEpilogueStatus) { 4087 case CM_ScalarEpilogueAllowed: 4088 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4089 case CM_ScalarEpilogueNotAllowedUsePredicate: 4090 [[fallthrough]]; 4091 case CM_ScalarEpilogueNotNeededUsePredicate: 4092 LLVM_DEBUG( 4093 dbgs() << "LV: vector predicate hint/switch found.\n" 4094 << "LV: Not allowing scalar epilogue, creating predicated " 4095 << "vector loop.\n"); 4096 break; 4097 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4098 // fallthrough as a special case of OptForSize 4099 case CM_ScalarEpilogueNotAllowedOptSize: 4100 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4101 LLVM_DEBUG( 4102 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4103 else 4104 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4105 << "count.\n"); 4106 4107 // Bail if runtime checks are required, which are not good when optimising 4108 // for size. 4109 if (runtimeChecksRequired()) 4110 return FixedScalableVFPair::getNone(); 4111 4112 break; 4113 } 4114 4115 // The only loops we can vectorize without a scalar epilogue, are loops with 4116 // a bottom-test and a single exiting block. We'd have to handle the fact 4117 // that not every instruction executes on the last iteration. This will 4118 // require a lane mask which varies through the vector loop body. (TODO) 4119 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 4120 // If there was a tail-folding hint/switch, but we can't fold the tail by 4121 // masking, fallback to a vectorization with a scalar epilogue. 4122 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4123 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4124 "scalar epilogue instead.\n"); 4125 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4126 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4127 } 4128 return FixedScalableVFPair::getNone(); 4129 } 4130 4131 // Now try the tail folding 4132 4133 // Invalidate interleave groups that require an epilogue if we can't mask 4134 // the interleave-group. 
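  // For example (illustrative): a load group with gaps may touch memory past
  // the elements it actually needs, so it normally relies on a scalar epilogue
  // to keep the final iterations out of the vector body. With the tail folded
  // there is no such epilogue, so unless the target supports masked
  // interleaved accesses the group is released here and its members are
  // costed as separate accesses.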
4135 if (!useMaskedInterleavedAccesses(TTI)) { 4136 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 4137 "No decisions should have been taken at this point"); 4138 // Note: There is no need to invalidate any cost modeling decisions here, as 4139 // none were taken so far. 4140 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4141 } 4142 4143 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true); 4144 4145 // Avoid tail folding if the trip count is known to be a multiple of any VF 4146 // we choose. 4147 std::optional<unsigned> MaxPowerOf2RuntimeVF = 4148 MaxFactors.FixedVF.getFixedValue(); 4149 if (MaxFactors.ScalableVF) { 4150 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 4151 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { 4152 MaxPowerOf2RuntimeVF = std::max<unsigned>( 4153 *MaxPowerOf2RuntimeVF, 4154 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); 4155 } else 4156 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. 4157 } 4158 4159 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { 4160 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && 4161 "MaxFixedVF must be a power of 2"); 4162 unsigned MaxVFtimesIC = 4163 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; 4164 ScalarEvolution *SE = PSE.getSE(); 4165 // Currently only loops with countable exits are vectorized, but calling 4166 // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with 4167 // uncountable exits whilst also ensuring the symbolic maximum and known 4168 // back-edge taken count remain identical for loops with countable exits. 4169 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount(); 4170 assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() && 4171 "Invalid loop count"); 4172 const SCEV *ExitCount = SE->getAddExpr( 4173 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 4174 const SCEV *Rem = SE->getURemExpr( 4175 SE->applyLoopGuards(ExitCount, TheLoop), 4176 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 4177 if (Rem->isZero()) { 4178 // Accept MaxFixedVF if we do not have a tail. 4179 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4180 return MaxFactors; 4181 } 4182 } 4183 4184 // If we don't know the precise trip count, or if the trip count that we 4185 // found modulo the vectorization factor is not zero, try to fold the tail 4186 // by masking. 4187 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4188 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC); 4189 if (foldTailByMasking()) { 4190 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { 4191 LLVM_DEBUG( 4192 dbgs() 4193 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " 4194 "try to generate VP Intrinsics with scalable vector " 4195 "factors only.\n"); 4196 // Tail folded loop using VP intrinsics restricts the VF to be scalable 4197 // for now. 4198 // TODO: extend it for fixed vectors, if required. 4199 assert(MaxFactors.ScalableVF.isScalable() && 4200 "Expected scalable vector factor."); 4201 4202 MaxFactors.FixedVF = ElementCount::getFixed(1); 4203 } 4204 return MaxFactors; 4205 } 4206 4207 // If there was a tail-folding hint/switch, but we can't fold the tail by 4208 // masking, fallback to a vectorization with a scalar epilogue. 
4209 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4210 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4211 "scalar epilogue instead.\n"); 4212 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4213 return MaxFactors; 4214 } 4215 4216 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 4217 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 4218 return FixedScalableVFPair::getNone(); 4219 } 4220 4221 if (TC == 0) { 4222 reportVectorizationFailure( 4223 "unable to calculate the loop count due to complex control flow", 4224 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4225 return FixedScalableVFPair::getNone(); 4226 } 4227 4228 reportVectorizationFailure( 4229 "Cannot optimize for size and vectorize at the same time.", 4230 "cannot optimize for size and vectorize at the same time. " 4231 "Enable vectorization of this loop with '#pragma clang loop " 4232 "vectorize(enable)' when compiling with -Os/-Oz", 4233 "NoTailLoopWithOptForSize", ORE, TheLoop); 4234 return FixedScalableVFPair::getNone(); 4235 } 4236 4237 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 4238 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, 4239 ElementCount MaxSafeVF, bool FoldTailByMasking) { 4240 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 4241 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 4242 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4243 : TargetTransformInfo::RGK_FixedWidthVector); 4244 4245 // Convenience function to return the minimum of two ElementCounts. 4246 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 4247 assert((LHS.isScalable() == RHS.isScalable()) && 4248 "Scalable flags must match"); 4249 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 4250 }; 4251 4252 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 4253 // Note that both WidestRegister and WidestType may not be a powers of 2. 4254 auto MaxVectorElementCount = ElementCount::get( 4255 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), 4256 ComputeScalableMaxVF); 4257 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 4258 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4259 << (MaxVectorElementCount * WidestType) << " bits.\n"); 4260 4261 if (!MaxVectorElementCount) { 4262 LLVM_DEBUG(dbgs() << "LV: The target has no " 4263 << (ComputeScalableMaxVF ? "scalable" : "fixed") 4264 << " vector registers.\n"); 4265 return ElementCount::getFixed(1); 4266 } 4267 4268 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 4269 if (MaxVectorElementCount.isScalable() && 4270 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 4271 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 4272 auto Min = Attr.getVScaleRangeMin(); 4273 WidestRegisterMinEC *= Min; 4274 } 4275 4276 // When a scalar epilogue is required, at least one iteration of the scalar 4277 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a 4278 // max VF that results in a dead vector loop. 4279 if (MaxTripCount > 0 && requiresScalarEpilogue(true)) 4280 MaxTripCount -= 1; 4281 4282 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && 4283 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { 4284 // If upper bound loop trip count (TC) is known at compile time there is no 4285 // point in choosing VF greater than TC (as done in the loop below). 
Select 4286 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is 4287 // scalable, we only fall back on a fixed VF when the TC is less than or 4288 // equal to the known number of lanes. 4289 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); 4290 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 4291 "exceeding the constant trip count: " 4292 << ClampedUpperTripCount << "\n"); 4293 return ElementCount::get( 4294 ClampedUpperTripCount, 4295 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); 4296 } 4297 4298 TargetTransformInfo::RegisterKind RegKind = 4299 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4300 : TargetTransformInfo::RGK_FixedWidthVector; 4301 ElementCount MaxVF = MaxVectorElementCount; 4302 if (MaximizeBandwidth || 4303 (MaximizeBandwidth.getNumOccurrences() == 0 && 4304 (TTI.shouldMaximizeVectorBandwidth(RegKind) || 4305 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { 4306 auto MaxVectorElementCountMaxBW = ElementCount::get( 4307 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), 4308 ComputeScalableMaxVF); 4309 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 4310 4311 // Collect all viable vectorization factors larger than the default MaxVF 4312 // (i.e. MaxVectorElementCount). 4313 SmallVector<ElementCount, 8> VFs; 4314 for (ElementCount VS = MaxVectorElementCount * 2; 4315 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 4316 VFs.push_back(VS); 4317 4318 // For each VF calculate its register usage. 4319 auto RUs = calculateRegisterUsage(VFs); 4320 4321 // Select the largest VF which doesn't require more registers than existing 4322 // ones. 4323 for (int I = RUs.size() - 1; I >= 0; --I) { 4324 const auto &MLU = RUs[I].MaxLocalUsers; 4325 if (all_of(MLU, [&](decltype(MLU.front()) &LU) { 4326 return LU.second <= TTI.getNumberOfRegisters(LU.first); 4327 })) { 4328 MaxVF = VFs[I]; 4329 break; 4330 } 4331 } 4332 if (ElementCount MinVF = 4333 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 4334 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 4335 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 4336 << ") with target's minimum: " << MinVF << '\n'); 4337 MaxVF = MinVF; 4338 } 4339 } 4340 4341 // Invalidate any widening decisions we might have made, in case the loop 4342 // requires prediction (decided later), but we have already made some 4343 // load/store widening decisions. 4344 invalidateCostModelingDecisions(); 4345 } 4346 return MaxVF; 4347 } 4348 4349 /// Convenience function that returns the value of vscale_range iff 4350 /// vscale_range.min == vscale_range.max or otherwise returns the value 4351 /// returned by the corresponding TTI method. 4352 static std::optional<unsigned> 4353 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 4354 const Function *Fn = L->getHeader()->getParent(); 4355 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 4356 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 4357 auto Min = Attr.getVScaleRangeMin(); 4358 auto Max = Attr.getVScaleRangeMax(); 4359 if (Max && Min == Max) 4360 return Max; 4361 } 4362 4363 return TTI.getVScaleForTuning(); 4364 } 4365 4366 /// This function attempts to return a value that represents the vectorization 4367 /// factor at runtime. For fixed-width VFs we know this precisely at compile 4368 /// time, but for scalable VFs we calculate it based on an estimate of the 4369 /// vscale value. 
4370 static unsigned getEstimatedRuntimeVF(const Loop *L, 4371 const TargetTransformInfo &TTI, 4372 ElementCount VF) { 4373 unsigned EstimatedVF = VF.getKnownMinValue(); 4374 if (VF.isScalable()) 4375 if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI)) 4376 EstimatedVF *= *VScale; 4377 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); 4378 return EstimatedVF; 4379 } 4380 4381 bool LoopVectorizationPlanner::isMoreProfitable( 4382 const VectorizationFactor &A, const VectorizationFactor &B, 4383 const unsigned MaxTripCount) const { 4384 InstructionCost CostA = A.Cost; 4385 InstructionCost CostB = B.Cost; 4386 4387 // Improve estimate for the vector width if it is scalable. 4388 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 4389 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 4390 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 4391 if (A.Width.isScalable()) 4392 EstimatedWidthA *= *VScale; 4393 if (B.Width.isScalable()) 4394 EstimatedWidthB *= *VScale; 4395 } 4396 4397 // Assume vscale may be larger than 1 (or the value being tuned for), 4398 // so that scalable vectorization is slightly favorable over fixed-width 4399 // vectorization. 4400 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && 4401 A.Width.isScalable() && !B.Width.isScalable(); 4402 4403 auto CmpFn = [PreferScalable](const InstructionCost &LHS, 4404 const InstructionCost &RHS) { 4405 return PreferScalable ? LHS <= RHS : LHS < RHS; 4406 }; 4407 4408 // To avoid the need for FP division: 4409 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB) 4410 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA) 4411 if (!MaxTripCount) 4412 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); 4413 4414 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 4415 InstructionCost VectorCost, 4416 InstructionCost ScalarCost) { 4417 // If the trip count is a known (possibly small) constant, the trip count 4418 // will be rounded up to an integer number of iterations under 4419 // FoldTailByMasking. The total cost in that case will be 4420 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 4421 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 4422 // some extra overheads, but for the purpose of comparing the costs of 4423 // different VFs we can use this to compare the total loop-body cost 4424 // expected after vectorization. 
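    // Worked example with illustrative numbers: MaxTripCount = 10, VF = 4,
    // VectorCost = 20, ScalarCost = 8. With tail folding the estimate is
    // 20 * ceil(10 / 4) = 60; without it, it is 20 * (10 / 4) + 8 * (10 % 4)
    // = 40 + 16 = 56.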
4425 if (CM.foldTailByMasking()) 4426 return VectorCost * divideCeil(MaxTripCount, VF); 4427 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF); 4428 }; 4429 4430 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); 4431 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost); 4432 return CmpFn(RTCostA, RTCostB); 4433 } 4434 4435 bool LoopVectorizationPlanner::isMoreProfitable( 4436 const VectorizationFactor &A, const VectorizationFactor &B) const { 4437 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); 4438 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount); 4439 } 4440 4441 void LoopVectorizationPlanner::emitInvalidCostRemarks( 4442 OptimizationRemarkEmitter *ORE) { 4443 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>; 4444 SmallVector<RecipeVFPair> InvalidCosts; 4445 for (const auto &Plan : VPlans) { 4446 for (ElementCount VF : Plan->vectorFactors()) { 4447 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), 4448 CM); 4449 precomputeCosts(*Plan, VF, CostCtx); 4450 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); 4451 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4452 for (auto &R : *VPBB) { 4453 if (!R.cost(VF, CostCtx).isValid()) 4454 InvalidCosts.emplace_back(&R, VF); 4455 } 4456 } 4457 } 4458 } 4459 if (InvalidCosts.empty()) 4460 return; 4461 4462 // Emit a report of VFs with invalid costs in the loop. 4463 4464 // Group the remarks per recipe, keeping the recipe order from InvalidCosts. 4465 DenseMap<VPRecipeBase *, unsigned> Numbering; 4466 unsigned I = 0; 4467 for (auto &Pair : InvalidCosts) 4468 if (!Numbering.count(Pair.first)) 4469 Numbering[Pair.first] = I++; 4470 4471 // Sort the list, first on recipe(number) then on VF. 4472 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) { 4473 if (Numbering[A.first] != Numbering[B.first]) 4474 return Numbering[A.first] < Numbering[B.first]; 4475 const auto &LHS = A.second; 4476 const auto &RHS = B.second; 4477 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 4478 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 4479 }); 4480 4481 // For a list of ordered recipe-VF pairs: 4482 // [(load, VF1), (load, VF2), (store, VF1)] 4483 // group the recipes together to emit separate remarks for: 4484 // load (VF1, VF2) 4485 // store (VF1) 4486 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts); 4487 auto Subset = ArrayRef<RecipeVFPair>(); 4488 do { 4489 if (Subset.empty()) 4490 Subset = Tail.take_front(1); 4491 4492 VPRecipeBase *R = Subset.front().first; 4493 4494 unsigned Opcode = 4495 TypeSwitch<const VPRecipeBase *, unsigned>(R) 4496 .Case<VPHeaderPHIRecipe>( 4497 [](const auto *R) { return Instruction::PHI; }) 4498 .Case<VPWidenSelectRecipe>( 4499 [](const auto *R) { return Instruction::Select; }) 4500 .Case<VPWidenStoreRecipe>( 4501 [](const auto *R) { return Instruction::Store; }) 4502 .Case<VPWidenLoadRecipe>( 4503 [](const auto *R) { return Instruction::Load; }) 4504 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>( 4505 [](const auto *R) { return Instruction::Call; }) 4506 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe, 4507 VPWidenCastRecipe>( 4508 [](const auto *R) { return R->getOpcode(); }) 4509 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) { 4510 return R->getStoredValues().empty() ? 
Instruction::Load 4511 : Instruction::Store; 4512 }); 4513 4514 // If the next recipe is different, or if there are no other pairs, 4515 // emit a remark for the collated subset. e.g. 4516 // [(load, VF1), (load, VF2))] 4517 // to emit: 4518 // remark: invalid costs for 'load' at VF=(VF1, VF2) 4519 if (Subset == Tail || Tail[Subset.size()].first != R) { 4520 std::string OutString; 4521 raw_string_ostream OS(OutString); 4522 assert(!Subset.empty() && "Unexpected empty range"); 4523 OS << "Recipe with invalid costs prevented vectorization at VF=("; 4524 for (const auto &Pair : Subset) 4525 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; 4526 OS << "):"; 4527 if (Opcode == Instruction::Call) { 4528 StringRef Name = ""; 4529 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) { 4530 Name = Int->getIntrinsicName(); 4531 } else { 4532 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R); 4533 Function *CalledFn = 4534 WidenCall ? WidenCall->getCalledScalarFunction() 4535 : cast<Function>(R->getOperand(R->getNumOperands() - 1) 4536 ->getLiveInIRValue()); 4537 Name = CalledFn->getName(); 4538 } 4539 OS << " call to " << Name; 4540 } else 4541 OS << " " << Instruction::getOpcodeName(Opcode); 4542 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr, 4543 R->getDebugLoc()); 4544 Tail = Tail.drop_front(Subset.size()); 4545 Subset = {}; 4546 } else 4547 // Grow the subset by one element 4548 Subset = Tail.take_front(Subset.size() + 1); 4549 } while (!Tail.empty()); 4550 } 4551 4552 /// Check if any recipe of \p Plan will generate a vector value, which will be 4553 /// assigned a vector register. 4554 static bool willGenerateVectors(VPlan &Plan, ElementCount VF, 4555 const TargetTransformInfo &TTI) { 4556 assert(VF.isVector() && "Checking a scalar VF?"); 4557 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); 4558 DenseSet<VPRecipeBase *> EphemeralRecipes; 4559 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes); 4560 // Set of already visited types. 4561 DenseSet<Type *> Visited; 4562 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( 4563 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { 4564 for (VPRecipeBase &R : *VPBB) { 4565 if (EphemeralRecipes.contains(&R)) 4566 continue; 4567 // Continue early if the recipe is considered to not produce a vector 4568 // result. Note that this includes VPInstruction where some opcodes may 4569 // produce a vector, to preserve existing behavior as VPInstructions model 4570 // aspects not directly mapped to existing IR instructions. 
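      // For instance, a VPReplicateRecipe produces one scalar value per lane
      // and does not occupy a vector register, so it is skipped by the switch
      // below, whereas a VPWidenLoadRecipe produces a value VF lanes wide and
      // falls through to the type-based check.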
4571 switch (R.getVPDefID()) { 4572 case VPDef::VPDerivedIVSC: 4573 case VPDef::VPScalarIVStepsSC: 4574 case VPDef::VPScalarCastSC: 4575 case VPDef::VPReplicateSC: 4576 case VPDef::VPInstructionSC: 4577 case VPDef::VPCanonicalIVPHISC: 4578 case VPDef::VPVectorPointerSC: 4579 case VPDef::VPReverseVectorPointerSC: 4580 case VPDef::VPExpandSCEVSC: 4581 case VPDef::VPEVLBasedIVPHISC: 4582 case VPDef::VPPredInstPHISC: 4583 case VPDef::VPBranchOnMaskSC: 4584 continue; 4585 case VPDef::VPReductionSC: 4586 case VPDef::VPActiveLaneMaskPHISC: 4587 case VPDef::VPWidenCallSC: 4588 case VPDef::VPWidenCanonicalIVSC: 4589 case VPDef::VPWidenCastSC: 4590 case VPDef::VPWidenGEPSC: 4591 case VPDef::VPWidenIntrinsicSC: 4592 case VPDef::VPWidenSC: 4593 case VPDef::VPWidenSelectSC: 4594 case VPDef::VPBlendSC: 4595 case VPDef::VPFirstOrderRecurrencePHISC: 4596 case VPDef::VPWidenPHISC: 4597 case VPDef::VPWidenIntOrFpInductionSC: 4598 case VPDef::VPWidenPointerInductionSC: 4599 case VPDef::VPReductionPHISC: 4600 case VPDef::VPInterleaveSC: 4601 case VPDef::VPWidenLoadEVLSC: 4602 case VPDef::VPWidenLoadSC: 4603 case VPDef::VPWidenStoreEVLSC: 4604 case VPDef::VPWidenStoreSC: 4605 break; 4606 default: 4607 llvm_unreachable("unhandled recipe"); 4608 } 4609 4610 auto WillWiden = [&TTI, VF](Type *ScalarTy) { 4611 Type *VectorTy = toVectorTy(ScalarTy, VF); 4612 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); 4613 if (!NumLegalParts) 4614 return false; 4615 if (VF.isScalable()) { 4616 // <vscale x 1 x iN> is assumed to be profitable over iN because 4617 // scalable registers are a distinct register class from scalar 4618 // ones. If we ever find a target which wants to lower scalable 4619 // vectors back to scalars, we'll need to update this code to 4620 // explicitly ask TTI about the register class uses for each part. 4621 return NumLegalParts <= VF.getKnownMinValue(); 4622 } 4623 // Two or more parts that share a register - are vectorized. 4624 return NumLegalParts < VF.getKnownMinValue(); 4625 }; 4626 4627 // If no def nor is a store, e.g., branches, continue - no value to check. 4628 if (R.getNumDefinedValues() == 0 && 4629 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>( 4630 &R)) 4631 continue; 4632 // For multi-def recipes, currently only interleaved loads, suffice to 4633 // check first def only. 4634 // For stores check their stored value; for interleaved stores suffice 4635 // the check first stored value only. In all cases this is the second 4636 // operand. 4637 VPValue *ToCheck = 4638 R.getNumDefinedValues() >= 1 ? 
R.getVPValue(0) : R.getOperand(1); 4639 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck); 4640 if (!Visited.insert({ScalarTy}).second) 4641 continue; 4642 if (WillWiden(ScalarTy)) 4643 return true; 4644 } 4645 } 4646 4647 return false; 4648 } 4649 4650 #ifndef NDEBUG 4651 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { 4652 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); 4653 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 4654 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 4655 assert(any_of(VPlans, 4656 [](std::unique_ptr<VPlan> &P) { 4657 return P->hasVF(ElementCount::getFixed(1)); 4658 }) && 4659 "Expected Scalar VF to be a candidate"); 4660 4661 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 4662 ExpectedCost); 4663 VectorizationFactor ChosenFactor = ScalarCost; 4664 4665 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 4666 if (ForceVectorization && 4667 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) { 4668 // Ignore scalar width, because the user explicitly wants vectorization. 4669 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 4670 // evaluation. 4671 ChosenFactor.Cost = InstructionCost::getMax(); 4672 } 4673 4674 for (auto &P : VPlans) { 4675 for (ElementCount VF : P->vectorFactors()) { 4676 // The cost for scalar VF=1 is already calculated, so ignore it. 4677 if (VF.isScalar()) 4678 continue; 4679 4680 InstructionCost C = CM.expectedCost(VF); 4681 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); 4682 4683 unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width); 4684 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF 4685 << " costs: " << (Candidate.Cost / Width)); 4686 if (VF.isScalable()) 4687 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 4688 << getVScaleForTuning(OrigLoop, TTI).value_or(1) 4689 << ")"); 4690 LLVM_DEBUG(dbgs() << ".\n"); 4691 4692 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { 4693 LLVM_DEBUG( 4694 dbgs() 4695 << "LV: Not considering vector loop of width " << VF 4696 << " because it will not generate any vector instructions.\n"); 4697 continue; 4698 } 4699 4700 if (isMoreProfitable(Candidate, ChosenFactor)) 4701 ChosenFactor = Candidate; 4702 } 4703 } 4704 4705 if (!EnableCondStoresVectorization && CM.hasPredStores()) { 4706 reportVectorizationFailure( 4707 "There are conditional stores.", 4708 "store that is conditionally executed prevents vectorization", 4709 "ConditionalStore", ORE, OrigLoop); 4710 ChosenFactor = ScalarCost; 4711 } 4712 4713 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 4714 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() 4715 << "LV: Vectorization seems to be not beneficial, " 4716 << "but was forced by a user.\n"); 4717 return ChosenFactor; 4718 } 4719 #endif 4720 4721 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( 4722 ElementCount VF) const { 4723 // Cross iteration phis such as reductions need special handling and are 4724 // currently unsupported. 4725 if (any_of(OrigLoop->getHeader()->phis(), 4726 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) 4727 return false; 4728 4729 // Phis with uses outside of the loop require special handling and are 4730 // currently unsupported. 4731 for (const auto &Entry : Legal->getInductionVars()) { 4732 // Look for uses of the value of the induction at the last iteration. 
4733     Value *PostInc =
4734         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4735     for (User *U : PostInc->users())
4736       if (!OrigLoop->contains(cast<Instruction>(U)))
4737         return false;
4738     // Look for uses of penultimate value of the induction.
4739     for (User *U : Entry.first->users())
4740       if (!OrigLoop->contains(cast<Instruction>(U)))
4741         return false;
4742   }
4743
4744   // Epilogue vectorization code has not been audited to ensure it handles
4745   // non-latch exits properly. It may be fine, but it needs to be audited and
4746   // tested.
4747   // TODO: Add support for loops with an early exit.
4748   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4749     return false;
4750
4751   return true;
4752 }
4753
4754 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4755     const ElementCount VF, const unsigned IC) const {
4756   // FIXME: We need a much better cost-model to take different parameters such
4757   // as register pressure, code size increase and cost of extra branches into
4758   // account. For now we apply a very crude heuristic and only consider loops
4759   // with vectorization factors larger than a certain value.
4760
4761   // Allow the target to opt out entirely.
4762   if (!TTI.preferEpilogueVectorization())
4763     return false;
4764
4765   // We also consider epilogue vectorization unprofitable for targets that don't
4766   // consider interleaving beneficial (e.g. MVE).
4767   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4768     return false;
4769
4770   // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4771   // VFs when deciding profitability.
4772   // See related "TODO: extend to support scalable VFs." in
4773   // selectEpilogueVectorizationFactor.
4774   unsigned Multiplier = VF.isFixed() ? IC : 1;
4775   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4776                                 ? EpilogueVectorizationMinVF
4777                                 : TTI.getEpilogueVectorizationMinVF();
4778   return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4779 }
4780
4781 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4782     const ElementCount MainLoopVF, unsigned IC) {
4783   VectorizationFactor Result = VectorizationFactor::Disabled();
4784   if (!EnableEpilogueVectorization) {
4785     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4786     return Result;
4787   }
4788
4789   if (!CM.isScalarEpilogueAllowed()) {
4790     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4791                          "epilogue is allowed.\n");
4792     return Result;
4793   }
4794
4795   // Not really a cost consideration, but check for unsupported cases here to
4796   // simplify the logic.
4797 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 4798 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 4799 "is not a supported candidate.\n"); 4800 return Result; 4801 } 4802 4803 if (EpilogueVectorizationForceVF > 1) { 4804 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 4805 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 4806 if (hasPlanWithVF(ForcedEC)) 4807 return {ForcedEC, 0, 0}; 4808 4809 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 4810 "viable.\n"); 4811 return Result; 4812 } 4813 4814 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 4815 OrigLoop->getHeader()->getParent()->hasMinSize()) { 4816 LLVM_DEBUG( 4817 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 4818 return Result; 4819 } 4820 4821 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) { 4822 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 4823 "this loop\n"); 4824 return Result; 4825 } 4826 4827 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 4828 // the main loop handles 8 lanes per iteration. We could still benefit from 4829 // vectorizing the epilogue loop with VF=4. 4830 ElementCount EstimatedRuntimeVF = 4831 ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF)); 4832 4833 ScalarEvolution &SE = *PSE.getSE(); 4834 Type *TCType = Legal->getWidestInductionType(); 4835 const SCEV *RemainingIterations = nullptr; 4836 unsigned MaxTripCount = 0; 4837 for (auto &NextVF : ProfitableVFs) { 4838 // Skip candidate VFs without a corresponding VPlan. 4839 if (!hasPlanWithVF(NextVF.Width)) 4840 continue; 4841 4842 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable 4843 // vectors) or > the VF of the main loop (fixed vectors). 4844 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 4845 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 4846 (NextVF.Width.isScalable() && 4847 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) || 4848 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && 4849 ElementCount::isKnownGT(NextVF.Width, MainLoopVF))) 4850 continue; 4851 4852 // If NextVF is greater than the number of remaining iterations, the 4853 // epilogue loop would be dead. Skip such factors. 4854 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 4855 // TODO: extend to support scalable VFs. 
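      // Illustrative example: with MainLoopVF = 16, IC = 2 and a trip count
      // of 1000, RemainingIterations = 1000 % 32 = 8, so a candidate epilogue
      // VF of 16 is skipped by the check below (16 > 8) while VF = 8 is still
      // considered.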
4856 if (!RemainingIterations) { 4857 const SCEV *TC = vputils::getSCEVExprForVPValue( 4858 getPlanFor(NextVF.Width).getTripCount(), SE); 4859 assert(!isa<SCEVCouldNotCompute>(TC) && 4860 "Trip count SCEV must be computable"); 4861 RemainingIterations = SE.getURemExpr( 4862 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 4863 MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1; 4864 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, 4865 SE.getConstant(TCType, MaxTripCount))) { 4866 MaxTripCount = 4867 SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); 4868 } 4869 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " 4870 << MaxTripCount << "\n"); 4871 } 4872 if (SE.isKnownPredicate( 4873 CmpInst::ICMP_UGT, 4874 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 4875 RemainingIterations)) 4876 continue; 4877 } 4878 4879 if (Result.Width.isScalar() || 4880 isMoreProfitable(NextVF, Result, MaxTripCount)) 4881 Result = NextVF; 4882 } 4883 4884 if (Result != VectorizationFactor::Disabled()) 4885 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 4886 << Result.Width << "\n"); 4887 return Result; 4888 } 4889 4890 std::pair<unsigned, unsigned> 4891 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 4892 unsigned MinWidth = -1U; 4893 unsigned MaxWidth = 8; 4894 const DataLayout &DL = TheFunction->getDataLayout(); 4895 // For in-loop reductions, no element types are added to ElementTypesInLoop 4896 // if there are no loads/stores in the loop. In this case, check through the 4897 // reduction variables to determine the maximum width. 4898 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 4899 // Reset MaxWidth so that we can find the smallest type used by recurrences 4900 // in the loop. 4901 MaxWidth = -1U; 4902 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 4903 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 4904 // When finding the min width used by the recurrence we need to account 4905 // for casts on the input operands of the recurrence. 4906 MaxWidth = std::min<unsigned>( 4907 MaxWidth, std::min<unsigned>( 4908 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 4909 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 4910 } 4911 } else { 4912 for (Type *T : ElementTypesInLoop) { 4913 MinWidth = std::min<unsigned>( 4914 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 4915 MaxWidth = std::max<unsigned>( 4916 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 4917 } 4918 } 4919 return {MinWidth, MaxWidth}; 4920 } 4921 4922 void LoopVectorizationCostModel::collectElementTypesForWidening() { 4923 ElementTypesInLoop.clear(); 4924 // For each block. 4925 for (BasicBlock *BB : TheLoop->blocks()) { 4926 // For each instruction in the loop. 4927 for (Instruction &I : BB->instructionsWithoutDebug()) { 4928 Type *T = I.getType(); 4929 4930 // Skip ignored values. 4931 if (ValuesToIgnore.count(&I)) 4932 continue; 4933 4934 // Only examine Loads, Stores and PHINodes. 4935 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 4936 continue; 4937 4938 // Examine PHI nodes that are reduction variables. Update the type to 4939 // account for the recurrence type. 
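      // For example (illustrative), an i32 accumulator PHI whose reduction is
      // known to operate on values that fit in i8 has an i8 recurrence type;
      // recording i8 here can allow a wider VF than the i32 PHI type alone
      // would suggest.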
4940 if (auto *PN = dyn_cast<PHINode>(&I)) { 4941 if (!Legal->isReductionVariable(PN)) 4942 continue; 4943 const RecurrenceDescriptor &RdxDesc = 4944 Legal->getReductionVars().find(PN)->second; 4945 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 4946 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 4947 RdxDesc.getRecurrenceType(), 4948 TargetTransformInfo::ReductionFlags())) 4949 continue; 4950 T = RdxDesc.getRecurrenceType(); 4951 } 4952 4953 // Examine the stored values. 4954 if (auto *ST = dyn_cast<StoreInst>(&I)) 4955 T = ST->getValueOperand()->getType(); 4956 4957 assert(T->isSized() && 4958 "Expected the load/store/recurrence type to be sized"); 4959 4960 ElementTypesInLoop.insert(T); 4961 } 4962 } 4963 } 4964 4965 unsigned 4966 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 4967 InstructionCost LoopCost) { 4968 // -- The interleave heuristics -- 4969 // We interleave the loop in order to expose ILP and reduce the loop overhead. 4970 // There are many micro-architectural considerations that we can't predict 4971 // at this level. For example, frontend pressure (on decode or fetch) due to 4972 // code size, or the number and capabilities of the execution ports. 4973 // 4974 // We use the following heuristics to select the interleave count: 4975 // 1. If the code has reductions, then we interleave to break the cross 4976 // iteration dependency. 4977 // 2. If the loop is really small, then we interleave to reduce the loop 4978 // overhead. 4979 // 3. We don't interleave if we think that we will spill registers to memory 4980 // due to the increased register pressure. 4981 4982 if (!isScalarEpilogueAllowed()) 4983 return 1; 4984 4985 // Do not interleave if EVL is preferred and no User IC is specified. 4986 if (foldTailWithEVL()) { 4987 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " 4988 "Unroll factor forced to be 1.\n"); 4989 return 1; 4990 } 4991 4992 // We used the distance for the interleave count. 4993 if (!Legal->isSafeForAnyVectorWidth()) 4994 return 1; 4995 4996 // We don't attempt to perform interleaving for loops with uncountable early 4997 // exits because the VPInstruction::AnyOf code cannot currently handle 4998 // multiple parts. 4999 if (Legal->hasUncountableEarlyExit()) 5000 return 1; 5001 5002 auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); 5003 const bool HasReductions = !Legal->getReductionVars().empty(); 5004 5005 // If we did not calculate the cost for VF (because the user selected the VF) 5006 // then we calculate the cost of VF here. 5007 if (LoopCost == 0) { 5008 LoopCost = expectedCost(VF); 5009 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 5010 5011 // Loop body is free and there is no need for interleaving. 5012 if (LoopCost == 0) 5013 return 1; 5014 } 5015 5016 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5017 // We divide by these constants so assume that we have at least one 5018 // instruction that uses at least one register. 5019 for (auto &Pair : R.MaxLocalUsers) { 5020 Pair.second = std::max(Pair.second, 1U); 5021 } 5022 5023 // We calculate the interleave count using the following formula. 5024 // Subtract the number of loop invariants from the number of available 5025 // registers. These registers are used by all of the interleaved instances. 5026 // Next, divide the remaining registers by the number of registers that is 5027 // required by the loop, in order to estimate how many parallel instances 5028 // fit without causing spills. 
All of this is rounded down if necessary to be 5029 // a power of two. We want power of two interleave count to simplify any 5030 // addressing operations or alignment considerations. 5031 // We also want power of two interleave counts to ensure that the induction 5032 // variable of the vector loop wraps to zero, when tail is folded by masking; 5033 // this currently happens when OptForSize, in which case IC is set to 1 above. 5034 unsigned IC = UINT_MAX; 5035 5036 for (const auto &Pair : R.MaxLocalUsers) { 5037 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first); 5038 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5039 << " registers of " 5040 << TTI.getRegisterClassName(Pair.first) 5041 << " register class\n"); 5042 if (VF.isScalar()) { 5043 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5044 TargetNumRegisters = ForceTargetNumScalarRegs; 5045 } else { 5046 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5047 TargetNumRegisters = ForceTargetNumVectorRegs; 5048 } 5049 unsigned MaxLocalUsers = Pair.second; 5050 unsigned LoopInvariantRegs = 0; 5051 if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end()) 5052 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first]; 5053 5054 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 5055 MaxLocalUsers); 5056 // Don't count the induction variable as interleaved. 5057 if (EnableIndVarRegisterHeur) { 5058 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5059 std::max(1U, (MaxLocalUsers - 1))); 5060 } 5061 5062 IC = std::min(IC, TmpIC); 5063 } 5064 5065 // Clamp the interleave ranges to reasonable counts. 5066 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5067 5068 // Check if the user has overridden the max. 5069 if (VF.isScalar()) { 5070 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5071 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5072 } else { 5073 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5074 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5075 } 5076 5077 unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF); 5078 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5079 if (KnownTC > 0) { 5080 // At least one iteration must be scalar when this constraint holds. So the 5081 // maximum available iterations for interleaving is one less. 5082 unsigned AvailableTC = 5083 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC; 5084 5085 // If trip count is known we select between two prospective ICs, where 5086 // 1) the aggressive IC is capped by the trip count divided by VF 5087 // 2) the conservative IC is capped by the trip count divided by (VF * 2) 5088 // The final IC is selected in a way that the epilogue loop trip count is 5089 // minimized while maximizing the IC itself, so that we either run the 5090 // vector loop at least once if it generates a small epilogue loop, or else 5091 // we run the vector loop at least twice. 
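    // Worked example (illustrative numbers): AvailableTC = 48, EstimatedVF =
    // 8, MaxInterleaveCount = 8. Then InterleaveCountUB = bit_floor(min(6, 8))
    // = 4 and InterleaveCountLB = bit_floor(min(3, 8)) = 2; the scalar tails
    // are 48 % 32 = 16 vs. 48 % 16 = 0, so the conservative LB of 2 is kept.
    // With AvailableTC = 64 instead, both tails are 0 and the UB of 8 is used.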
5092 5093 unsigned InterleaveCountUB = bit_floor( 5094 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount))); 5095 unsigned InterleaveCountLB = bit_floor(std::max( 5096 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); 5097 MaxInterleaveCount = InterleaveCountLB; 5098 5099 if (InterleaveCountUB != InterleaveCountLB) { 5100 unsigned TailTripCountUB = 5101 (AvailableTC % (EstimatedVF * InterleaveCountUB)); 5102 unsigned TailTripCountLB = 5103 (AvailableTC % (EstimatedVF * InterleaveCountLB)); 5104 // If both produce same scalar tail, maximize the IC to do the same work 5105 // in fewer vector loop iterations 5106 if (TailTripCountUB == TailTripCountLB) 5107 MaxInterleaveCount = InterleaveCountUB; 5108 } 5109 } else if (BestKnownTC && *BestKnownTC > 0) { 5110 // At least one iteration must be scalar when this constraint holds. So the 5111 // maximum available iterations for interleaving is one less. 5112 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) 5113 ? (*BestKnownTC) - 1 5114 : *BestKnownTC; 5115 5116 // If trip count is an estimated compile time constant, limit the 5117 // IC to be capped by the trip count divided by VF * 2, such that the vector 5118 // loop runs at least twice to make interleaving seem profitable when there 5119 // is an epilogue loop present. Since exact Trip count is not known we 5120 // choose to be conservative in our IC estimate. 5121 MaxInterleaveCount = bit_floor(std::max( 5122 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); 5123 } 5124 5125 assert(MaxInterleaveCount > 0 && 5126 "Maximum interleave count must be greater than 0"); 5127 5128 // Clamp the calculated IC to be between the 1 and the max interleave count 5129 // that the target and trip count allows. 5130 if (IC > MaxInterleaveCount) 5131 IC = MaxInterleaveCount; 5132 else 5133 // Make sure IC is greater than 0. 5134 IC = std::max(1u, IC); 5135 5136 assert(IC > 0 && "Interleave count must be greater than 0."); 5137 5138 // Interleave if we vectorized this loop and there is a reduction that could 5139 // benefit from interleaving. 5140 if (VF.isVector() && HasReductions) { 5141 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5142 return IC; 5143 } 5144 5145 // For any scalar loop that either requires runtime checks or predication we 5146 // are better off leaving this to the unroller. Note that if we've already 5147 // vectorized the loop we will have done the runtime check and so interleaving 5148 // won't require further checks. 5149 bool ScalarInterleavingRequiresPredication = 5150 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5151 return Legal->blockNeedsPredication(BB); 5152 })); 5153 bool ScalarInterleavingRequiresRuntimePointerCheck = 5154 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5155 5156 // We want to interleave small loops in order to reduce the loop overhead and 5157 // potentially expose ILP opportunities. 
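  // As an illustration (assuming the default small-loop-cost threshold of 20):
  // a loop body costing 4 counts as small, and SmallIC below becomes
  // min(IC, bit_floor(20 / 4)) = min(IC, 4), i.e. at most 4-way interleaving
  // purely to amortize the loop overhead.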
5158 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5159 << "LV: IC is " << IC << '\n' 5160 << "LV: VF is " << VF << '\n'); 5161 const bool AggressivelyInterleaveReductions = 5162 TTI.enableAggressiveInterleaving(HasReductions); 5163 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5164 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5165 // We assume that the cost overhead is 1 and we use the cost model 5166 // to estimate the cost of the loop and interleave until the cost of the 5167 // loop overhead is about 5% of the cost of the loop. 5168 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5169 SmallLoopCost / *LoopCost.getValue())); 5170 5171 // Interleave until store/load ports (estimated by max interleave count) are 5172 // saturated. 5173 unsigned NumStores = Legal->getNumStores(); 5174 unsigned NumLoads = Legal->getNumLoads(); 5175 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5176 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5177 5178 // There is little point in interleaving for reductions containing selects 5179 // and compares when VF=1 since it may just create more overhead than it's 5180 // worth for loops with small trip counts. This is because we still have to 5181 // do the final reduction after the loop. 5182 bool HasSelectCmpReductions = 5183 HasReductions && 5184 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5185 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5186 RecurKind RK = RdxDesc.getRecurrenceKind(); 5187 return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || 5188 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK); 5189 }); 5190 if (HasSelectCmpReductions) { 5191 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5192 return 1; 5193 } 5194 5195 // If we have a scalar reduction (vector reductions are already dealt with 5196 // by this point), we can increase the critical path length if the loop 5197 // we're interleaving is inside another loop. For tree-wise reductions 5198 // set the limit to 2, and for ordered reductions it's best to disable 5199 // interleaving entirely. 5200 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5201 bool HasOrderedReductions = 5202 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5203 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5204 return RdxDesc.isOrdered(); 5205 }); 5206 if (HasOrderedReductions) { 5207 LLVM_DEBUG( 5208 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5209 return 1; 5210 } 5211 5212 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5213 SmallIC = std::min(SmallIC, F); 5214 StoresIC = std::min(StoresIC, F); 5215 LoadsIC = std::min(LoadsIC, F); 5216 } 5217 5218 if (EnableLoadStoreRuntimeInterleave && 5219 std::max(StoresIC, LoadsIC) > SmallIC) { 5220 LLVM_DEBUG( 5221 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5222 return std::max(StoresIC, LoadsIC); 5223 } 5224 5225 // If there are scalar reductions and TTI has enabled aggressive 5226 // interleaving for reductions, we will interleave to expose ILP. 5227 if (VF.isScalar() && AggressivelyInterleaveReductions) { 5228 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5229 // Interleave no less than SmallIC but not as aggressive as the normal IC 5230 // to satisfy the rare situation when resources are too limited. 
5231 return std::max(IC / 2, SmallIC); 5232 } 5233 5234 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5235 return SmallIC; 5236 } 5237 5238 // Interleave if this is a large loop (small loops are already dealt with by 5239 // this point) that could benefit from interleaving. 5240 if (AggressivelyInterleaveReductions) { 5241 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5242 return IC; 5243 } 5244 5245 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5246 return 1; 5247 } 5248 5249 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5250 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5251 // This function calculates the register usage by measuring the highest number 5252 // of values that are alive at a single location. Obviously, this is a very 5253 // rough estimation. We scan the loop in a topological order in order and 5254 // assign a number to each instruction. We use RPO to ensure that defs are 5255 // met before their users. We assume that each instruction that has in-loop 5256 // users starts an interval. We record every time that an in-loop value is 5257 // used, so we have a list of the first and last occurrences of each 5258 // instruction. Next, we transpose this data structure into a multi map that 5259 // holds the list of intervals that *end* at a specific location. This multi 5260 // map allows us to perform a linear search. We scan the instructions linearly 5261 // and record each time that a new interval starts, by placing it in a set. 5262 // If we find this value in the multi-map then we remove it from the set. 5263 // The max register usage is the maximum size of the set. 5264 // We also search for instructions that are defined outside the loop, but are 5265 // used inside the loop. We need this number separately from the max-interval 5266 // usage number because when we unroll, loop-invariant values do not take 5267 // more register. 5268 LoopBlocksDFS DFS(TheLoop); 5269 DFS.perform(LI); 5270 5271 RegisterUsage RU; 5272 5273 // Each 'key' in the map opens a new interval. The values 5274 // of the map are the index of the 'last seen' usage of the 5275 // instruction that is the key. 5276 using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>; 5277 5278 // Maps instruction to its index. 5279 SmallVector<Instruction *, 64> IdxToInstr; 5280 // Marks the end of each interval. 5281 IntervalMap EndPoint; 5282 // Saves the list of instruction indices that are used in the loop. 5283 SmallPtrSet<Instruction *, 8> Ends; 5284 // Saves the list of values that are used in the loop but are defined outside 5285 // the loop (not including non-instruction values such as arguments and 5286 // constants). 5287 SmallSetVector<Instruction *, 8> LoopInvariants; 5288 5289 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5290 for (Instruction &I : BB->instructionsWithoutDebug()) { 5291 IdxToInstr.push_back(&I); 5292 5293 // Save the end location of each USE. 5294 for (Value *U : I.operands()) { 5295 auto *Instr = dyn_cast<Instruction>(U); 5296 5297 // Ignore non-instruction values such as arguments, constants, etc. 5298 // FIXME: Might need some motivation why these values are ignored. If 5299 // for example an argument is used inside the loop it will increase the 5300 // register pressure (so shouldn't we add it to LoopInvariants). 5301 if (!Instr) 5302 continue; 5303 5304 // If this instruction is outside the loop then record it and continue. 
5305 if (!TheLoop->contains(Instr)) { 5306 LoopInvariants.insert(Instr); 5307 continue; 5308 } 5309 5310 // Overwrite previous end points. 5311 EndPoint[Instr] = IdxToInstr.size(); 5312 Ends.insert(Instr); 5313 } 5314 } 5315 } 5316 5317 // Saves the list of intervals that end with the index in 'key'. 5318 using InstrList = SmallVector<Instruction *, 2>; 5319 SmallDenseMap<unsigned, InstrList, 16> TransposeEnds; 5320 5321 // Transpose the EndPoints to a list of values that end at each index. 5322 for (auto &Interval : EndPoint) 5323 TransposeEnds[Interval.second].push_back(Interval.first); 5324 5325 SmallPtrSet<Instruction *, 8> OpenIntervals; 5326 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5327 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5328 5329 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5330 5331 const auto &TTICapture = TTI; 5332 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5333 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) || 5334 (VF.isScalable() && 5335 !TTICapture.isElementTypeLegalForScalableVector(Ty))) 5336 return 0; 5337 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5338 }; 5339 5340 for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) { 5341 Instruction *I = IdxToInstr[Idx]; 5342 5343 // Remove all of the instructions that end at this location. 5344 InstrList &List = TransposeEnds[Idx]; 5345 for (Instruction *ToRemove : List) 5346 OpenIntervals.erase(ToRemove); 5347 5348 // Ignore instructions that are never used within the loop. 5349 if (!Ends.count(I)) 5350 continue; 5351 5352 // Skip ignored values. 5353 if (ValuesToIgnore.count(I)) 5354 continue; 5355 5356 collectInLoopReductions(); 5357 5358 // For each VF find the maximum usage of registers. 5359 for (unsigned J = 0, E = VFs.size(); J < E; ++J) { 5360 // Count the number of registers used, per register class, given all open 5361 // intervals. 5362 // Note that elements in this SmallMapVector will be default constructed 5363 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 5364 // there is no previous entry for ClassID. 5365 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5366 5367 if (VFs[J].isScalar()) { 5368 for (auto *Inst : OpenIntervals) { 5369 unsigned ClassID = 5370 TTI.getRegisterClassForType(false, Inst->getType()); 5371 // FIXME: The target might use more than one register for the type 5372 // even in the scalar case. 5373 RegUsage[ClassID] += 1; 5374 } 5375 } else { 5376 collectUniformsAndScalars(VFs[J]); 5377 for (auto *Inst : OpenIntervals) { 5378 // Skip ignored values for VF > 1. 5379 if (VecValuesToIgnore.count(Inst)) 5380 continue; 5381 if (isScalarAfterVectorization(Inst, VFs[J])) { 5382 unsigned ClassID = 5383 TTI.getRegisterClassForType(false, Inst->getType()); 5384 // FIXME: The target might use more than one register for the type 5385 // even in the scalar case. 5386 RegUsage[ClassID] += 1; 5387 } else { 5388 unsigned ClassID = 5389 TTI.getRegisterClassForType(true, Inst->getType()); 5390 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]); 5391 } 5392 } 5393 } 5394 5395 for (const auto &Pair : RegUsage) { 5396 auto &Entry = MaxUsages[J][Pair.first]; 5397 Entry = std::max(Entry, Pair.second); 5398 } 5399 } 5400 5401 LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " 5402 << OpenIntervals.size() << '\n'); 5403 5404 // Add the current instruction to the list of open intervals. 
    OpenIntervals.insert(I);
  }

  for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
    // Note that elements in this SmallMapVector will be default constructed
    // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
    // there is no previous entry for ClassID.
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto *Inst : LoopInvariants) {
      // FIXME: The target might use more than one register for the type
      // even in the scalar case.
      bool IsScalar = all_of(Inst->users(), [&](User *U) {
        auto *I = cast<Instruction>(U);
        return TheLoop != LI->getLoopFor(I->getParent()) ||
               isScalarAfterVectorization(I, VFs[Idx]);
      });

      ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
      unsigned ClassID =
          TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
      Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
             << " item\n";
      for (const auto &pair : MaxUsages[Idx]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[Idx];
    RUs[Idx] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
                                                           ElementCount VF) {
  // TODO: The cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // A limited number of masked Store/Scatter emulations was allowed.
  assert((isPredicatedInst(I)) &&
         "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
5483 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5484 5485 PredicatedBBsAfterVectorization[VF].clear(); 5486 5487 // Find all the instructions that are scalar with predication in the loop and 5488 // determine if it would be better to not if-convert the blocks they are in. 5489 // If so, we also record the instructions to scalarize. 5490 for (BasicBlock *BB : TheLoop->blocks()) { 5491 if (!blockNeedsPredicationForAnyReason(BB)) 5492 continue; 5493 for (Instruction &I : *BB) 5494 if (isScalarWithPredication(&I, VF)) { 5495 ScalarCostsTy ScalarCosts; 5496 // Do not apply discount logic for: 5497 // 1. Scalars after vectorization, as there will only be a single copy 5498 // of the instruction. 5499 // 2. Scalable VF, as that would lead to invalid scalarization costs. 5500 // 3. Emulated masked memrefs, if a hacked cost is needed. 5501 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() && 5502 !useEmulatedMaskMemRefHack(&I, VF) && 5503 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) { 5504 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5505 // Check if we decided to scalarize a call. If so, update the widening 5506 // decision of the call to CM_Scalarize with the computed scalar cost. 5507 for (const auto &[I, _] : ScalarCosts) { 5508 auto *CI = dyn_cast<CallInst>(I); 5509 if (!CI || !CallWideningDecisions.contains({CI, VF})) 5510 continue; 5511 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize; 5512 CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI]; 5513 } 5514 } 5515 // Remember that BB will remain after vectorization. 5516 PredicatedBBsAfterVectorization[VF].insert(BB); 5517 for (auto *Pred : predecessors(BB)) { 5518 if (Pred->getSingleSuccessor() == BB) 5519 PredicatedBBsAfterVectorization[VF].insert(Pred); 5520 } 5521 } 5522 } 5523 } 5524 5525 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 5526 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 5527 assert(!isUniformAfterVectorization(PredInst, VF) && 5528 "Instruction marked uniform-after-vectorization will be predicated"); 5529 5530 // Initialize the discount to zero, meaning that the scalar version and the 5531 // vector version cost the same. 5532 InstructionCost Discount = 0; 5533 5534 // Holds instructions to analyze. The instructions we visit are mapped in 5535 // ScalarCosts. Those instructions are the ones that would be scalarized if 5536 // we find that the scalar version costs less. 5537 SmallVector<Instruction *, 8> Worklist; 5538 5539 // Returns true if the given instruction can be scalarized. 5540 auto CanBeScalarized = [&](Instruction *I) -> bool { 5541 // We only attempt to scalarize instructions forming a single-use chain 5542 // from the original predicated block that would otherwise be vectorized. 5543 // Although not strictly necessary, we give up on instructions we know will 5544 // already be scalar to avoid traversing chains that are unlikely to be 5545 // beneficial. 5546 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5547 isScalarAfterVectorization(I, VF)) 5548 return false; 5549 5550 // If the instruction is scalar with predication, it will be analyzed 5551 // separately. We ignore it within the context of PredInst. 5552 if (isScalarWithPredication(I, VF)) 5553 return false; 5554 5555 // If any of the instruction's operands are uniform after vectorization, 5556 // the instruction cannot be scalarized. This prevents, for example, a 5557 // masked load from being scalarized. 
5558 // 5559 // We assume we will only emit a value for lane zero of an instruction 5560 // marked uniform after vectorization, rather than VF identical values. 5561 // Thus, if we scalarize an instruction that uses a uniform, we would 5562 // create uses of values corresponding to the lanes we aren't emitting code 5563 // for. This behavior can be changed by allowing getScalarValue to clone 5564 // the lane zero values for uniforms rather than asserting. 5565 for (Use &U : I->operands()) 5566 if (auto *J = dyn_cast<Instruction>(U.get())) 5567 if (isUniformAfterVectorization(J, VF)) 5568 return false; 5569 5570 // Otherwise, we can scalarize the instruction. 5571 return true; 5572 }; 5573 5574 // Compute the expected cost discount from scalarizing the entire expression 5575 // feeding the predicated instruction. We currently only consider expressions 5576 // that are single-use instruction chains. 5577 Worklist.push_back(PredInst); 5578 while (!Worklist.empty()) { 5579 Instruction *I = Worklist.pop_back_val(); 5580 5581 // If we've already analyzed the instruction, there's nothing to do. 5582 if (ScalarCosts.contains(I)) 5583 continue; 5584 5585 // Compute the cost of the vector instruction. Note that this cost already 5586 // includes the scalarization overhead of the predicated instruction. 5587 InstructionCost VectorCost = getInstructionCost(I, VF); 5588 5589 // Compute the cost of the scalarized instruction. This cost is the cost of 5590 // the instruction as if it wasn't if-converted and instead remained in the 5591 // predicated block. We will scale this cost by block probability after 5592 // computing the scalarization overhead. 5593 InstructionCost ScalarCost = 5594 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1)); 5595 5596 // Compute the scalarization overhead of needed insertelement instructions 5597 // and phi nodes. 5598 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5599 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 5600 ScalarCost += TTI.getScalarizationOverhead( 5601 cast<VectorType>(toVectorTy(I->getType(), VF)), 5602 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 5603 /*Extract*/ false, CostKind); 5604 ScalarCost += 5605 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 5606 } 5607 5608 // Compute the scalarization overhead of needed extractelement 5609 // instructions. For each of the instruction's operands, if the operand can 5610 // be scalarized, add it to the worklist; otherwise, account for the 5611 // overhead. 5612 for (Use &U : I->operands()) 5613 if (auto *J = dyn_cast<Instruction>(U.get())) { 5614 assert(VectorType::isValidElementType(J->getType()) && 5615 "Instruction has non-scalar type"); 5616 if (CanBeScalarized(J)) 5617 Worklist.push_back(J); 5618 else if (needsExtract(J, VF)) { 5619 ScalarCost += TTI.getScalarizationOverhead( 5620 cast<VectorType>(toVectorTy(J->getType(), VF)), 5621 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 5622 /*Extract*/ true, CostKind); 5623 } 5624 } 5625 5626 // Scale the total scalar cost by block probability. 5627 ScalarCost /= getReciprocalPredBlockProb(); 5628 5629 // Compute the discount. A non-negative discount means the vector version 5630 // of the instruction costs more, and scalarizing would be beneficial. 
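    // For example, if VectorCost is 8 and the probability-scaled ScalarCost is
    // 6, the discount grows by 2, i.e. scalarizing this instruction is expected
    // to save two units of cost.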
5631 Discount += VectorCost - ScalarCost; 5632 ScalarCosts[I] = ScalarCost; 5633 } 5634 5635 return Discount; 5636 } 5637 5638 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { 5639 InstructionCost Cost; 5640 5641 // If the vector loop gets executed exactly once with the given VF, ignore the 5642 // costs of comparison and induction instructions, as they'll get simplified 5643 // away. 5644 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF; 5645 auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5646 if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking()) 5647 addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(), 5648 ValuesToIgnoreForVF); 5649 5650 // For each block. 5651 for (BasicBlock *BB : TheLoop->blocks()) { 5652 InstructionCost BlockCost; 5653 5654 // For each instruction in the old loop. 5655 for (Instruction &I : BB->instructionsWithoutDebug()) { 5656 // Skip ignored values. 5657 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) || 5658 (VF.isVector() && VecValuesToIgnore.count(&I))) 5659 continue; 5660 5661 InstructionCost C = getInstructionCost(&I, VF); 5662 5663 // Check if we should override the cost. 5664 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) 5665 C = InstructionCost(ForceTargetInstructionCost); 5666 5667 BlockCost += C; 5668 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " 5669 << VF << " For instruction: " << I << '\n'); 5670 } 5671 5672 // If we are vectorizing a predicated block, it will have been 5673 // if-converted. This means that the block's instructions (aside from 5674 // stores and instructions that may divide by zero) will now be 5675 // unconditionally executed. For the scalar case, we may not always execute 5676 // the predicated block, if it is an if-else block. Thus, scale the block's 5677 // cost by the probability of executing it. blockNeedsPredication from 5678 // Legal is used so as to not include all blocks in tail folded loops. 5679 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 5680 BlockCost /= getReciprocalPredBlockProb(); 5681 5682 Cost += BlockCost; 5683 } 5684 5685 return Cost; 5686 } 5687 5688 /// Gets Address Access SCEV after verifying that the access pattern 5689 /// is loop invariant except the induction variable dependence. 5690 /// 5691 /// This SCEV can be sent to the Target in order to estimate the address 5692 /// calculation cost. 5693 static const SCEV *getAddressAccessSCEV( 5694 Value *Ptr, 5695 LoopVectorizationLegality *Legal, 5696 PredicatedScalarEvolution &PSE, 5697 const Loop *TheLoop) { 5698 5699 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5700 if (!Gep) 5701 return nullptr; 5702 5703 // We are looking for a gep with all loop invariant indices except for one 5704 // which should be an induction variable. 5705 auto *SE = PSE.getSE(); 5706 unsigned NumOperands = Gep->getNumOperands(); 5707 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) { 5708 Value *Opd = Gep->getOperand(Idx); 5709 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5710 !Legal->isInductionVariable(Opd)) 5711 return nullptr; 5712 } 5713 5714 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 
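  // An illustrative qualifying access (names are hypothetical):
  //   %gep = getelementptr [64 x i32], ptr %base, i64 %inv, i64 %ind
  // where %ind is an induction variable and %base/%inv are loop-invariant.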
5715 return PSE.getSCEV(Ptr); 5716 } 5717 5718 InstructionCost 5719 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5720 ElementCount VF) { 5721 assert(VF.isVector() && 5722 "Scalarization cost of instruction implies vectorization."); 5723 if (VF.isScalable()) 5724 return InstructionCost::getInvalid(); 5725 5726 Type *ValTy = getLoadStoreType(I); 5727 auto *SE = PSE.getSE(); 5728 5729 unsigned AS = getLoadStoreAddressSpace(I); 5730 Value *Ptr = getLoadStorePointerOperand(I); 5731 Type *PtrTy = toVectorTy(Ptr->getType(), VF); 5732 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 5733 // that it is being called from this specific place. 5734 5735 // Figure out whether the access is strided and get the stride value 5736 // if it's known in compile time 5737 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5738 5739 // Get the cost of the scalar memory instruction and address computation. 5740 InstructionCost Cost = 5741 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5742 5743 // Don't pass *I here, since it is scalar but will actually be part of a 5744 // vectorized loop where the user of it is a vectorized instruction. 5745 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5746 const Align Alignment = getLoadStoreAlignment(I); 5747 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 5748 ValTy->getScalarType(), 5749 Alignment, AS, CostKind); 5750 5751 // Get the overhead of the extractelement and insertelement instructions 5752 // we might create due to scalarization. 5753 Cost += getScalarizationOverhead(I, VF, CostKind); 5754 5755 // If we have a predicated load/store, it will need extra i1 extracts and 5756 // conditional branches, but may not be executed for each vector lane. Scale 5757 // the cost by the probability of executing the predicated block. 5758 if (isPredicatedInst(I)) { 5759 Cost /= getReciprocalPredBlockProb(); 5760 5761 // Add the cost of an i1 extract and a branch 5762 auto *VecI1Ty = 5763 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 5764 Cost += TTI.getScalarizationOverhead( 5765 VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 5766 /*Insert=*/false, /*Extract=*/true, CostKind); 5767 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 5768 5769 if (useEmulatedMaskMemRefHack(I, VF)) 5770 // Artificially setting to a high enough value to practically disable 5771 // vectorization with such operations. 
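      // The exact constant below is not meaningful on its own; it only needs
      // to dwarf any realistic alternative cost.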
5772 Cost = 3000000; 5773 } 5774 5775 return Cost; 5776 } 5777 5778 InstructionCost 5779 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5780 ElementCount VF) { 5781 Type *ValTy = getLoadStoreType(I); 5782 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5783 Value *Ptr = getLoadStorePointerOperand(I); 5784 unsigned AS = getLoadStoreAddressSpace(I); 5785 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 5786 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5787 5788 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5789 "Stride should be 1 or -1 for consecutive memory access"); 5790 const Align Alignment = getLoadStoreAlignment(I); 5791 InstructionCost Cost = 0; 5792 if (Legal->isMaskRequired(I)) { 5793 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5794 CostKind); 5795 } else { 5796 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 5797 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5798 CostKind, OpInfo, I); 5799 } 5800 5801 bool Reverse = ConsecutiveStride < 0; 5802 if (Reverse) 5803 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {}, 5804 CostKind, 0); 5805 return Cost; 5806 } 5807 5808 InstructionCost 5809 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5810 ElementCount VF) { 5811 assert(Legal->isUniformMemOp(*I, VF)); 5812 5813 Type *ValTy = getLoadStoreType(I); 5814 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5815 const Align Alignment = getLoadStoreAlignment(I); 5816 unsigned AS = getLoadStoreAddressSpace(I); 5817 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5818 if (isa<LoadInst>(I)) { 5819 return TTI.getAddressComputationCost(ValTy) + 5820 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 5821 CostKind) + 5822 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5823 } 5824 StoreInst *SI = cast<StoreInst>(I); 5825 5826 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 5827 return TTI.getAddressComputationCost(ValTy) + 5828 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 5829 CostKind) + 5830 (IsLoopInvariantStoreValue 5831 ? 
0
                 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                          CostKind, VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  const auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  Instruction *InsertPos = Group->getInsertPos();
  Type *ValTy = getLoadStoreType(InsertPos);
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(InsertPos);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
      Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
      UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
                               CostKind, 0);
  }
  return Cost;
}

std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty,
    TTI::TargetCostKind CostKind) const {
  using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return std::nullopt;
  auto *VectorTy = cast<VectorType>(Ty);

  // We look for one of the following patterns and find the minimal acceptable
  // cost for it:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree, finding the root reduction
  // instruction in InLoopReductionImmediateChains. From there we find the
  // pattern of mul/ext and test the cost of the entire pattern vs the cost of
  // the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return std::nullopt, indicating that the original cost model
  // should be used.
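  // As an illustrative example, an i8 dot-product style chain such as
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %phi, %mul
  // may be costed below as a single multiply-accumulate reduction via
  // TTI::getMulAccReductionCost rather than as separate ext/mul/add costs.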
5913 Instruction *RetI = I; 5914 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 5915 if (!RetI->hasOneUser()) 5916 return std::nullopt; 5917 RetI = RetI->user_back(); 5918 } 5919 5920 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 5921 RetI->user_back()->getOpcode() == Instruction::Add) { 5922 RetI = RetI->user_back(); 5923 } 5924 5925 // Test if the found instruction is a reduction, and if not return an invalid 5926 // cost specifying the parent to use the original cost modelling. 5927 if (!InLoopReductionImmediateChains.count(RetI)) 5928 return std::nullopt; 5929 5930 // Find the reduction this chain is a part of and calculate the basic cost of 5931 // the reduction on its own. 5932 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); 5933 Instruction *ReductionPhi = LastChain; 5934 while (!isa<PHINode>(ReductionPhi)) 5935 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); 5936 5937 const RecurrenceDescriptor &RdxDesc = 5938 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 5939 5940 InstructionCost BaseCost; 5941 RecurKind RK = RdxDesc.getRecurrenceKind(); 5942 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 5943 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK); 5944 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy, 5945 RdxDesc.getFastMathFlags(), CostKind); 5946 } else { 5947 BaseCost = TTI.getArithmeticReductionCost( 5948 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 5949 } 5950 5951 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 5952 // normal fmul instruction to the cost of the fadd reduction. 5953 if (RK == RecurKind::FMulAdd) 5954 BaseCost += 5955 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 5956 5957 // If we're using ordered reductions then we can just return the base cost 5958 // here, since getArithmeticReductionCost calculates the full ordered 5959 // reduction cost when FP reassociation is not allowed. 5960 if (useOrderedReductions(RdxDesc)) 5961 return BaseCost; 5962 5963 // Get the operand that was not the reduction chain and match it to one of the 5964 // patterns, returning the better cost if it is found. 5965 Instruction *RedOp = RetI->getOperand(1) == LastChain 5966 ? dyn_cast<Instruction>(RetI->getOperand(0)) 5967 : dyn_cast<Instruction>(RetI->getOperand(1)); 5968 5969 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 5970 5971 Instruction *Op0, *Op1; 5972 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 5973 match(RedOp, 5974 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 5975 match(Op0, m_ZExtOrSExt(m_Value())) && 5976 Op0->getOpcode() == Op1->getOpcode() && 5977 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 5978 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 5979 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 5980 5981 // Matched reduce.add(ext(mul(ext(A), ext(B))) 5982 // Note that the extend opcodes need to all match, or if A==B they will have 5983 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 5984 // which is equally fine. 
5985 bool IsUnsigned = isa<ZExtInst>(Op0); 5986 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 5987 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 5988 5989 InstructionCost ExtCost = 5990 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 5991 TTI::CastContextHint::None, CostKind, Op0); 5992 InstructionCost MulCost = 5993 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 5994 InstructionCost Ext2Cost = 5995 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 5996 TTI::CastContextHint::None, CostKind, RedOp); 5997 5998 InstructionCost RedCost = TTI.getMulAccReductionCost( 5999 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6000 6001 if (RedCost.isValid() && 6002 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6003 return I == RetI ? RedCost : 0; 6004 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6005 !TheLoop->isLoopInvariant(RedOp)) { 6006 // Matched reduce(ext(A)) 6007 bool IsUnsigned = isa<ZExtInst>(RedOp); 6008 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6009 InstructionCost RedCost = TTI.getExtendedReductionCost( 6010 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6011 RdxDesc.getFastMathFlags(), CostKind); 6012 6013 InstructionCost ExtCost = 6014 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6015 TTI::CastContextHint::None, CostKind, RedOp); 6016 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6017 return I == RetI ? RedCost : 0; 6018 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6019 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6020 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6021 Op0->getOpcode() == Op1->getOpcode() && 6022 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6023 bool IsUnsigned = isa<ZExtInst>(Op0); 6024 Type *Op0Ty = Op0->getOperand(0)->getType(); 6025 Type *Op1Ty = Op1->getOperand(0)->getType(); 6026 Type *LargestOpTy = 6027 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6028 : Op0Ty; 6029 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6030 6031 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 6032 // different sizes. We take the largest type as the ext to reduce, and add 6033 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6034 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6035 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6036 TTI::CastContextHint::None, CostKind, Op0); 6037 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6038 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6039 TTI::CastContextHint::None, CostKind, Op1); 6040 InstructionCost MulCost = 6041 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6042 6043 InstructionCost RedCost = TTI.getMulAccReductionCost( 6044 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6045 InstructionCost ExtraExtCost = 0; 6046 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6047 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6048 ExtraExtCost = TTI.getCastInstrCost( 6049 ExtraExtOp->getOpcode(), ExtType, 6050 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6051 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6052 } 6053 6054 if (RedCost.isValid() && 6055 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6056 return I == RetI ? 
RedCost : 0; 6057 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6058 // Matched reduce.add(mul()) 6059 InstructionCost MulCost = 6060 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6061 6062 InstructionCost RedCost = TTI.getMulAccReductionCost( 6063 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 6064 6065 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6066 return I == RetI ? RedCost : 0; 6067 } 6068 } 6069 6070 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 6071 } 6072 6073 InstructionCost 6074 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6075 ElementCount VF) { 6076 // Calculate scalar cost only. Vectorization cost should be ready at this 6077 // moment. 6078 if (VF.isScalar()) { 6079 Type *ValTy = getLoadStoreType(I); 6080 const Align Alignment = getLoadStoreAlignment(I); 6081 unsigned AS = getLoadStoreAddressSpace(I); 6082 6083 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6084 return TTI.getAddressComputationCost(ValTy) + 6085 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6086 TTI::TCK_RecipThroughput, OpInfo, I); 6087 } 6088 return getWideningCost(I, VF); 6089 } 6090 6091 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 6092 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 6093 6094 // There is no mechanism yet to create a scalable scalarization loop, 6095 // so this is currently Invalid. 6096 if (VF.isScalable()) 6097 return InstructionCost::getInvalid(); 6098 6099 if (VF.isScalar()) 6100 return 0; 6101 6102 InstructionCost Cost = 0; 6103 Type *RetTy = toVectorTy(I->getType(), VF); 6104 if (!RetTy->isVoidTy() && 6105 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6106 Cost += TTI.getScalarizationOverhead( 6107 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), 6108 /*Insert*/ true, 6109 /*Extract*/ false, CostKind); 6110 6111 // Some targets keep addresses scalar. 6112 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6113 return Cost; 6114 6115 // Some targets support efficient element stores. 6116 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6117 return Cost; 6118 6119 // Collect operands to consider. 6120 CallInst *CI = dyn_cast<CallInst>(I); 6121 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6122 6123 // Skip operands that do not require extraction/scalarization and do not incur 6124 // any overhead. 6125 SmallVector<Type *> Tys; 6126 for (auto *V : filterExtractingOperands(Ops, VF)) 6127 Tys.push_back(maybeVectorizeType(V->getType(), VF)); 6128 return Cost + TTI.getOperandsScalarizationOverhead( 6129 filterExtractingOperands(Ops, VF), Tys, CostKind); 6130 } 6131 6132 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6133 if (VF.isScalar()) 6134 return; 6135 NumPredStores = 0; 6136 for (BasicBlock *BB : TheLoop->blocks()) { 6137 // For each instruction in the old loop. 6138 for (Instruction &I : *BB) { 6139 Value *Ptr = getLoadStorePointerOperand(&I); 6140 if (!Ptr) 6141 continue; 6142 6143 // TODO: We should generate better code and update the cost model for 6144 // predicated uniform stores. Today they are treated as any other 6145 // predicated store (see added test cases in 6146 // invariant-store-vectorization.ll). 
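      // NumPredStores feeds useEmulatedMaskMemRefHack(): once it exceeds
      // NumberOfStoresToPredicate, emulated masked stores are given the
      // artificially high scalarization cost above.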
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto IsLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(I))
            return true;

          // A uniform store isn't necessarily uniform-by-parts
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(I);
          return TheLoop->isLoopInvariant(SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
            isLegalGatherOrScatter(&I, VF) ? getGatherScatterCost(&I, VF)
                                           : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost =
            IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
                                 : InstructionCost::getInvalid();

        // Choose the better solution for the current VF. Note that Invalid
        // costs compare as maximally large. If both are invalid, we get a
        // scalable invalid cost, which signals a failure and a vectorization
        // abort.
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
        else
          setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        const auto *Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I, VF)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose the better solution for the current VF; record the decision
      // and use it during vectorization.
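      // Ties between interleaving and gather/scatter go to interleaving, and
      // scalarization is only rejected when one of the vector strategies is
      // strictly cheaper (see the comparisons below).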
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group also receives the cost,
      // but the cost will actually be assigned to a single instruction.
      if (const auto *Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of an address and any other address computation
  // remain scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded value is involved in an address computation, it is instead
      // changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (const auto *Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
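      // Forced scalars are later costed as VF copies of the scalar instruction
      // (see getInstructionCost), with no insert/extract overhead.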
6315 ForcedScalars[VF].insert(I); 6316 } 6317 } 6318 6319 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { 6320 assert(!VF.isScalar() && 6321 "Trying to set a vectorization decision for a scalar VF"); 6322 6323 auto ForcedScalar = ForcedScalars.find(VF); 6324 for (BasicBlock *BB : TheLoop->blocks()) { 6325 // For each instruction in the old loop. 6326 for (Instruction &I : *BB) { 6327 CallInst *CI = dyn_cast<CallInst>(&I); 6328 6329 if (!CI) 6330 continue; 6331 6332 InstructionCost ScalarCost = InstructionCost::getInvalid(); 6333 InstructionCost VectorCost = InstructionCost::getInvalid(); 6334 InstructionCost IntrinsicCost = InstructionCost::getInvalid(); 6335 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6336 Function *ScalarFunc = CI->getCalledFunction(); 6337 Type *ScalarRetTy = CI->getType(); 6338 SmallVector<Type *, 4> Tys, ScalarTys; 6339 for (auto &ArgOp : CI->args()) 6340 ScalarTys.push_back(ArgOp->getType()); 6341 6342 // Estimate cost of scalarized vector call. The source operands are 6343 // assumed to be vectors, so we need to extract individual elements from 6344 // there, execute VF scalar calls, and then gather the result into the 6345 // vector return value. 6346 InstructionCost ScalarCallCost = 6347 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); 6348 6349 // Compute costs of unpacking argument values for the scalar calls and 6350 // packing the return values to a vector. 6351 InstructionCost ScalarizationCost = 6352 getScalarizationOverhead(CI, VF, CostKind); 6353 6354 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 6355 // Honor ForcedScalars and UniformAfterVectorization decisions. 6356 // TODO: For calls, it might still be more profitable to widen. Use 6357 // VPlan-based cost model to compare different options. 6358 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() && 6359 ForcedScalar->second.contains(CI)) || 6360 isUniformAfterVectorization(CI, VF))) { 6361 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr, 6362 Intrinsic::not_intrinsic, std::nullopt, 6363 ScalarCost); 6364 continue; 6365 } 6366 6367 bool MaskRequired = Legal->isMaskRequired(CI); 6368 // Compute corresponding vector type for return value and arguments. 6369 Type *RetTy = toVectorTy(ScalarRetTy, VF); 6370 for (Type *ScalarTy : ScalarTys) 6371 Tys.push_back(toVectorTy(ScalarTy, VF)); 6372 6373 // An in-loop reduction using an fmuladd intrinsic is a special case; 6374 // we don't want the normal cost for that intrinsic. 6375 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 6376 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) { 6377 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, 6378 getVectorIntrinsicIDForCall(CI, TLI), 6379 std::nullopt, *RedCost); 6380 continue; 6381 } 6382 6383 // Find the cost of vectorizing the call, if we can find a suitable 6384 // vector variant of the function. 6385 bool UsesMask = false; 6386 VFInfo FuncInfo; 6387 Function *VecFunc = nullptr; 6388 // Search through any available variants for one we can use at this VF. 6389 for (VFInfo &Info : VFDatabase::getMappings(*CI)) { 6390 // Must match requested VF. 
6391 if (Info.Shape.VF != VF) 6392 continue; 6393 6394 // Must take a mask argument if one is required 6395 if (MaskRequired && !Info.isMasked()) 6396 continue; 6397 6398 // Check that all parameter kinds are supported 6399 bool ParamsOk = true; 6400 for (VFParameter Param : Info.Shape.Parameters) { 6401 switch (Param.ParamKind) { 6402 case VFParamKind::Vector: 6403 break; 6404 case VFParamKind::OMP_Uniform: { 6405 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6406 // Make sure the scalar parameter in the loop is invariant. 6407 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), 6408 TheLoop)) 6409 ParamsOk = false; 6410 break; 6411 } 6412 case VFParamKind::OMP_Linear: { 6413 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6414 // Find the stride for the scalar parameter in this loop and see if 6415 // it matches the stride for the variant. 6416 // TODO: do we need to figure out the cost of an extract to get the 6417 // first lane? Or do we hope that it will be folded away? 6418 ScalarEvolution *SE = PSE.getSE(); 6419 const auto *SAR = 6420 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); 6421 6422 if (!SAR || SAR->getLoop() != TheLoop) { 6423 ParamsOk = false; 6424 break; 6425 } 6426 6427 const SCEVConstant *Step = 6428 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); 6429 6430 if (!Step || 6431 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) 6432 ParamsOk = false; 6433 6434 break; 6435 } 6436 case VFParamKind::GlobalPredicate: 6437 UsesMask = true; 6438 break; 6439 default: 6440 ParamsOk = false; 6441 break; 6442 } 6443 } 6444 6445 if (!ParamsOk) 6446 continue; 6447 6448 // Found a suitable candidate, stop here. 6449 VecFunc = CI->getModule()->getFunction(Info.VectorName); 6450 FuncInfo = Info; 6451 break; 6452 } 6453 6454 // Add in the cost of synthesizing a mask if one wasn't required. 6455 InstructionCost MaskCost = 0; 6456 if (VecFunc && UsesMask && !MaskRequired) 6457 MaskCost = TTI.getShuffleCost( 6458 TargetTransformInfo::SK_Broadcast, 6459 VectorType::get(IntegerType::getInt1Ty( 6460 VecFunc->getFunctionType()->getContext()), 6461 VF)); 6462 6463 if (TLI && VecFunc && !CI->isNoBuiltin()) 6464 VectorCost = 6465 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 6466 6467 // Find the cost of an intrinsic; some targets may have instructions that 6468 // perform the operation without needing an actual call. 6469 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); 6470 if (IID != Intrinsic::not_intrinsic) 6471 IntrinsicCost = getVectorIntrinsicCost(CI, VF); 6472 6473 InstructionCost Cost = ScalarCost; 6474 InstWidening Decision = CM_Scalarize; 6475 6476 if (VectorCost <= Cost) { 6477 Cost = VectorCost; 6478 Decision = CM_VectorCall; 6479 } 6480 6481 if (IntrinsicCost <= Cost) { 6482 Cost = IntrinsicCost; 6483 Decision = CM_IntrinsicCall; 6484 } 6485 6486 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, 6487 FuncInfo.getParamIndexForOptionalMask(), Cost); 6488 } 6489 } 6490 } 6491 6492 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) { 6493 if (!Legal->isInvariant(Op)) 6494 return false; 6495 // Consider Op invariant, if it or its operands aren't predicated 6496 // instruction in the loop. In that case, it is not trivially hoistable. 
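  // The result is used when costing arithmetic operands: an operand that
  // qualifies here is treated as TTI::OK_UniformValue rather than
  // TTI::OK_AnyValue.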
6497 auto *OpI = dyn_cast<Instruction>(Op); 6498 return !OpI || !TheLoop->contains(OpI) || 6499 (!isPredicatedInst(OpI) && 6500 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) && 6501 all_of(OpI->operands(), 6502 [this](Value *Op) { return shouldConsiderInvariant(Op); })); 6503 } 6504 6505 InstructionCost 6506 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6507 ElementCount VF) { 6508 // If we know that this instruction will remain uniform, check the cost of 6509 // the scalar version. 6510 if (isUniformAfterVectorization(I, VF)) 6511 VF = ElementCount::getFixed(1); 6512 6513 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6514 return InstsToScalarize[VF][I]; 6515 6516 // Forced scalars do not have any scalarization overhead. 6517 auto ForcedScalar = ForcedScalars.find(VF); 6518 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6519 auto InstSet = ForcedScalar->second; 6520 if (InstSet.count(I)) 6521 return getInstructionCost(I, ElementCount::getFixed(1)) * 6522 VF.getKnownMinValue(); 6523 } 6524 6525 Type *RetTy = I->getType(); 6526 if (canTruncateToMinimalBitwidth(I, VF)) 6527 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6528 auto *SE = PSE.getSE(); 6529 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6530 6531 auto HasSingleCopyAfterVectorization = [this](Instruction *I, 6532 ElementCount VF) -> bool { 6533 if (VF.isScalar()) 6534 return true; 6535 6536 auto Scalarized = InstsToScalarize.find(VF); 6537 assert(Scalarized != InstsToScalarize.end() && 6538 "VF not yet analyzed for scalarization profitability"); 6539 return !Scalarized->second.count(I) && 6540 llvm::all_of(I->users(), [&](User *U) { 6541 auto *UI = cast<Instruction>(U); 6542 return !Scalarized->second.count(UI); 6543 }); 6544 }; 6545 (void)HasSingleCopyAfterVectorization; 6546 6547 Type *VectorTy; 6548 if (isScalarAfterVectorization(I, VF)) { 6549 // With the exception of GEPs and PHIs, after scalarization there should 6550 // only be one copy of the instruction generated in the loop. This is 6551 // because the VF is either 1, or any instructions that need scalarizing 6552 // have already been dealt with by the time we get here. As a result, 6553 // it means we don't have to multiply the instruction cost by VF. 6554 assert(I->getOpcode() == Instruction::GetElementPtr || 6555 I->getOpcode() == Instruction::PHI || 6556 (I->getOpcode() == Instruction::BitCast && 6557 I->getType()->isPointerTy()) || 6558 HasSingleCopyAfterVectorization(I, VF)); 6559 VectorTy = RetTy; 6560 } else 6561 VectorTy = toVectorTy(RetTy, VF); 6562 6563 if (VF.isVector() && VectorTy->isVectorTy() && 6564 !TTI.getNumberOfParts(VectorTy)) 6565 return InstructionCost::getInvalid(); 6566 6567 // TODO: We need to estimate the cost of intrinsic calls. 6568 switch (I->getOpcode()) { 6569 case Instruction::GetElementPtr: 6570 // We mark this instruction as zero-cost because the cost of GEPs in 6571 // vectorized code depends on whether the corresponding memory instruction 6572 // is scalarized or not. Therefore, we handle GEPs with the memory 6573 // instruction cost. 6574 return 0; 6575 case Instruction::Br: { 6576 // In cases of scalarized and predicated instructions, there will be VF 6577 // predicated blocks in the vectorized loop. Each branch around these 6578 // blocks requires also an extract of its vector compare i1 element. 
6579 // Note that the conditional branch from the loop latch will be replaced by 6580 // a single branch controlling the loop, so there is no extra overhead from 6581 // scalarization. 6582 bool ScalarPredicatedBB = false; 6583 BranchInst *BI = cast<BranchInst>(I); 6584 if (VF.isVector() && BI->isConditional() && 6585 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 6586 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) && 6587 BI->getParent() != TheLoop->getLoopLatch()) 6588 ScalarPredicatedBB = true; 6589 6590 if (ScalarPredicatedBB) { 6591 // Not possible to scalarize scalable vector with predicated instructions. 6592 if (VF.isScalable()) 6593 return InstructionCost::getInvalid(); 6594 // Return cost for branches around scalarized and predicated blocks. 6595 auto *VecI1Ty = 6596 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6597 return ( 6598 TTI.getScalarizationOverhead( 6599 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()), 6600 /*Insert*/ false, /*Extract*/ true, CostKind) + 6601 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6602 } 6603 6604 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6605 // The back-edge branch will remain, as will all scalar branches. 6606 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6607 6608 // This branch will be eliminated by if-conversion. 6609 return 0; 6610 // Note: We currently assume zero cost for an unconditional branch inside 6611 // a predicated block since it will become a fall-through, although we 6612 // may decide in the future to call TTI for all branches. 6613 } 6614 case Instruction::Switch: { 6615 if (VF.isScalar()) 6616 return TTI.getCFInstrCost(Instruction::Switch, CostKind); 6617 auto *Switch = cast<SwitchInst>(I); 6618 return Switch->getNumCases() * 6619 TTI.getCmpSelInstrCost( 6620 Instruction::ICmp, 6621 toVectorTy(Switch->getCondition()->getType(), VF), 6622 toVectorTy(Type::getInt1Ty(I->getContext()), VF), 6623 CmpInst::ICMP_EQ, CostKind); 6624 } 6625 case Instruction::PHI: { 6626 auto *Phi = cast<PHINode>(I); 6627 6628 // First-order recurrences are replaced by vector shuffles inside the loop. 6629 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 6630 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the 6631 // penultimate value of the recurrence. 6632 // TODO: Consider vscale_range info. 6633 if (VF.isScalable() && VF.getKnownMinValue() == 1) 6634 return InstructionCost::getInvalid(); 6635 SmallVector<int> Mask(VF.getKnownMinValue()); 6636 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 6637 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 6638 cast<VectorType>(VectorTy), Mask, CostKind, 6639 VF.getKnownMinValue() - 1); 6640 } 6641 6642 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6643 // converted into select instructions. We require N - 1 selects per phi 6644 // node, where N is the number of incoming values. 6645 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) { 6646 Type *ResultTy = Phi->getType(); 6647 6648 // All instructions in an Any-of reduction chain are narrowed to bool. 6649 // Check if that is the case for this phi node. 
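      // We do this by walking to the phi's single header-phi user (if any) and
      // checking whether its recurrence descriptor is an any-of reduction; if
      // so, the select cost below is computed on i1 rather than the phi's
      // original type.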
6650 auto *HeaderUser = cast_if_present<PHINode>( 6651 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * { 6652 auto *Phi = dyn_cast<PHINode>(U); 6653 if (Phi && Phi->getParent() == TheLoop->getHeader()) 6654 return Phi; 6655 return nullptr; 6656 })); 6657 if (HeaderUser) { 6658 auto &ReductionVars = Legal->getReductionVars(); 6659 auto Iter = ReductionVars.find(HeaderUser); 6660 if (Iter != ReductionVars.end() && 6661 RecurrenceDescriptor::isAnyOfRecurrenceKind( 6662 Iter->second.getRecurrenceKind())) 6663 ResultTy = Type::getInt1Ty(Phi->getContext()); 6664 } 6665 return (Phi->getNumIncomingValues() - 1) * 6666 TTI.getCmpSelInstrCost( 6667 Instruction::Select, toVectorTy(ResultTy, VF), 6668 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6669 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6670 } 6671 6672 // When tail folding with EVL, if the phi is part of an out of loop 6673 // reduction then it will be transformed into a wide vp_merge. 6674 if (VF.isVector() && foldTailWithEVL() && 6675 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) { 6676 IntrinsicCostAttributes ICA( 6677 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF), 6678 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)}); 6679 return TTI.getIntrinsicInstrCost(ICA, CostKind); 6680 } 6681 6682 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6683 } 6684 case Instruction::UDiv: 6685 case Instruction::SDiv: 6686 case Instruction::URem: 6687 case Instruction::SRem: 6688 if (VF.isVector() && isPredicatedInst(I)) { 6689 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 6690 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 6691 ScalarCost : SafeDivisorCost; 6692 } 6693 // We've proven all lanes safe to speculate, fall through. 6694 [[fallthrough]]; 6695 case Instruction::Add: 6696 case Instruction::Sub: { 6697 auto Info = Legal->getHistogramInfo(I); 6698 if (Info && VF.isVector()) { 6699 const HistogramInfo *HGram = Info.value(); 6700 // Assume that a non-constant update value (or a constant != 1) requires 6701 // a multiply, and add that into the cost. 6702 InstructionCost MulCost = TTI::TCC_Free; 6703 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1)); 6704 if (!RHS || RHS->getZExtValue() != 1) 6705 MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); 6706 6707 // Find the cost of the histogram operation itself. 6708 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF); 6709 Type *ScalarTy = I->getType(); 6710 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF); 6711 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, 6712 Type::getVoidTy(I->getContext()), 6713 {PtrTy, ScalarTy, MaskTy}); 6714 6715 // Add the costs together with the add/sub operation. 6716 return TTI.getIntrinsicInstrCost( 6717 ICA, TargetTransformInfo::TCK_RecipThroughput) + 6718 MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); 6719 } 6720 [[fallthrough]]; 6721 } 6722 case Instruction::FAdd: 6723 case Instruction::FSub: 6724 case Instruction::Mul: 6725 case Instruction::FMul: 6726 case Instruction::FDiv: 6727 case Instruction::FRem: 6728 case Instruction::Shl: 6729 case Instruction::LShr: 6730 case Instruction::AShr: 6731 case Instruction::And: 6732 case Instruction::Or: 6733 case Instruction::Xor: { 6734 // If we're speculating on the stride being 1, the multiplication may 6735 // fold away. We can generalize this for all operations using the notion 6736 // of neutral elements. 
(TODO) 6737 if (I->getOpcode() == Instruction::Mul && 6738 (PSE.getSCEV(I->getOperand(0))->isOne() || 6739 PSE.getSCEV(I->getOperand(1))->isOne())) 6740 return 0; 6741 6742 // Detect reduction patterns 6743 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 6744 return *RedCost; 6745 6746 // Certain instructions can be cheaper to vectorize if they have a constant 6747 // second vector operand. One example of this are shifts on x86. 6748 Value *Op2 = I->getOperand(1); 6749 if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) && 6750 isa<SCEVConstant>(PSE.getSCEV(Op2))) { 6751 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue(); 6752 } 6753 auto Op2Info = TTI.getOperandInfo(Op2); 6754 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 6755 shouldConsiderInvariant(Op2)) 6756 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 6757 6758 SmallVector<const Value *, 4> Operands(I->operand_values()); 6759 return TTI.getArithmeticInstrCost( 6760 I->getOpcode(), VectorTy, CostKind, 6761 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6762 Op2Info, Operands, I, TLI); 6763 } 6764 case Instruction::FNeg: { 6765 return TTI.getArithmeticInstrCost( 6766 I->getOpcode(), VectorTy, CostKind, 6767 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6768 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6769 I->getOperand(0), I); 6770 } 6771 case Instruction::Select: { 6772 SelectInst *SI = cast<SelectInst>(I); 6773 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6774 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6775 6776 const Value *Op0, *Op1; 6777 using namespace llvm::PatternMatch; 6778 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 6779 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 6780 // select x, y, false --> x & y 6781 // select x, true, y --> x | y 6782 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 6783 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 6784 assert(Op0->getType()->getScalarSizeInBits() == 1 && 6785 Op1->getType()->getScalarSizeInBits() == 1); 6786 6787 SmallVector<const Value *, 2> Operands{Op0, Op1}; 6788 return TTI.getArithmeticInstrCost( 6789 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 6790 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 6791 } 6792 6793 Type *CondTy = SI->getCondition()->getType(); 6794 if (!ScalarCond) 6795 CondTy = VectorType::get(CondTy, VF); 6796 6797 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 6798 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 6799 Pred = Cmp->getPredicate(); 6800 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 6801 CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 6802 {TTI::OK_AnyValue, TTI::OP_None}, I); 6803 } 6804 case Instruction::ICmp: 6805 case Instruction::FCmp: { 6806 Type *ValTy = I->getOperand(0)->getType(); 6807 6808 if (canTruncateToMinimalBitwidth(I, VF)) { 6809 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6810 (void)Op0AsInstruction; 6811 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) || 6812 MinBWs[I] == MinBWs[Op0AsInstruction]) && 6813 "if both the operand and the compare are marked for " 6814 "truncation, they must have the same bitwidth"); 6815 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]); 6816 } 6817 6818 VectorTy = toVectorTy(ValTy, VF); 6819 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 6820 cast<CmpInst>(I)->getPredicate(), CostKind, 6821 {TTI::OK_AnyValue, TTI::OP_None}, 6822 {TTI::OK_AnyValue, TTI::OP_None}, I); 6823 } 6824 case Instruction::Store: 6825 case Instruction::Load: { 6826 ElementCount Width = VF; 6827 if (Width.isVector()) { 6828 InstWidening Decision = getWideningDecision(I, Width); 6829 assert(Decision != CM_Unknown && 6830 "CM decision should be taken at this point"); 6831 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 6832 return InstructionCost::getInvalid(); 6833 if (Decision == CM_Scalarize) 6834 Width = ElementCount::getFixed(1); 6835 } 6836 VectorTy = toVectorTy(getLoadStoreType(I), Width); 6837 return getMemoryInstructionCost(I, VF); 6838 } 6839 case Instruction::BitCast: 6840 if (I->getType()->isPointerTy()) 6841 return 0; 6842 [[fallthrough]]; 6843 case Instruction::ZExt: 6844 case Instruction::SExt: 6845 case Instruction::FPToUI: 6846 case Instruction::FPToSI: 6847 case Instruction::FPExt: 6848 case Instruction::PtrToInt: 6849 case Instruction::IntToPtr: 6850 case Instruction::SIToFP: 6851 case Instruction::UIToFP: 6852 case Instruction::Trunc: 6853 case Instruction::FPTrunc: { 6854 // Computes the CastContextHint from a Load/Store instruction. 6855 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6856 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6857 "Expected a load or a store!"); 6858 6859 if (VF.isScalar() || !TheLoop->contains(I)) 6860 return TTI::CastContextHint::Normal; 6861 6862 switch (getWideningDecision(I, VF)) { 6863 case LoopVectorizationCostModel::CM_GatherScatter: 6864 return TTI::CastContextHint::GatherScatter; 6865 case LoopVectorizationCostModel::CM_Interleave: 6866 return TTI::CastContextHint::Interleave; 6867 case LoopVectorizationCostModel::CM_Scalarize: 6868 case LoopVectorizationCostModel::CM_Widen: 6869 return Legal->isMaskRequired(I) ? 
TTI::CastContextHint::Masked 6870 : TTI::CastContextHint::Normal; 6871 case LoopVectorizationCostModel::CM_Widen_Reverse: 6872 return TTI::CastContextHint::Reversed; 6873 case LoopVectorizationCostModel::CM_Unknown: 6874 llvm_unreachable("Instr did not go through cost modelling?"); 6875 case LoopVectorizationCostModel::CM_VectorCall: 6876 case LoopVectorizationCostModel::CM_IntrinsicCall: 6877 llvm_unreachable_internal("Instr has invalid widening decision"); 6878 } 6879 6880 llvm_unreachable("Unhandled case!"); 6881 }; 6882 6883 unsigned Opcode = I->getOpcode(); 6884 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6885 // For Trunc, the context is the only user, which must be a StoreInst. 6886 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6887 if (I->hasOneUse()) 6888 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6889 CCH = ComputeCCH(Store); 6890 } 6891 // For Z/Sext, the context is the operand, which must be a LoadInst. 6892 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6893 Opcode == Instruction::FPExt) { 6894 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6895 CCH = ComputeCCH(Load); 6896 } 6897 6898 // We optimize the truncation of induction variables having constant 6899 // integer steps. The cost of these truncations is the same as the scalar 6900 // operation. 6901 if (isOptimizableIVTruncate(I, VF)) { 6902 auto *Trunc = cast<TruncInst>(I); 6903 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6904 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6905 } 6906 6907 // Detect reduction patterns 6908 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 6909 return *RedCost; 6910 6911 Type *SrcScalarTy = I->getOperand(0)->getType(); 6912 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6913 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6914 SrcScalarTy = 6915 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]); 6916 Type *SrcVecTy = 6917 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6918 6919 if (canTruncateToMinimalBitwidth(I, VF)) { 6920 // If the result type is <= the source type, there will be no extend 6921 // after truncating the users to the minimal required bitwidth. 6922 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() && 6923 (I->getOpcode() == Instruction::ZExt || 6924 I->getOpcode() == Instruction::SExt)) 6925 return 0; 6926 } 6927 6928 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6929 } 6930 case Instruction::Call: 6931 return getVectorCallCost(cast<CallInst>(I), VF); 6932 case Instruction::ExtractValue: 6933 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 6934 case Instruction::Alloca: 6935 // We cannot easily widen alloca to a scalable alloca, as 6936 // the result would need to be a vector of pointers. 6937 if (VF.isScalable()) 6938 return InstructionCost::getInvalid(); 6939 [[fallthrough]]; 6940 default: 6941 // This opcode is unknown. Assume that it is the same as 'mul'. 6942 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6943 } // end of switch. 6944 } 6945 6946 void LoopVectorizationCostModel::collectValuesToIgnore() { 6947 // Ignore ephemeral values. 
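  // (Ephemeral values are values only used, directly or transitively, by
  // llvm.assume calls; they do not result in any generated code.)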
6948 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6949 6950 SmallVector<Value *, 4> DeadInterleavePointerOps; 6951 SmallVector<Value *, 4> DeadOps; 6952 6953 // If a scalar epilogue is required, users outside the loop won't use 6954 // live-outs from the vector loop but from the scalar epilogue. Ignore them if 6955 // that is the case. 6956 bool RequiresScalarEpilogue = requiresScalarEpilogue(true); 6957 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) { 6958 return RequiresScalarEpilogue && 6959 !TheLoop->contains(cast<Instruction>(U)->getParent()); 6960 }; 6961 6962 LoopBlocksDFS DFS(TheLoop); 6963 DFS.perform(LI); 6964 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps; 6965 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO()))) 6966 for (Instruction &I : reverse(*BB)) { 6967 // Find all stores to invariant variables. Since they are going to sink 6968 // outside the loop we do not need calculate cost for them. 6969 StoreInst *SI; 6970 if ((SI = dyn_cast<StoreInst>(&I)) && 6971 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { 6972 ValuesToIgnore.insert(&I); 6973 DeadInvariantStoreOps[SI->getPointerOperand()].push_back( 6974 SI->getValueOperand()); 6975 } 6976 6977 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I)) 6978 continue; 6979 6980 // Add instructions that would be trivially dead and are only used by 6981 // values already ignored to DeadOps to seed worklist. 6982 if (wouldInstructionBeTriviallyDead(&I, TLI) && 6983 all_of(I.users(), [this, IsLiveOutDead](User *U) { 6984 return VecValuesToIgnore.contains(U) || 6985 ValuesToIgnore.contains(U) || IsLiveOutDead(U); 6986 })) 6987 DeadOps.push_back(&I); 6988 6989 // For interleave groups, we only create a pointer for the start of the 6990 // interleave group. Queue up addresses of group members except the insert 6991 // position for further processing. 6992 if (isAccessInterleaved(&I)) { 6993 auto *Group = getInterleavedAccessGroup(&I); 6994 if (Group->getInsertPos() == &I) 6995 continue; 6996 Value *PointerOp = getLoadStorePointerOperand(&I); 6997 DeadInterleavePointerOps.push_back(PointerOp); 6998 } 6999 7000 // Queue branches for analysis. They are dead, if their successors only 7001 // contain dead instructions. 7002 if (auto *Br = dyn_cast<BranchInst>(&I)) { 7003 if (Br->isConditional()) 7004 DeadOps.push_back(&I); 7005 } 7006 } 7007 7008 // Mark ops feeding interleave group members as free, if they are only used 7009 // by other dead computations. 7010 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { 7011 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]); 7012 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) { 7013 Instruction *UI = cast<Instruction>(U); 7014 return !VecValuesToIgnore.contains(U) && 7015 (!isAccessInterleaved(UI) || 7016 getInterleavedAccessGroup(UI)->getInsertPos() == UI); 7017 })) 7018 continue; 7019 VecValuesToIgnore.insert(Op); 7020 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); 7021 } 7022 7023 for (const auto &[_, Ops] : DeadInvariantStoreOps) { 7024 for (Value *Op : ArrayRef(Ops).drop_back()) 7025 DeadOps.push_back(Op); 7026 } 7027 // Mark ops that would be trivially dead and are only used by ignored 7028 // instructions as free. 7029 BasicBlock *Header = TheLoop->getHeader(); 7030 7031 // Returns true if the block contains only dead instructions. 
  // Such blocks will be removed by VPlan-to-VPlan transforms and won't be
  // considered by the VPlan-based cost model, so skip them in the legacy
  // cost-model as well.
  auto IsEmptyBlock = [this](BasicBlock *BB) {
    return all_of(*BB, [this](Instruction &I) {
      return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
             (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
    });
  };
  for (unsigned I = 0; I != DeadOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(DeadOps[I]);

    // Check if the branch should be considered dead.
    if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
      BasicBlock *ThenBB = Br->getSuccessor(0);
      BasicBlock *ElseBB = Br->getSuccessor(1);
      // Don't consider branches leaving the loop for simplification.
      if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
        continue;
      bool ThenEmpty = IsEmptyBlock(ThenBB);
      bool ElseEmpty = IsEmptyBlock(ElseBB);
      if ((ThenEmpty && ElseEmpty) ||
          (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
           ElseBB->phis().empty()) ||
          (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
           ThenBB->phis().empty())) {
        VecValuesToIgnore.insert(Br);
        DeadOps.push_back(Br->getCondition());
      }
      continue;
    }

    // Skip any op that shouldn't be considered dead.
    if (!Op || !TheLoop->contains(Op) ||
        (isa<PHINode>(Op) && Op->getParent() == Header) ||
        !wouldInstructionBeTriviallyDead(Op, TLI) ||
        any_of(Op->users(), [this, IsLiveOutDead](User *U) {
          return !VecValuesToIgnore.contains(U) &&
                 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
        }))
      continue;

    if (!TheLoop->contains(Op->getParent()))
      continue;

    // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
    // which applies for both scalar and vector versions. Otherwise it is only
    // dead in vector versions, so only add it to VecValuesToIgnore.
    if (all_of(Op->users(),
               [this](User *U) { return ValuesToIgnore.contains(U); }))
      ValuesToIgnore.insert(Op);

    VecValuesToIgnore.insert(Op);
    DeadOps.append(Op->op_begin(), Op->op_end());
  }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (const auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (const auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}

void LoopVectorizationCostModel::collectInLoopReductions() {
  for (const auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such.
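    // For example, on a target with a cheap vector reduce-add, an integer sum
    // reduction can be reduced into a scalar accumulator on every iteration,
    // instead of keeping a wide vector accumulator that is only reduced after
    // the loop.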
7114 unsigned Opcode = RdxDesc.getOpcode(); 7115 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7116 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7117 TargetTransformInfo::ReductionFlags())) 7118 continue; 7119 7120 // Check that we can correctly put the reductions into the loop, by 7121 // finding the chain of operations that leads from the phi to the loop 7122 // exit value. 7123 SmallVector<Instruction *, 4> ReductionOperations = 7124 RdxDesc.getReductionOpChain(Phi, TheLoop); 7125 bool InLoop = !ReductionOperations.empty(); 7126 7127 if (InLoop) { 7128 InLoopReductions.insert(Phi); 7129 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7130 Instruction *LastChain = Phi; 7131 for (auto *I : ReductionOperations) { 7132 InLoopReductionImmediateChains[I] = LastChain; 7133 LastChain = I; 7134 } 7135 } 7136 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7137 << " reduction for phi: " << *Phi << "\n"); 7138 } 7139 } 7140 7141 // This function will select a scalable VF if the target supports scalable 7142 // vectors and a fixed one otherwise. 7143 // TODO: we could return a pair of values that specify the max VF and 7144 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7145 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7146 // doesn't have a cost model that can choose which plan to execute if 7147 // more than one is generated. 7148 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, 7149 LoopVectorizationCostModel &CM) { 7150 unsigned WidestType; 7151 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7152 7153 TargetTransformInfo::RegisterKind RegKind = 7154 TTI.enableScalableVectorization() 7155 ? TargetTransformInfo::RGK_ScalableVector 7156 : TargetTransformInfo::RGK_FixedWidthVector; 7157 7158 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); 7159 unsigned N = RegSize.getKnownMinValue() / WidestType; 7160 return ElementCount::get(N, RegSize.isScalable()); 7161 } 7162 7163 VectorizationFactor 7164 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7165 ElementCount VF = UserVF; 7166 // Outer loop handling: They may require CFG and instruction level 7167 // transformations before even evaluating whether vectorization is profitable. 7168 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7169 // the vectorization pipeline. 7170 if (!OrigLoop->isInnermost()) { 7171 // If the user doesn't provide a vectorization factor, determine a 7172 // reasonable one. 7173 if (UserVF.isZero()) { 7174 VF = determineVPlanVF(TTI, CM); 7175 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7176 7177 // Make sure we have a VF > 1 for stress testing. 7178 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7179 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7180 << "overriding computed VF.\n"); 7181 VF = ElementCount::getFixed(4); 7182 } 7183 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && 7184 !ForceTargetSupportsScalableVectors) { 7185 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. 
Scalable VF requested, but " 7186 << "not supported by the target.\n"); 7187 reportVectorizationFailure( 7188 "Scalable vectorization requested but not supported by the target", 7189 "the scalable user-specified vectorization width for outer-loop " 7190 "vectorization cannot be used because the target does not support " 7191 "scalable vectors.", 7192 "ScalableVFUnfeasible", ORE, OrigLoop); 7193 return VectorizationFactor::Disabled(); 7194 } 7195 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7196 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7197 "VF needs to be a power of two"); 7198 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7199 << "VF " << VF << " to build VPlans.\n"); 7200 buildVPlans(VF, VF); 7201 7202 // For VPlan build stress testing, we bail out after VPlan construction. 7203 if (VPlanBuildStressTest) 7204 return VectorizationFactor::Disabled(); 7205 7206 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7207 } 7208 7209 LLVM_DEBUG( 7210 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7211 "VPlan-native path.\n"); 7212 return VectorizationFactor::Disabled(); 7213 } 7214 7215 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7216 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7217 CM.collectValuesToIgnore(); 7218 CM.collectElementTypesForWidening(); 7219 7220 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7221 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7222 return; 7223 7224 // Invalidate interleave groups if all blocks of loop will be predicated. 7225 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7226 !useMaskedInterleavedAccesses(TTI)) { 7227 LLVM_DEBUG( 7228 dbgs() 7229 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7230 "which requires masked-interleaved support.\n"); 7231 if (CM.InterleaveInfo.invalidateGroups()) 7232 // Invalidating interleave groups also requires invalidating all decisions 7233 // based on them, which includes widening decisions and uniform and scalar 7234 // values. 7235 CM.invalidateCostModelingDecisions(); 7236 } 7237 7238 if (CM.foldTailByMasking()) 7239 Legal->prepareToFoldTailByMasking(); 7240 7241 ElementCount MaxUserVF = 7242 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7243 if (UserVF) { 7244 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) { 7245 reportVectorizationInfo( 7246 "UserVF ignored because it may be larger than the maximal safe VF", 7247 "InvalidUserVF", ORE, OrigLoop); 7248 } else { 7249 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7250 "VF needs to be a power of two"); 7251 // Collect the instructions (and their associated costs) that will be more 7252 // profitable to scalarize. 7253 CM.collectInLoopReductions(); 7254 if (CM.selectUserVectorizationFactor(UserVF)) { 7255 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7256 buildVPlansWithVPRecipes(UserVF, UserVF); 7257 LLVM_DEBUG(printPlans(dbgs())); 7258 return; 7259 } 7260 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7261 "InvalidCost", ORE, OrigLoop); 7262 } 7263 } 7264 7265 // Collect the Vectorization Factor Candidates. 
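  // These are the powers of two up to the maximal fixed-width VF, followed by
  // the scalable powers of two up to the maximal scalable VF, e.g.
  // 1, 2, 4, 8, vscale x 1, vscale x 2, vscale x 4.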
7266 SmallVector<ElementCount> VFCandidates; 7267 for (auto VF = ElementCount::getFixed(1); 7268 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7269 VFCandidates.push_back(VF); 7270 for (auto VF = ElementCount::getScalable(1); 7271 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7272 VFCandidates.push_back(VF); 7273 7274 CM.collectInLoopReductions(); 7275 for (const auto &VF : VFCandidates) { 7276 // Collect Uniform and Scalar instructions after vectorization with VF. 7277 CM.collectUniformsAndScalars(VF); 7278 7279 // Collect the instructions (and their associated costs) that will be more 7280 // profitable to scalarize. 7281 if (VF.isVector()) 7282 CM.collectInstsToScalarize(VF); 7283 } 7284 7285 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7286 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7287 7288 LLVM_DEBUG(printPlans(dbgs())); 7289 } 7290 7291 InstructionCost VPCostContext::getLegacyCost(Instruction *UI, 7292 ElementCount VF) const { 7293 if (ForceTargetInstructionCost.getNumOccurrences()) 7294 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences()); 7295 return CM.getInstructionCost(UI, VF); 7296 } 7297 7298 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { 7299 return CM.ValuesToIgnore.contains(UI) || 7300 (IsVector && CM.VecValuesToIgnore.contains(UI)) || 7301 SkipCostComputation.contains(UI); 7302 } 7303 7304 InstructionCost 7305 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, 7306 VPCostContext &CostCtx) const { 7307 InstructionCost Cost; 7308 // Cost modeling for inductions is inaccurate in the legacy cost model 7309 // compared to the recipes that are generated. To match here initially during 7310 // VPlan cost model bring up directly use the induction costs from the legacy 7311 // cost model. Note that we do this as pre-processing; the VPlan may not have 7312 // any recipes associated with the original induction increment instruction 7313 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute 7314 // the cost of induction phis and increments (both that are represented by 7315 // recipes and those that are not), to avoid distinguishing between them here, 7316 // and skip all recipes that represent induction phis and increments (the 7317 // former case) later on, if they exist, to avoid counting them twice. 7318 // Similarly we pre-compute the cost of any optimized truncates. 7319 // TODO: Switch to more accurate costing based on VPlan. 7320 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) { 7321 Instruction *IVInc = cast<Instruction>( 7322 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 7323 SmallVector<Instruction *> IVInsts = {IVInc}; 7324 for (unsigned I = 0; I != IVInsts.size(); I++) { 7325 for (Value *Op : IVInsts[I]->operands()) { 7326 auto *OpI = dyn_cast<Instruction>(Op); 7327 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse()) 7328 continue; 7329 IVInsts.push_back(OpI); 7330 } 7331 } 7332 IVInsts.push_back(IV); 7333 for (User *U : IV->users()) { 7334 auto *CI = cast<Instruction>(U); 7335 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF)) 7336 continue; 7337 IVInsts.push_back(CI); 7338 } 7339 7340 // If the vector loop gets executed exactly once with the given VF, ignore 7341 // the costs of comparison and induction instructions, as they'll get 7342 // simplified away. 
    // TODO: Remove this code after stepping away from the legacy cost model
    // and adding code to simplify VPlans before calculating their costs.
    auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
    if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
      addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
                                           CostCtx.SkipCostComputation);

    for (Instruction *IVInst : IVInsts) {
      if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
        continue;
      InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
      LLVM_DEBUG({
        dbgs() << "Cost of " << InductionCost << " for VF " << VF
               << ": induction instruction " << *IVInst << "\n";
      });
      Cost += InductionCost;
      CostCtx.SkipCostComputation.insert(IVInst);
    }
  }

  /// Compute the cost of all exiting conditions of the loop using the legacy
  /// cost model. This is to match the legacy behavior, which adds the cost of
  /// all exit conditions. Note that this over-estimates the cost, as there
  /// will be a single condition to control the vector loop.
  SmallVector<BasicBlock *> Exiting;
  CM.TheLoop->getExitingBlocks(Exiting);
  SetVector<Instruction *> ExitInstrs;
  // Collect all exit conditions.
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
    if (!Term)
      continue;
    if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
      ExitInstrs.insert(CondI);
    }
  }
  // Compute the cost of all instructions only feeding the exit conditions.
  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
    Instruction *CondI = ExitInstrs[I];
    if (!OrigLoop->contains(CondI) ||
        !CostCtx.SkipCostComputation.insert(CondI).second)
      continue;
    InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << CondICost << " for VF " << VF
             << ": exit condition instruction " << *CondI << "\n";
    });
    Cost += CondICost;
    for (Value *Op : CondI->operands()) {
      auto *OpI = dyn_cast<Instruction>(Op);
      if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
            return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
                   !ExitInstrs.contains(cast<Instruction>(U));
          }))
        continue;
      ExitInstrs.insert(OpI);
    }
  }

  // The legacy cost model has special logic to compute the cost of in-loop
  // reductions, which may be smaller than the sum of all instructions involved
  // in the reduction.
  // TODO: Switch to costing based on VPlan once the logic has been ported.
  for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
    if (ForceTargetInstructionCost.getNumOccurrences())
      continue;

    if (!CM.isInLoopReduction(RedPhi))
      continue;

    const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
    SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
                                                 ChainOps.end());
    auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
      return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
    };
    // Also include the operands of instructions in the chain, as the
    // cost-model may mark extends as free.
    //
    // For ARM, some of the instructions can be folded into the reduction
    // instruction. So we need to mark all folded instructions free.
7424 // For example: We can fold reduce(mul(ext(A), ext(B))) into one 7425 // instruction. 7426 for (auto *ChainOp : ChainOps) { 7427 for (Value *Op : ChainOp->operands()) { 7428 if (auto *I = dyn_cast<Instruction>(Op)) { 7429 ChainOpsAndOperands.insert(I); 7430 if (I->getOpcode() == Instruction::Mul) { 7431 auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0)); 7432 auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1)); 7433 if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 && 7434 Ext0->getOpcode() == Ext1->getOpcode()) { 7435 ChainOpsAndOperands.insert(Ext0); 7436 ChainOpsAndOperands.insert(Ext1); 7437 } 7438 } 7439 } 7440 } 7441 } 7442 7443 // Pre-compute the cost for I, if it has a reduction pattern cost. 7444 for (Instruction *I : ChainOpsAndOperands) { 7445 auto ReductionCost = CM.getReductionPatternCost( 7446 I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); 7447 if (!ReductionCost) 7448 continue; 7449 7450 assert(!CostCtx.SkipCostComputation.contains(I) && 7451 "reduction op visited multiple times"); 7452 CostCtx.SkipCostComputation.insert(I); 7453 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF 7454 << ":\n in-loop reduction " << *I << "\n"); 7455 Cost += *ReductionCost; 7456 } 7457 } 7458 7459 // Pre-compute the costs for branches except for the backedge, as the number 7460 // of replicate regions in a VPlan may not directly match the number of 7461 // branches, which would lead to different decisions. 7462 // TODO: Compute cost of branches for each replicate region in the VPlan, 7463 // which is more accurate than the legacy cost model. 7464 for (BasicBlock *BB : OrigLoop->blocks()) { 7465 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector())) 7466 continue; 7467 CostCtx.SkipCostComputation.insert(BB->getTerminator()); 7468 if (BB == OrigLoop->getLoopLatch()) 7469 continue; 7470 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF); 7471 Cost += BranchCost; 7472 } 7473 7474 // Pre-compute costs for instructions that are forced-scalar or profitable to 7475 // scalarize. Their costs will be computed separately in the legacy cost 7476 // model. 7477 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) { 7478 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector())) 7479 continue; 7480 CostCtx.SkipCostComputation.insert(ForcedScalar); 7481 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF); 7482 LLVM_DEBUG({ 7483 dbgs() << "Cost of " << ForcedCost << " for VF " << VF 7484 << ": forced scalar " << *ForcedScalar << "\n"; 7485 }); 7486 Cost += ForcedCost; 7487 } 7488 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) { 7489 if (CostCtx.skipCostComputation(Scalarized, VF.isVector())) 7490 continue; 7491 CostCtx.SkipCostComputation.insert(Scalarized); 7492 LLVM_DEBUG({ 7493 dbgs() << "Cost of " << ScalarCost << " for VF " << VF 7494 << ": profitable to scalarize " << *Scalarized << "\n"; 7495 }); 7496 Cost += ScalarCost; 7497 } 7498 7499 return Cost; 7500 } 7501 7502 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, 7503 ElementCount VF) const { 7504 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM); 7505 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); 7506 7507 // Now compute and add the VPlan-based cost. 
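  // The total cost is then the legacy costs precomputed above (inductions,
  // exit conditions, in-loop reductions, branches, forced scalars and
  // instructions profitable to scalarize) plus the recipe-based cost of the
  // plan itself.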
  Cost += Plan.cost(VF, CostCtx);
#ifndef NDEBUG
  unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                    << " (Estimated cost per lane: ");
  if (Cost.isValid()) {
    double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
  } else /* No point dividing an invalid cost - it will still be invalid */
    LLVM_DEBUG(dbgs() << "Invalid");
  LLVM_DEBUG(dbgs() << ")\n");
#endif
  return Cost;
}

#ifndef NDEBUG
/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplifications that
/// the legacy cost-model did not account for.
static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                  VPCostContext &CostCtx,
                                                  Loop *TheLoop) {
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      // The VPlan-based cost model is more accurate for partial reductions and
      // comparing against the legacy cost isn't desirable.
      if (isa<VPPartialReductionRecipe>(&R))
        return true;
      if (Instruction *UI = GetInstructionForCost(&R))
        SeenInstrs.insert(UI);
    }
  }

  // Return true if the loop contains any instructions that are not also part
  // of the VPlan or are skipped for VPlan-based cost computations. This
  // indicates that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
#endif

VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
  if (VPlans.empty())
    return VectorizationFactor::Disabled();
  // If there is a single VPlan with a single VF, return it directly.
  VPlan &FirstPlan = *VPlans[0];
  if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
    return {*FirstPlan.vectorFactors().begin(), 0, 0};

  ElementCount ScalarVF = ElementCount::getFixed(1);
  assert(hasPlanWithVF(ScalarVF) &&
         "More than a single plan/VF w/o any plan having scalar VF");

  // TODO: Compute scalar cost using VPlan-based cost model.
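  // For now the legacy model provides the scalar baseline; it is also recorded
  // as the ScalarCost member of every candidate vectorization factor below.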
7589 InstructionCost ScalarCost = CM.expectedCost(ScalarVF); 7590 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n"); 7591 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost); 7592 VectorizationFactor BestFactor = ScalarFactor; 7593 7594 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 7595 if (ForceVectorization) { 7596 // Ignore scalar width, because the user explicitly wants vectorization. 7597 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 7598 // evaluation. 7599 BestFactor.Cost = InstructionCost::getMax(); 7600 } 7601 7602 for (auto &P : VPlans) { 7603 for (ElementCount VF : P->vectorFactors()) { 7604 if (VF.isScalar()) 7605 continue; 7606 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { 7607 LLVM_DEBUG( 7608 dbgs() 7609 << "LV: Not considering vector loop of width " << VF 7610 << " because it will not generate any vector instructions.\n"); 7611 continue; 7612 } 7613 7614 InstructionCost Cost = cost(*P, VF); 7615 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); 7616 if (isMoreProfitable(CurrentFactor, BestFactor)) 7617 BestFactor = CurrentFactor; 7618 7619 // If profitable add it to ProfitableVF list. 7620 if (isMoreProfitable(CurrentFactor, ScalarFactor)) 7621 ProfitableVFs.push_back(CurrentFactor); 7622 } 7623 } 7624 7625 #ifndef NDEBUG 7626 // Select the optimal vectorization factor according to the legacy cost-model. 7627 // This is now only used to verify the decisions by the new VPlan-based 7628 // cost-model and will be retired once the VPlan-based cost-model is 7629 // stabilized. 7630 VectorizationFactor LegacyVF = selectVectorizationFactor(); 7631 VPlan &BestPlan = getPlanFor(BestFactor.Width); 7632 7633 // Pre-compute the cost and use it to check if BestPlan contains any 7634 // simplifications not accounted for in the legacy cost model. If that's the 7635 // case, don't trigger the assertion, as the extra simplifications may cause a 7636 // different VF to be picked by the VPlan-based cost model. 7637 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM); 7638 precomputeCosts(BestPlan, BestFactor.Width, CostCtx); 7639 assert((BestFactor.Width == LegacyVF.Width || 7640 planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), 7641 CostCtx, OrigLoop) || 7642 planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width), 7643 CostCtx, OrigLoop)) && 7644 " VPlan cost model and legacy cost model disagreed"); 7645 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && 7646 "when vectorizing, the scalar cost must be computed."); 7647 #endif 7648 7649 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n"); 7650 return BestFactor; 7651 } 7652 7653 static void addRuntimeUnrollDisableMetaData(Loop *L) { 7654 SmallVector<Metadata *, 4> MDs; 7655 // Reserve first location for self reference to the LoopID metadata node. 7656 MDs.push_back(nullptr); 7657 bool IsUnrollMetadata = false; 7658 MDNode *LoopID = L->getLoopID(); 7659 if (LoopID) { 7660 // First find existing loop unrolling disable metadata. 
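    // That is, look for an operand MDNode whose first operand is an MDString
    // starting with "llvm.loop.unroll.disable"; if one exists, no additional
    // unroll metadata needs to be added below.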
7661 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { 7662 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I)); 7663 if (MD) { 7664 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7665 IsUnrollMetadata = 7666 S && S->getString().starts_with("llvm.loop.unroll.disable"); 7667 } 7668 MDs.push_back(LoopID->getOperand(I)); 7669 } 7670 } 7671 7672 if (!IsUnrollMetadata) { 7673 // Add runtime unroll disable metadata. 7674 LLVMContext &Context = L->getHeader()->getContext(); 7675 SmallVector<Metadata *, 1> DisableOperands; 7676 DisableOperands.push_back( 7677 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7678 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7679 MDs.push_back(DisableNode); 7680 MDNode *NewLoopID = MDNode::get(Context, MDs); 7681 // Set operand 0 to refer to the loop id itself. 7682 NewLoopID->replaceOperandWith(0, NewLoopID); 7683 L->setLoopID(NewLoopID); 7684 } 7685 } 7686 7687 // If \p R is a ComputeReductionResult when vectorizing the epilog loop, 7688 // fix the reduction's scalar PHI node by adding the incoming value from the 7689 // main vector loop. 7690 static void fixReductionScalarResumeWhenVectorizingEpilog( 7691 VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, 7692 BasicBlock *BypassBlock) { 7693 auto *EpiRedResult = dyn_cast<VPInstruction>(R); 7694 if (!EpiRedResult || 7695 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult) 7696 return; 7697 7698 auto *EpiRedHeaderPhi = 7699 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0)); 7700 const RecurrenceDescriptor &RdxDesc = 7701 EpiRedHeaderPhi->getRecurrenceDescriptor(); 7702 Value *MainResumeValue = 7703 EpiRedHeaderPhi->getStartValue()->getUnderlyingValue(); 7704 if (RecurrenceDescriptor::isAnyOfRecurrenceKind( 7705 RdxDesc.getRecurrenceKind())) { 7706 auto *Cmp = cast<ICmpInst>(MainResumeValue); 7707 assert(Cmp->getPredicate() == CmpInst::ICMP_NE && 7708 "AnyOf expected to start with ICMP_NE"); 7709 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() && 7710 "AnyOf expected to start by comparing main resume value to original " 7711 "start value"); 7712 MainResumeValue = Cmp->getOperand(0); 7713 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( 7714 RdxDesc.getRecurrenceKind())) { 7715 using namespace llvm::PatternMatch; 7716 Value *Cmp, *OrigResumeV; 7717 bool IsExpectedPattern = 7718 match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), 7719 m_Specific(RdxDesc.getSentinelValue()), 7720 m_Value(OrigResumeV))) && 7721 match(Cmp, 7722 m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), 7723 m_Specific(RdxDesc.getRecurrenceStartValue()))); 7724 assert(IsExpectedPattern && "Unexpected reduction resume pattern"); 7725 (void)IsExpectedPattern; 7726 MainResumeValue = OrigResumeV; 7727 } 7728 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue); 7729 7730 // When fixing reductions in the epilogue loop we should already have 7731 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry 7732 // over the incoming values correctly. 
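  // In other words, the epilogue's resume phi must take, for the additional
  // bypass block, the same incoming value as the resume phi created for the
  // main vector loop.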
7733 using namespace VPlanPatternMatch; 7734 auto IsResumePhi = [](VPUser *U) { 7735 return match( 7736 U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue())); 7737 }; 7738 assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 && 7739 "ResumePhi must have a single user"); 7740 auto *EpiResumePhiVPI = 7741 cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi)); 7742 auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true)); 7743 EpiResumePhi->setIncomingValueForBlock( 7744 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock)); 7745 } 7746 7747 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( 7748 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7749 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue, 7750 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7751 assert(BestVPlan.hasVF(BestVF) && 7752 "Trying to execute plan with unsupported VF"); 7753 assert(BestVPlan.hasUF(BestUF) && 7754 "Trying to execute plan with unsupported UF"); 7755 assert( 7756 ((VectorizingEpilogue && ExpandedSCEVs) || 7757 (!VectorizingEpilogue && !ExpandedSCEVs)) && 7758 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7759 7760 // TODO: Move to VPlan transform stage once the transition to the VPlan-based 7761 // cost model is complete for better cost estimates. 7762 VPlanTransforms::unrollByUF(BestVPlan, BestUF, 7763 OrigLoop->getHeader()->getContext()); 7764 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7765 VPlanTransforms::convertToConcreteRecipes(BestVPlan); 7766 7767 // Perform the actual loop transformation. 7768 VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV, 7769 &BestVPlan, OrigLoop->getParentLoop(), 7770 Legal->getWidestInductionType()); 7771 7772 #ifdef EXPENSIVE_CHECKS 7773 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 7774 #endif 7775 7776 // 0. Generate SCEV-dependent code in the entry, including TripCount, before 7777 // making any changes to the CFG. 7778 if (!BestVPlan.getEntry()->empty()) 7779 BestVPlan.getEntry()->execute(&State); 7780 7781 if (!ILV.getTripCount()) 7782 ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0))); 7783 else 7784 assert(VectorizingEpilogue && "should only re-use the existing trip " 7785 "count during epilogue vectorization"); 7786 7787 // 1. Set up the skeleton for vectorization, including vector pre-header and 7788 // middle block. The vector loop is created during VPlan execution. 7789 VPBasicBlock *VectorPH = 7790 cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor()); 7791 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton( 7792 ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs); 7793 if (VectorizingEpilogue) 7794 VPlanTransforms::removeDeadRecipes(BestVPlan); 7795 7796 // Only use noalias metadata when using memory checks guaranteeing no overlap 7797 // across all iterations. 7798 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7799 std::unique_ptr<LoopVersioning> LVer = nullptr; 7800 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7801 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7802 7803 // We currently don't use LoopVersioning for the actual loop cloning but we 7804 // still use it to add the noalias metadata. 7805 // TODO: Find a better way to re-use LoopVersioning functionality to add 7806 // metadata. 
7807 LVer = std::make_unique<LoopVersioning>( 7808 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7809 PSE.getSE()); 7810 State.LVer = &*LVer; 7811 State.LVer->prepareNoAliasMetadata(); 7812 } 7813 7814 ILV.printDebugTracesAtStart(); 7815 7816 //===------------------------------------------------===// 7817 // 7818 // Notice: any optimization or new instruction that go 7819 // into the code below should also be implemented in 7820 // the cost-model. 7821 // 7822 //===------------------------------------------------===// 7823 7824 // 2. Copy and widen instructions from the old loop into the new loop. 7825 BestVPlan.prepareToExecute( 7826 ILV.getTripCount(), 7827 ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); 7828 replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); 7829 7830 BestVPlan.execute(&State); 7831 7832 auto *MiddleVPBB = BestVPlan.getMiddleBlock(); 7833 // 2.5 When vectorizing the epilogue, fix reduction and induction resume 7834 // values from the additional bypass block. 7835 if (VectorizingEpilogue) { 7836 assert(!ILV.Legal->hasUncountableEarlyExit() && 7837 "Epilogue vectorisation not yet supported with early exits"); 7838 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); 7839 for (VPRecipeBase &R : *MiddleVPBB) { 7840 fixReductionScalarResumeWhenVectorizingEpilog( 7841 &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock); 7842 } 7843 BasicBlock *PH = OrigLoop->getLoopPreheader(); 7844 for (const auto &[IVPhi, _] : Legal->getInductionVars()) { 7845 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); 7846 Value *V = ILV.getInductionAdditionalBypassValue(IVPhi); 7847 Inc->setIncomingValueForBlock(BypassBlock, V); 7848 } 7849 } 7850 7851 // 2.6. Maintain Loop Hints 7852 // Keep all loop hints from the original loop on the vector loop (we'll 7853 // replace the vectorizer-specific hints below). 7854 if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) { 7855 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7856 7857 std::optional<MDNode *> VectorizedLoopID = 7858 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7859 LLVMLoopVectorizeFollowupVectorized}); 7860 7861 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); 7862 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7863 if (VectorizedLoopID) { 7864 L->setLoopID(*VectorizedLoopID); 7865 } else { 7866 // Keep all loop hints from the original loop on the vector loop (we'll 7867 // replace the vectorizer-specific hints below). 7868 if (MDNode *LID = OrigLoop->getLoopID()) 7869 L->setLoopID(LID); 7870 7871 LoopVectorizeHints Hints(L, true, *ORE); 7872 Hints.setAlreadyVectorized(); 7873 } 7874 TargetTransformInfo::UnrollingPreferences UP; 7875 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7876 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) 7877 addRuntimeUnrollDisableMetaData(L); 7878 } 7879 7880 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7881 // predication, updating analyses. 7882 ILV.fixVectorizedLoop(State); 7883 7884 ILV.printDebugTracesAtEnd(); 7885 7886 // 4. Adjust branch weight of the branch in the middle block. 7887 if (BestVPlan.getVectorLoopRegion()) { 7888 auto *MiddleVPBB = BestVPlan.getMiddleBlock(); 7889 auto *MiddleTerm = 7890 cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); 7891 if (MiddleTerm->isConditional() && 7892 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 7893 // Assume that `Count % VectorTripCount` is equally distributed. 
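      // For example, with VF=4 and UF=2 the vector step is 8, so the weights
      // become {1, 7}: the remainder is expected to be zero (skipping the
      // scalar loop) only one time in eight.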
7894 unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); 7895 assert(TripCount > 0 && "trip count should not be zero"); 7896 const uint32_t Weights[] = {1, TripCount - 1}; 7897 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); 7898 } 7899 } 7900 7901 return State.ExpandedSCEVs; 7902 } 7903 7904 //===--------------------------------------------------------------------===// 7905 // EpilogueVectorizerMainLoop 7906 //===--------------------------------------------------------------------===// 7907 7908 /// This function is partially responsible for generating the control flow 7909 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7910 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7911 const SCEV2ValueTy &ExpandedSCEVs) { 7912 createVectorLoopSkeleton(""); 7913 7914 // Generate the code to check the minimum iteration count of the vector 7915 // epilogue (see below). 7916 EPI.EpilogueIterationCountCheck = 7917 emitIterationCountCheck(LoopScalarPreHeader, true); 7918 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7919 7920 // Generate the code to check any assumptions that we've made for SCEV 7921 // expressions. 7922 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7923 7924 // Generate the code that checks at runtime if arrays overlap. We put the 7925 // checks into a separate block to make the more common case of few elements 7926 // faster. 7927 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7928 7929 // Generate the iteration count check for the main loop, *after* the check 7930 // for the epilogue loop, so that the path-length is shorter for the case 7931 // that goes directly through the vector epilogue. The longer-path length for 7932 // the main loop is compensated for, by the gain from vectorizing the larger 7933 // trip count. Note: the branch will get updated later on when we vectorize 7934 // the epilogue. 7935 EPI.MainLoopIterationCountCheck = 7936 emitIterationCountCheck(LoopScalarPreHeader, false); 7937 7938 // Generate the induction variable. 7939 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7940 7941 return LoopVectorPreHeader; 7942 } 7943 7944 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7945 LLVM_DEBUG({ 7946 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7947 << "Main Loop VF:" << EPI.MainLoopVF 7948 << ", Main Loop UF:" << EPI.MainLoopUF 7949 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7950 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7951 }); 7952 } 7953 7954 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7955 DEBUG_WITH_TYPE(VerboseDebug, { 7956 dbgs() << "intermediate fn:\n" 7957 << *OrigLoop->getHeader()->getParent() << "\n"; 7958 }); 7959 } 7960 7961 BasicBlock * 7962 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7963 bool ForEpilogue) { 7964 assert(Bypass && "Expected valid bypass basic block."); 7965 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7966 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7967 Value *Count = getTripCount(); 7968 // Reuse existing vector loop preheader for TC checks. 7969 // Note that new preheader block is generated for vector loop. 7970 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7971 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7972 7973 // Generate code to check if the loop's trip count is less than VF * UF of the 7974 // main vector loop. 
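  // For illustration, with VFactor=4 and UFactor=2 this emits something like
  //   %min.iters.check = icmp ult i64 %count, 8
  // (ule instead of ult when a scalar epilogue is required, so at least one
  // iteration remains for it) and branches to the bypass block if it is true.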
7975 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7976 : VF.isVector()) 7977 ? ICmpInst::ICMP_ULE 7978 : ICmpInst::ICMP_ULT; 7979 7980 Value *CheckMinIters = Builder.CreateICmp( 7981 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7982 "min.iters.check"); 7983 7984 if (!ForEpilogue) 7985 TCCheckBlock->setName("vector.main.loop.iter.check"); 7986 7987 // Create new preheader for vector loop. 7988 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7989 DT, LI, nullptr, "vector.ph"); 7990 7991 if (ForEpilogue) { 7992 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7993 DT->getNode(Bypass)->getIDom()) && 7994 "TC check is expected to dominate Bypass"); 7995 7996 LoopBypassBlocks.push_back(TCCheckBlock); 7997 7998 // Save the trip count so we don't have to regenerate it in the 7999 // vec.epilog.iter.check. This is safe to do because the trip count 8000 // generated here dominates the vector epilog iter check. 8001 EPI.TripCount = Count; 8002 } 8003 8004 BranchInst &BI = 8005 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 8006 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 8007 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); 8008 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 8009 8010 introduceCheckBlockInVPlan(TCCheckBlock); 8011 return TCCheckBlock; 8012 } 8013 8014 //===--------------------------------------------------------------------===// 8015 // EpilogueVectorizerEpilogueLoop 8016 //===--------------------------------------------------------------------===// 8017 8018 /// This function is partially responsible for generating the control flow 8019 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8020 BasicBlock * 8021 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( 8022 const SCEV2ValueTy &ExpandedSCEVs) { 8023 createVectorLoopSkeleton("vec.epilog."); 8024 8025 // Now, compare the remaining count and if there aren't enough iterations to 8026 // execute the vectorized epilogue skip to the scalar part. 8027 LoopVectorPreHeader->setName("vec.epilog.ph"); 8028 BasicBlock *VecEpilogueIterationCountCheck = 8029 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, 8030 nullptr, "vec.epilog.iter.check", true); 8031 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 8032 VecEpilogueIterationCountCheck); 8033 AdditionalBypassBlock = VecEpilogueIterationCountCheck; 8034 8035 // Adjust the control flow taking the state info from the main loop 8036 // vectorization into account. 
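  // Concretely, the main loop's iteration-count check now branches to the
  // epilogue's vector preheader, while the epilogue iteration-count check and
  // the SCEV/memory check blocks are redirected to the scalar preheader.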
8037 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8038 "expected this to be saved from the previous pass."); 8039 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8040 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8041 8042 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8043 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8044 8045 if (EPI.SCEVSafetyCheck) 8046 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8047 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8048 if (EPI.MemSafetyCheck) 8049 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8050 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8051 8052 DT->changeImmediateDominator(LoopScalarPreHeader, 8053 EPI.EpilogueIterationCountCheck); 8054 // Keep track of bypass blocks, as they feed start values to the induction and 8055 // reduction phis in the scalar loop preheader. 8056 if (EPI.SCEVSafetyCheck) 8057 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8058 if (EPI.MemSafetyCheck) 8059 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8060 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8061 8062 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 8063 // reductions which merge control-flow from the latch block and the middle 8064 // block. Update the incoming values here and move the Phi into the preheader. 8065 SmallVector<PHINode *, 4> PhisInBlock; 8066 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 8067 PhisInBlock.push_back(&Phi); 8068 8069 for (PHINode *Phi : PhisInBlock) { 8070 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 8071 Phi->replaceIncomingBlockWith( 8072 VecEpilogueIterationCountCheck->getSinglePredecessor(), 8073 VecEpilogueIterationCountCheck); 8074 8075 // If the phi doesn't have an incoming value from the 8076 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 8077 // value and also those from other check blocks. This is needed for 8078 // reduction phis only. 8079 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { 8080 return EPI.EpilogueIterationCountCheck == IncB; 8081 })) 8082 continue; 8083 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 8084 if (EPI.SCEVSafetyCheck) 8085 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 8086 if (EPI.MemSafetyCheck) 8087 Phi->removeIncomingValue(EPI.MemSafetyCheck); 8088 } 8089 8090 // Generate bypass values from the additional bypass block. Note that when the 8091 // vectorized epilogue is skipped due to iteration count check, then the 8092 // resume value for the induction variable comes from the trip count of the 8093 // main vector loop, passed as the second argument. 
8094 createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount); 8095 return LoopVectorPreHeader; 8096 } 8097 8098 BasicBlock * 8099 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8100 BasicBlock *Bypass, BasicBlock *Insert) { 8101 8102 assert(EPI.TripCount && 8103 "Expected trip count to have been saved in the first pass."); 8104 assert( 8105 (!isa<Instruction>(EPI.TripCount) || 8106 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8107 "saved trip count does not dominate insertion point."); 8108 Value *TC = EPI.TripCount; 8109 IRBuilder<> Builder(Insert->getTerminator()); 8110 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8111 8112 // Generate code to check if the loop's trip count is less than VF * UF of the 8113 // vector epilogue loop. 8114 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) 8115 ? ICmpInst::ICMP_ULE 8116 : ICmpInst::ICMP_ULT; 8117 8118 Value *CheckMinIters = 8119 Builder.CreateICmp(P, Count, 8120 createStepForVF(Builder, Count->getType(), 8121 EPI.EpilogueVF, EPI.EpilogueUF), 8122 "min.epilog.iters.check"); 8123 8124 BranchInst &BI = 8125 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 8126 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 8127 unsigned MainLoopStep = UF * VF.getKnownMinValue(); 8128 unsigned EpilogueLoopStep = 8129 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); 8130 // We assume the remaining `Count` is equally distributed in 8131 // [0, MainLoopStep) 8132 // So the probability for `Count < EpilogueLoopStep` should be 8133 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep 8134 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); 8135 const uint32_t Weights[] = {EstimatedSkipCount, 8136 MainLoopStep - EstimatedSkipCount}; 8137 setBranchWeights(BI, Weights, /*IsExpected=*/false); 8138 } 8139 ReplaceInstWithInst(Insert->getTerminator(), &BI); 8140 LoopBypassBlocks.push_back(Insert); 8141 8142 // A new entry block has been created for the epilogue VPlan. Hook it in, as 8143 // otherwise we would try to modify the entry to the main vector loop. 8144 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert); 8145 VPBasicBlock *OldEntry = Plan.getEntry(); 8146 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry); 8147 Plan.setEntry(NewEntry); 8148 // OldEntry is now dead and will be cleaned up when the plan gets destroyed. 
8149 8150 introduceCheckBlockInVPlan(Insert); 8151 return Insert; 8152 } 8153 8154 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8155 LLVM_DEBUG({ 8156 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8157 << "Epilogue Loop VF:" << EPI.EpilogueVF 8158 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8159 }); 8160 } 8161 8162 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8163 DEBUG_WITH_TYPE(VerboseDebug, { 8164 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8165 }); 8166 } 8167 8168 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>> 8169 VPRecipeBuilder::mapToVPValues(User::op_range Operands) { 8170 std::function<VPValue *(Value *)> Fn = [this](Value *Op) { 8171 return getVPValueOrAddLiveIn(Op); 8172 }; 8173 return map_range(Operands, Fn); 8174 } 8175 8176 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) { 8177 BasicBlock *Src = SI->getParent(); 8178 assert(!OrigLoop->isLoopExiting(Src) && 8179 all_of(successors(Src), 8180 [this](BasicBlock *Succ) { 8181 return OrigLoop->getHeader() != Succ; 8182 }) && 8183 "unsupported switch either exiting loop or continuing to header"); 8184 // Create masks where the terminator in Src is a switch. We create mask for 8185 // all edges at the same time. This is more efficient, as we can create and 8186 // collect compares for all cases once. 8187 VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition()); 8188 BasicBlock *DefaultDst = SI->getDefaultDest(); 8189 MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares; 8190 for (auto &C : SI->cases()) { 8191 BasicBlock *Dst = C.getCaseSuccessor(); 8192 assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created"); 8193 // Cases whose destination is the same as default are redundant and can be 8194 // ignored - they will get there anyhow. 8195 if (Dst == DefaultDst) 8196 continue; 8197 auto &Compares = Dst2Compares[Dst]; 8198 VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue()); 8199 Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V)); 8200 } 8201 8202 // We need to handle 2 separate cases below for all entries in Dst2Compares, 8203 // which excludes destinations matching the default destination. 8204 VPValue *SrcMask = getBlockInMask(Src); 8205 VPValue *DefaultMask = nullptr; 8206 for (const auto &[Dst, Conds] : Dst2Compares) { 8207 // 1. Dst is not the default destination. Dst is reached if any of the cases 8208 // with destination == Dst are taken. Join the conditions for each case 8209 // whose destination == Dst using an OR. 8210 VPValue *Mask = Conds[0]; 8211 for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front()) 8212 Mask = Builder.createOr(Mask, V); 8213 if (SrcMask) 8214 Mask = Builder.createLogicalAnd(SrcMask, Mask); 8215 EdgeMaskCache[{Src, Dst}] = Mask; 8216 8217 // 2. Create the mask for the default destination, which is reached if none 8218 // of the cases with destination != default destination are taken. Join the 8219 // conditions for each case where the destination is != Dst using an OR and 8220 // negate it. 8221 DefaultMask = DefaultMask ? 
Builder.createOr(DefaultMask, Mask) : Mask; 8222 } 8223 8224 if (DefaultMask) { 8225 DefaultMask = Builder.createNot(DefaultMask); 8226 if (SrcMask) 8227 DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask); 8228 } 8229 EdgeMaskCache[{Src, DefaultDst}] = DefaultMask; 8230 } 8231 8232 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 8233 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8234 8235 // Look for cached value. 8236 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8237 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8238 if (ECEntryIt != EdgeMaskCache.end()) 8239 return ECEntryIt->second; 8240 8241 if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) { 8242 createSwitchEdgeMasks(SI); 8243 assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?"); 8244 return EdgeMaskCache[Edge]; 8245 } 8246 8247 VPValue *SrcMask = getBlockInMask(Src); 8248 8249 // The terminator has to be a branch inst! 8250 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8251 assert(BI && "Unexpected terminator found"); 8252 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8253 return EdgeMaskCache[Edge] = SrcMask; 8254 8255 // If source is an exiting block, we know the exit edge is dynamically dead 8256 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8257 // adding uses of an otherwise potentially dead instruction unless we are 8258 // vectorizing a loop with uncountable exits. In that case, we always 8259 // materialize the mask. 8260 if (OrigLoop->isLoopExiting(Src) && 8261 Src != Legal->getUncountableEarlyExitingBlock()) 8262 return EdgeMaskCache[Edge] = SrcMask; 8263 8264 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); 8265 assert(EdgeMask && "No Edge Mask found for condition"); 8266 8267 if (BI->getSuccessor(0) != Dst) 8268 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8269 8270 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8271 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask 8272 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' 8273 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8274 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc()); 8275 } 8276 8277 return EdgeMaskCache[Edge] = EdgeMask; 8278 } 8279 8280 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { 8281 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8282 8283 // Look for cached value. 8284 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8285 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge); 8286 assert(ECEntryIt != EdgeMaskCache.end() && 8287 "looking up mask for edge which has not been created"); 8288 return ECEntryIt->second; 8289 } 8290 8291 void VPRecipeBuilder::createHeaderMask() { 8292 BasicBlock *Header = OrigLoop->getHeader(); 8293 8294 // When not folding the tail, use nullptr to model all-true mask. 8295 if (!CM.foldTailByMasking()) { 8296 BlockMaskCache[Header] = nullptr; 8297 return; 8298 } 8299 8300 // Introduce the early-exit compare IV <= BTC to form header block mask. 8301 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8302 // constructing the desired canonical IV in the header block as its first 8303 // non-phi instructions. 
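// For illustration (assuming VF=4, UF=1): the widened canonical IV yields the
// per-lane values <iv, iv+1, iv+2, iv+3>, and the header mask is the lane-wise
// compare <iv, iv+1, iv+2, iv+3> ule splat(BTC), which disables the surplus
// lanes of the final, partial iteration.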
8304 8305 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); 8306 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8307 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); 8308 HeaderVPBB->insert(IV, NewInsertionPoint); 8309 8310 VPBuilder::InsertPointGuard Guard(Builder); 8311 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8312 VPValue *BlockMask = nullptr; 8313 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 8314 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); 8315 BlockMaskCache[Header] = BlockMask; 8316 } 8317 8318 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { 8319 // Return the cached value. 8320 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); 8321 assert(BCEntryIt != BlockMaskCache.end() && 8322 "Trying to access mask for block without one."); 8323 return BCEntryIt->second; 8324 } 8325 8326 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { 8327 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8328 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); 8329 assert(OrigLoop->getHeader() != BB && 8330 "Loop header must have cached block mask"); 8331 8332 // All-one mask is modelled as no-mask following the convention for masked 8333 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8334 VPValue *BlockMask = nullptr; 8335 // This is the block mask. We OR all unique incoming edges. 8336 for (auto *Predecessor : 8337 SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) { 8338 VPValue *EdgeMask = createEdgeMask(Predecessor, BB); 8339 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. 8340 BlockMaskCache[BB] = EdgeMask; 8341 return; 8342 } 8343 8344 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8345 BlockMask = EdgeMask; 8346 continue; 8347 } 8348 8349 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8350 } 8351 8352 BlockMaskCache[BB] = BlockMask; 8353 } 8354 8355 VPWidenMemoryRecipe * 8356 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, 8357 VFRange &Range) { 8358 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8359 "Must be called with either a load or store"); 8360 8361 auto WillWiden = [&](ElementCount VF) -> bool { 8362 LoopVectorizationCostModel::InstWidening Decision = 8363 CM.getWideningDecision(I, VF); 8364 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8365 "CM decision should be taken at this point."); 8366 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8367 return true; 8368 if (CM.isScalarAfterVectorization(I, VF) || 8369 CM.isProfitableToScalarize(I, VF)) 8370 return false; 8371 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8372 }; 8373 8374 if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range)) 8375 return nullptr; 8376 8377 VPValue *Mask = nullptr; 8378 if (Legal->isMaskRequired(I)) 8379 Mask = getBlockInMask(I->getParent()); 8380 8381 // Determine if the pointer operand of the access is either consecutive or 8382 // reverse consecutive. 8383 LoopVectorizationCostModel::InstWidening Decision = 8384 CM.getWideningDecision(I, Range.Start); 8385 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8386 bool Consecutive = 8387 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8388 8389 VPValue *Ptr = isa<LoadInst>(I) ? 
Operands[0] : Operands[1];
8390 if (Consecutive) {
8391 auto *GEP = dyn_cast<GetElementPtrInst>(
8392 Ptr->getUnderlyingValue()->stripPointerCasts());
8393 VPSingleDefRecipe *VectorPtr;
8394 if (Reverse) {
8395 // When folding the tail, we may compute an address that we don't compute in
8396 // the original scalar loop and it may not be inbounds. Drop Inbounds in that
8397 // case.
8398 GEPNoWrapFlags Flags =
8399 (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8400 ? GEPNoWrapFlags::none()
8401 : GEPNoWrapFlags::inBounds();
8402 VectorPtr = new VPReverseVectorPointerRecipe(
8403 Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8404 } else {
8405 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8406 GEP ? GEP->getNoWrapFlags()
8407 : GEPNoWrapFlags::none(),
8408 I->getDebugLoc());
8409 }
8410 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8411 Ptr = VectorPtr;
8412 }
8413 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8414 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8415 I->getDebugLoc());
8416
8417 StoreInst *Store = cast<StoreInst>(I);
8418 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8419 Reverse, I->getDebugLoc());
8420 }
8421
8422 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8423 /// insert a recipe to expand the step for the induction recipe.
8424 static VPWidenIntOrFpInductionRecipe *
8425 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8426 VPValue *Start, const InductionDescriptor &IndDesc,
8427 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8428 assert(IndDesc.getStartValue() ==
8429 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8430 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8431 "step must be loop invariant");
8432
8433 VPValue *Step =
8434 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8435 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8436 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8437 IndDesc, TruncI,
8438 TruncI->getDebugLoc());
8439 }
8440 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8441 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8442 IndDesc, Phi->getDebugLoc());
8443 }
8444
8445 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8446 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
8447
8448 // Check if this is an integer or fp induction. If so, build the recipe that
8449 // produces its scalar and vector values.
8450 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8451 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8452 *PSE.getSE(), *OrigLoop);
8453
8454 // Check if this is pointer induction. If so, build the recipe for it.
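// For illustration: a pointer IV such as 'p = phi [base, ph], [p.next, latch];
// p.next = getelementptr i8, ptr p, i64 stride' is widened below with a scalar
// step expanded from SCEV, analogously to the integer/fp case above.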
8455 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8456 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8457 *PSE.getSE()); 8458 return new VPWidenPointerInductionRecipe( 8459 Phi, Operands[0], Step, *II, 8460 LoopVectorizationPlanner::getDecisionAndClampRange( 8461 [&](ElementCount VF) { 8462 return CM.isScalarAfterVectorization(Phi, VF); 8463 }, 8464 Range), 8465 Phi->getDebugLoc()); 8466 } 8467 return nullptr; 8468 } 8469 8470 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8471 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { 8472 // Optimize the special case where the source is a constant integer 8473 // induction variable. Notice that we can only optimize the 'trunc' case 8474 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8475 // (c) other casts depend on pointer size. 8476 8477 // Determine whether \p K is a truncation based on an induction variable that 8478 // can be optimized. 8479 auto IsOptimizableIVTruncate = 8480 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8481 return [=](ElementCount VF) -> bool { 8482 return CM.isOptimizableIVTruncate(K, VF); 8483 }; 8484 }; 8485 8486 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8487 IsOptimizableIVTruncate(I), Range)) { 8488 8489 auto *Phi = cast<PHINode>(I->getOperand(0)); 8490 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8491 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue()); 8492 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), 8493 *OrigLoop); 8494 } 8495 return nullptr; 8496 } 8497 8498 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, 8499 ArrayRef<VPValue *> Operands) { 8500 unsigned NumIncoming = Phi->getNumIncomingValues(); 8501 8502 // We know that all PHIs in non-header blocks are converted into selects, so 8503 // we don't have to worry about the insertion order and we can just use the 8504 // builder. At this point we generate the predication tree. There may be 8505 // duplications since this is a simple recursive scan, but future 8506 // optimizations will clean it up. 
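// For illustration: a phi with incoming values (v0 from b0, v1 from b1)
// becomes a blend whose operands interleave each value with the mask of its
// incoming edge, and which is later lowered to a select chain, roughly
// select(edge.mask.b1, v1, v0).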
8507 SmallVector<VPValue *, 2> OperandsWithMask;
8508
8509 for (unsigned In = 0; In < NumIncoming; In++) {
8510 OperandsWithMask.push_back(Operands[In]);
8511 VPValue *EdgeMask =
8512 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8513 if (!EdgeMask) {
8514 assert(In == 0 && "Both null and non-null edge masks found");
8515 assert(all_equal(Operands) &&
8516 "Distinct incoming values with one having a full mask");
8517 break;
8518 }
8519 OperandsWithMask.push_back(EdgeMask);
8520 }
8521 return new VPBlendRecipe(Phi, OperandsWithMask);
8522 }
8523
8524 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8525 ArrayRef<VPValue *> Operands,
8526 VFRange &Range) {
8527 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8528 [this, CI](ElementCount VF) {
8529 return CM.isScalarWithPredication(CI, VF);
8530 },
8531 Range);
8532
8533 if (IsPredicated)
8534 return nullptr;
8535
8536 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8537 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8538 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8539 ID == Intrinsic::pseudoprobe ||
8540 ID == Intrinsic::experimental_noalias_scope_decl))
8541 return nullptr;
8542
8543 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8544
8545 // Is it beneficial to perform an intrinsic call compared to a lib call?
8546 bool ShouldUseVectorIntrinsic =
8547 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8548 [&](ElementCount VF) -> bool {
8549 return CM.getCallWideningDecision(CI, VF).Kind ==
8550 LoopVectorizationCostModel::CM_IntrinsicCall;
8551 },
8552 Range);
8553 if (ShouldUseVectorIntrinsic)
8554 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8555 CI->getDebugLoc());
8556
8557 Function *Variant = nullptr;
8558 std::optional<unsigned> MaskPos;
8559 // Is it better to call a vectorized version of the function than to scalarize
8560 // the call?
8561 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8562 [&](ElementCount VF) -> bool {
8563 // The following case may be scalarized depending on the VF.
8564 // The flag shows whether we can use a usual Call for the vectorized
8565 // version of the instruction.
8566
8567 // If we've found a variant at a previous VF, then stop looking. A
8568 // vectorized variant of a function expects input in a certain shape
8569 // -- basically the number of input registers, the number of lanes
8570 // per register, and whether there's a mask required.
8571 // We store a pointer to the variant in the VPWidenCallRecipe, so
8572 // once we have an appropriate variant it's only valid for that VF.
8573 // This will force a different vplan to be generated for each VF that
8574 // finds a valid variant.
8575 if (Variant)
8576 return false;
8577 LoopVectorizationCostModel::CallWideningDecision Decision =
8578 CM.getCallWideningDecision(CI, VF);
8579 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8580 Variant = Decision.Variant;
8581 MaskPos = Decision.MaskPos;
8582 return true;
8583 }
8584
8585 return false;
8586 },
8587 Range);
8588 if (ShouldUseVectorCall) {
8589 if (MaskPos.has_value()) {
8590 // We have 2 cases that would require a mask:
8591 // 1) The block needs to be predicated, either due to a conditional
8592 // in the scalar loop or use of an active lane mask with
8593 // tail-folding, and we use the appropriate mask for the block.
8594 // 2) No mask is required for the block, but the only available 8595 // vector variant at this VF requires a mask, so we synthesize an 8596 // all-true mask. 8597 VPValue *Mask = nullptr; 8598 if (Legal->isMaskRequired(CI)) 8599 Mask = getBlockInMask(CI->getParent()); 8600 else 8601 Mask = Plan.getOrAddLiveIn( 8602 ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext()))); 8603 8604 Ops.insert(Ops.begin() + *MaskPos, Mask); 8605 } 8606 8607 Ops.push_back(Operands.back()); 8608 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc()); 8609 } 8610 8611 return nullptr; 8612 } 8613 8614 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8615 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8616 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8617 // Instruction should be widened, unless it is scalar after vectorization, 8618 // scalarization is profitable or it is predicated. 8619 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8620 return CM.isScalarAfterVectorization(I, VF) || 8621 CM.isProfitableToScalarize(I, VF) || 8622 CM.isScalarWithPredication(I, VF); 8623 }; 8624 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8625 Range); 8626 } 8627 8628 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8629 ArrayRef<VPValue *> Operands, 8630 VPBasicBlock *VPBB) { 8631 switch (I->getOpcode()) { 8632 default: 8633 return nullptr; 8634 case Instruction::SDiv: 8635 case Instruction::UDiv: 8636 case Instruction::SRem: 8637 case Instruction::URem: { 8638 // If not provably safe, use a select to form a safe divisor before widening the 8639 // div/rem operation itself. Otherwise fall through to general handling below. 8640 if (CM.isPredicatedInst(I)) { 8641 SmallVector<VPValue *> Ops(Operands); 8642 VPValue *Mask = getBlockInMask(I->getParent()); 8643 VPValue *One = 8644 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); 8645 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); 8646 Ops[1] = SafeRHS; 8647 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); 8648 } 8649 [[fallthrough]]; 8650 } 8651 case Instruction::Add: 8652 case Instruction::And: 8653 case Instruction::AShr: 8654 case Instruction::FAdd: 8655 case Instruction::FCmp: 8656 case Instruction::FDiv: 8657 case Instruction::FMul: 8658 case Instruction::FNeg: 8659 case Instruction::FRem: 8660 case Instruction::FSub: 8661 case Instruction::ICmp: 8662 case Instruction::LShr: 8663 case Instruction::Mul: 8664 case Instruction::Or: 8665 case Instruction::Select: 8666 case Instruction::Shl: 8667 case Instruction::Sub: 8668 case Instruction::Xor: 8669 case Instruction::Freeze: 8670 SmallVector<VPValue *> NewOps(Operands); 8671 if (Instruction::isBinaryOp(I->getOpcode())) { 8672 // The legacy cost model uses SCEV to check if some of the operands are 8673 // constants. To match the legacy cost model's behavior, use SCEV to try 8674 // to replace operands with constants. 8675 ScalarEvolution &SE = *PSE.getSE(); 8676 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) { 8677 Value *V = Op->getUnderlyingValue(); 8678 if (isa<Constant>(V) || !SE.isSCEVable(V->getType())) 8679 return Op; 8680 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V)); 8681 if (!C) 8682 return Op; 8683 return Plan.getOrAddLiveIn(C->getValue()); 8684 }; 8685 // For Mul, the legacy cost model checks both operands. 
8686 if (I->getOpcode() == Instruction::Mul)
8687 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8688 // For other binops, the legacy cost model only checks the second operand.
8689 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8690 }
8691 return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8692 };
8693 }
8694
8695 VPHistogramRecipe *
8696 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8697 ArrayRef<VPValue *> Operands) {
8698 // FIXME: Support other operations.
8699 unsigned Opcode = HI->Update->getOpcode();
8700 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8701 "Histogram update operation must be an Add or Sub");
8702
8703 SmallVector<VPValue *, 3> HGramOps;
8704 // Bucket address.
8705 HGramOps.push_back(Operands[1]);
8706 // Increment value.
8707 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8708
8709 // In case of predicated execution (due to tail-folding, or conditional
8710 // execution, or both), pass the relevant mask.
8711 if (Legal->isMaskRequired(HI->Store))
8712 HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8713
8714 return new VPHistogramRecipe(Opcode,
8715 make_range(HGramOps.begin(), HGramOps.end()),
8716 HI->Store->getDebugLoc());
8717 }
8718
8719 void VPRecipeBuilder::fixHeaderPhis() {
8720 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8721 for (VPHeaderPHIRecipe *R : PhisToFix) {
8722 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8723 VPRecipeBase *IncR =
8724 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8725 R->addOperand(IncR->getVPSingleValue());
8726 }
8727 }
8728
8729 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8730 VFRange &Range) {
8731 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8732 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8733 Range);
8734
8735 bool IsPredicated = CM.isPredicatedInst(I);
8736
8737 // Even if the instruction is not marked as uniform, there are certain
8738 // intrinsic calls that can be effectively treated as such, so we check for
8739 // them here. Conservatively, we only do this for scalable vectors, since
8740 // for fixed-width VFs we can always fall back on full scalarization.
8741 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8742 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8743 case Intrinsic::assume:
8744 case Intrinsic::lifetime_start:
8745 case Intrinsic::lifetime_end:
8746 // For scalable vectors if one of the operands is variant then we still
8747 // want to mark as uniform, which will generate one instruction for just
8748 // the first lane of the vector. We can't scalarize the call in the same
8749 // way as for fixed-width vectors because we don't know how many lanes
8750 // there are.
8751 //
8752 // The reasons for doing it this way for scalable vectors are:
8753 // 1. For the assume intrinsic generating the instruction for the first
8754 // lane is still better than not generating any at all. For
8755 // example, the input may be a splat across all lanes.
8756 // 2. For the lifetime start/end intrinsics the pointer operand only
8757 // does anything useful when the input comes from a stack object,
8758 // which suggests it should always be uniform. For non-stack objects
8759 // the effect is to poison the object, which still allows us to
8760 // remove the call.
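// For illustration: an assume whose operand varies per lane is still emitted
// once, using the first lane's value, rather than being scalarized per lane
// (which is not possible when the lane count is not known at compile time).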
8761 IsUniform = true; 8762 break; 8763 default: 8764 break; 8765 } 8766 } 8767 VPValue *BlockInMask = nullptr; 8768 if (!IsPredicated) { 8769 // Finalize the recipe for Instr, first if it is not predicated. 8770 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8771 } else { 8772 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8773 // Instructions marked for predication are replicated and a mask operand is 8774 // added initially. Masked replicate recipes will later be placed under an 8775 // if-then construct to prevent side-effects. Generate recipes to compute 8776 // the block mask for this region. 8777 BlockInMask = getBlockInMask(I->getParent()); 8778 } 8779 8780 // Note that there is some custom logic to mark some intrinsics as uniform 8781 // manually above for scalable vectors, which this assert needs to account for 8782 // as well. 8783 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || 8784 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && 8785 "Should not predicate a uniform recipe"); 8786 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()), 8787 IsUniform, BlockInMask); 8788 return Recipe; 8789 } 8790 8791 /// Find all possible partial reductions in the loop and track all of those that 8792 /// are valid so recipes can be formed later. 8793 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { 8794 // Find all possible partial reductions. 8795 SmallVector<std::pair<PartialReductionChain, unsigned>, 1> 8796 PartialReductionChains; 8797 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) 8798 if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair = 8799 getScaledReduction(Phi, RdxDesc, Range)) 8800 PartialReductionChains.push_back(*Pair); 8801 8802 // A partial reduction is invalid if any of its extends are used by 8803 // something that isn't another partial reduction. This is because the 8804 // extends are intended to be lowered along with the reduction itself. 8805 8806 // Build up a set of partial reduction bin ops for efficient use checking. 8807 SmallSet<User *, 4> PartialReductionBinOps; 8808 for (const auto &[PartialRdx, _] : PartialReductionChains) 8809 PartialReductionBinOps.insert(PartialRdx.BinOp); 8810 8811 auto ExtendIsOnlyUsedByPartialReductions = 8812 [&PartialReductionBinOps](Instruction *Extend) { 8813 return all_of(Extend->users(), [&](const User *U) { 8814 return PartialReductionBinOps.contains(U); 8815 }); 8816 }; 8817 8818 // Check if each use of a chain's two extends is a partial reduction 8819 // and only add those that don't have non-partial reduction users. 8820 for (auto Pair : PartialReductionChains) { 8821 PartialReductionChain Chain = Pair.first; 8822 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && 8823 ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) 8824 ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair)); 8825 } 8826 } 8827 8828 std::optional<std::pair<PartialReductionChain, unsigned>> 8829 VPRecipeBuilder::getScaledReduction(PHINode *PHI, 8830 const RecurrenceDescriptor &Rdx, 8831 VFRange &Range) { 8832 // TODO: Allow scaling reductions when predicating. The select at 8833 // the end of the loop chooses between the phi value and most recent 8834 // reduction result, both of which have different VFs to the active lane 8835 // mask when scaling. 
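// Illustrative shape of the chain matched below (a dot-product style update):
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul = mul i32 %a.ext, %b.ext
//   %rdx.next = add i32 %rdx.phi, %mul ; loop-exit instruction of the reduction
// The returned scale factor is the ratio of the phi width to the narrow input
// width (here 32 / 8 = 4).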
8836 if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent())) 8837 return std::nullopt; 8838 8839 auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr()); 8840 if (!Update) 8841 return std::nullopt; 8842 8843 Value *Op = Update->getOperand(0); 8844 Value *PhiOp = Update->getOperand(1); 8845 if (Op == PHI) { 8846 Op = Update->getOperand(1); 8847 PhiOp = Update->getOperand(0); 8848 } 8849 if (PhiOp != PHI) 8850 return std::nullopt; 8851 8852 auto *BinOp = dyn_cast<BinaryOperator>(Op); 8853 if (!BinOp || !BinOp->hasOneUse()) 8854 return std::nullopt; 8855 8856 using namespace llvm::PatternMatch; 8857 Value *A, *B; 8858 if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || 8859 !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) 8860 return std::nullopt; 8861 8862 Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0)); 8863 Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1)); 8864 8865 TTI::PartialReductionExtendKind OpAExtend = 8866 TargetTransformInfo::getPartialReductionExtendKind(ExtA); 8867 TTI::PartialReductionExtendKind OpBExtend = 8868 TargetTransformInfo::getPartialReductionExtendKind(ExtB); 8869 8870 PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp); 8871 8872 unsigned TargetScaleFactor = 8873 PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( 8874 A->getType()->getPrimitiveSizeInBits()); 8875 8876 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8877 [&](ElementCount VF) { 8878 InstructionCost Cost = TTI->getPartialReductionCost( 8879 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(), 8880 VF, OpAExtend, OpBExtend, 8881 std::make_optional(BinOp->getOpcode())); 8882 return Cost.isValid(); 8883 }, 8884 Range)) 8885 return std::make_pair(Chain, TargetScaleFactor); 8886 8887 return std::nullopt; 8888 } 8889 8890 VPRecipeBase * 8891 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8892 ArrayRef<VPValue *> Operands, 8893 VFRange &Range, VPBasicBlock *VPBB) { 8894 // First, check for specific widening recipes that deal with inductions, Phi 8895 // nodes, calls and memory operations. 8896 VPRecipeBase *Recipe; 8897 if (auto *Phi = dyn_cast<PHINode>(Instr)) { 8898 if (Phi->getParent() != OrigLoop->getHeader()) 8899 return tryToBlend(Phi, Operands); 8900 8901 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8902 return Recipe; 8903 8904 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8905 assert((Legal->isReductionVariable(Phi) || 8906 Legal->isFixedOrderRecurrence(Phi)) && 8907 "can only widen reductions and fixed-order recurrences here"); 8908 VPValue *StartV = Operands[0]; 8909 if (Legal->isReductionVariable(Phi)) { 8910 const RecurrenceDescriptor &RdxDesc = 8911 Legal->getReductionVars().find(Phi)->second; 8912 assert(RdxDesc.getRecurrenceStartValue() == 8913 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8914 8915 // If the PHI is used by a partial reduction, set the scale factor. 8916 std::optional<std::pair<PartialReductionChain, unsigned>> Pair = 8917 getScaledReductionForInstr(RdxDesc.getLoopExitInstr()); 8918 unsigned ScaleFactor = Pair ? Pair->second : 1; 8919 PhiRecipe = new VPReductionPHIRecipe( 8920 Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), 8921 CM.useOrderedReductions(RdxDesc), ScaleFactor); 8922 } else { 8923 // TODO: Currently fixed-order recurrences are modeled as chains of 8924 // first-order recurrences. 
If there are no users of the intermediate 8925 // recurrences in the chain, the fixed order recurrence should be modeled 8926 // directly, enabling more efficient codegen. 8927 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8928 } 8929 8930 PhisToFix.push_back(PhiRecipe); 8931 return PhiRecipe; 8932 } 8933 8934 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( 8935 cast<TruncInst>(Instr), Operands, Range))) 8936 return Recipe; 8937 8938 // All widen recipes below deal only with VF > 1. 8939 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8940 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8941 return nullptr; 8942 8943 if (auto *CI = dyn_cast<CallInst>(Instr)) 8944 return tryToWidenCall(CI, Operands, Range); 8945 8946 if (StoreInst *SI = dyn_cast<StoreInst>(Instr)) 8947 if (auto HistInfo = Legal->getHistogramInfo(SI)) 8948 return tryToWidenHistogram(*HistInfo, Operands); 8949 8950 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8951 return tryToWidenMemory(Instr, Operands, Range); 8952 8953 if (getScaledReductionForInstr(Instr)) 8954 return tryToCreatePartialReduction(Instr, Operands); 8955 8956 if (!shouldWiden(Instr, Range)) 8957 return nullptr; 8958 8959 if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr)) 8960 return new VPWidenGEPRecipe(GEP, 8961 make_range(Operands.begin(), Operands.end())); 8962 8963 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8964 return new VPWidenSelectRecipe( 8965 *SI, make_range(Operands.begin(), Operands.end())); 8966 } 8967 8968 if (auto *CI = dyn_cast<CastInst>(Instr)) { 8969 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), 8970 *CI); 8971 } 8972 8973 return tryToWiden(Instr, Operands, VPBB); 8974 } 8975 8976 VPRecipeBase * 8977 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, 8978 ArrayRef<VPValue *> Operands) { 8979 assert(Operands.size() == 2 && 8980 "Unexpected number of operands for partial reduction"); 8981 8982 VPValue *BinOp = Operands[0]; 8983 VPValue *Phi = Operands[1]; 8984 if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe())) 8985 std::swap(BinOp, Phi); 8986 8987 return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, 8988 Reduction); 8989 } 8990 8991 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8992 ElementCount MaxVF) { 8993 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8994 8995 auto MaxVFTimes2 = MaxVF * 2; 8996 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8997 VFRange SubRange = {VF, MaxVFTimes2}; 8998 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { 8999 // Now optimize the initial VPlan. 9000 if (!Plan->hasVF(ElementCount::getFixed(1))) 9001 VPlanTransforms::truncateToMinimalBitwidths(*Plan, 9002 CM.getMinimalBitwidths()); 9003 VPlanTransforms::optimize(*Plan); 9004 // TODO: try to put it close to addActiveLaneMask(). 9005 // Discard the plan if it is not EVL-compatible 9006 if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength( 9007 *Plan, CM.getMaxSafeElements())) 9008 break; 9009 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 9010 VPlans.push_back(std::move(Plan)); 9011 } 9012 VF = SubRange.End; 9013 } 9014 } 9015 9016 // Add the necessary canonical IV and branch recipes required to control the 9017 // loop. 
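// Roughly, the recipes added below form (illustrative):
//   vector.body:
//     %index = CANONICAL-IV phi [ 0, vector.ph ], [ %index.next, vector.body ]
//     ...
//     %index.next = add (nuw?) %index, VF * UF
//     branch-on-count %index.next, vector-trip-count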
9018 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, 9019 DebugLoc DL) { 9020 Value *StartIdx = ConstantInt::get(IdxTy, 0); 9021 auto *StartV = Plan.getOrAddLiveIn(StartIdx); 9022 9023 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 9024 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 9025 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 9026 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 9027 Header->insert(CanonicalIVPHI, Header->begin()); 9028 9029 VPBuilder Builder(TopRegion->getExitingBasicBlock()); 9030 // Add a VPInstruction to increment the scalar canonical IV by VF * UF. 9031 auto *CanonicalIVIncrement = Builder.createOverflowingOp( 9032 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, 9033 "index.next"); 9034 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 9035 9036 // Add the BranchOnCount VPInstruction to the latch. 9037 Builder.createNaryOp(VPInstruction::BranchOnCount, 9038 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 9039 } 9040 9041 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the 9042 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute 9043 /// the end value of the induction. 9044 static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, 9045 VPBuilder &VectorPHBuilder, 9046 VPBuilder &ScalarPHBuilder, 9047 VPTypeAnalysis &TypeInfo, 9048 VPValue *VectorTC) { 9049 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV); 9050 // Truncated wide inductions resume from the last lane of their vector value 9051 // in the last vector iteration which is handled elsewhere. 9052 if (WideIntOrFp && WideIntOrFp->getTruncInst()) 9053 return nullptr; 9054 9055 VPValue *Start = WideIV->getStartValue(); 9056 VPValue *Step = WideIV->getStepValue(); 9057 const InductionDescriptor &ID = WideIV->getInductionDescriptor(); 9058 VPValue *EndValue = VectorTC; 9059 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) { 9060 EndValue = VectorPHBuilder.createDerivedIV( 9061 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), 9062 Start, VectorTC, Step); 9063 } 9064 9065 // EndValue is derived from the vector trip count (which has the same type as 9066 // the widest induction) and thus may be wider than the induction here. 9067 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV); 9068 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) { 9069 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue, 9070 ScalarTypeOfWideIV, 9071 WideIV->getDebugLoc()); 9072 } 9073 9074 auto *ResumePhiRecipe = 9075 ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start}, 9076 WideIV->getDebugLoc(), "bc.resume.val"); 9077 return ResumePhiRecipe; 9078 } 9079 9080 /// Create resume phis in the scalar preheader for first-order recurrences, 9081 /// reductions and inductions, and update the VPIRInstructions wrapping the 9082 /// original phis in the scalar header. 
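// For illustration: for a wide induction that matches the canonical IV (start
// 0, step 1), this creates in the scalar preheader roughly
//   bc.resume.val = resume-phi [ vector-trip-count, middle.block ], [ 0, bypass ]
// Non-canonical inductions first compute their end value with a derived-IV
// recipe, and reductions/recurrences get analogous resume phis below.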
9083 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { 9084 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); 9085 auto *ScalarPH = Plan.getScalarPreheader(); 9086 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor()); 9087 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); 9088 VPBuilder VectorPHBuilder( 9089 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor())); 9090 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9091 VPBuilder ScalarPHBuilder(ScalarPH); 9092 VPValue *OneVPV = Plan.getOrAddLiveIn( 9093 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); 9094 for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) { 9095 auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR); 9096 auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction()); 9097 if (!ScalarPhiI) 9098 break; 9099 9100 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI)); 9101 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) { 9102 if (VPValue *ResumePhi = addResumePhiRecipeForInduction( 9103 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, 9104 &Plan.getVectorTripCount())) { 9105 ScalarPhiIRI->addOperand(ResumePhi); 9106 continue; 9107 } 9108 // TODO: Also handle truncated inductions here. Computing end-values 9109 // separately should be done as VPlan-to-VPlan optimization, after 9110 // legalizing all resume values to use the last lane from the loop. 9111 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() && 9112 "should only skip truncated wide inductions"); 9113 continue; 9114 } 9115 9116 // The backedge value provides the value to resume coming out of a loop, 9117 // which for FORs is a vector whose last element needs to be extracted. The 9118 // start value provides the value if the loop is bypassed. 9119 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR); 9120 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue(); 9121 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && 9122 "Cannot handle loops with uncountable early exits"); 9123 if (IsFOR) 9124 ResumeFromVectorLoop = MiddleBuilder.createNaryOp( 9125 VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {}, 9126 "vector.recur.extract"); 9127 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx"; 9128 auto *ResumePhiR = ScalarPHBuilder.createNaryOp( 9129 VPInstruction::ResumePhi, 9130 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name); 9131 ScalarPhiIRI->addOperand(ResumePhiR); 9132 } 9133 } 9134 9135 /// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is 9136 /// either an untruncated wide induction, or if it increments a wide induction 9137 /// by its step. 9138 static bool isOptimizableIVOrUse(VPValue *VPV) { 9139 VPRecipeBase *Def = VPV->getDefiningRecipe(); 9140 if (!Def) 9141 return false; 9142 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def); 9143 if (WideIV) { 9144 // VPV itself is a wide induction, separately compute the end value for exit 9145 // users if it is not a truncated IV. 9146 return isa<VPWidenPointerInductionRecipe>(WideIV) || 9147 !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst(); 9148 } 9149 9150 // Check if VPV is an optimizable induction increment. 
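// That is, roughly, a recipe of the form %iv.next = add %wide.iv, %step (or
// the sub/fadd/fsub and getelementptr variants matched below).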
9151 if (Def->getNumOperands() != 2) 9152 return false; 9153 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0)); 9154 if (!WideIV) 9155 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1)); 9156 if (!WideIV) 9157 return false; 9158 9159 using namespace VPlanPatternMatch; 9160 auto &ID = WideIV->getInductionDescriptor(); 9161 9162 // Check if VPV increments the induction by the induction step. 9163 VPValue *IVStep = WideIV->getStepValue(); 9164 switch (ID.getInductionOpcode()) { 9165 case Instruction::Add: 9166 return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV), 9167 m_Specific(IVStep))); 9168 case Instruction::FAdd: 9169 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV), 9170 m_Specific(IVStep))); 9171 case Instruction::FSub: 9172 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV), 9173 m_Specific(IVStep))); 9174 case Instruction::Sub: { 9175 // IVStep will be the negated step of the subtraction. Check if Step == -1 * 9176 // IVStep. 9177 VPValue *Step; 9178 if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) || 9179 !Step->isLiveIn() || !IVStep->isLiveIn()) 9180 return false; 9181 auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); 9182 auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue()); 9183 return StepCI && IVStepCI && 9184 StepCI->getValue() == (-1 * IVStepCI->getValue()); 9185 } 9186 default: 9187 return ID.getKind() == InductionDescriptor::IK_PtrInduction && 9188 match(VPV, m_GetElementPtr(m_Specific(WideIV), 9189 m_Specific(WideIV->getStepValue()))); 9190 } 9191 llvm_unreachable("should have been covered by switch above"); 9192 } 9193 9194 // Collect VPIRInstructions for phis in the exit blocks that are modeled 9195 // in VPlan and add the exiting VPValue as operand. Some exiting values are not 9196 // modeled explicitly yet and won't be included. Those are un-truncated 9197 // VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction 9198 // increments. 9199 static SetVector<VPIRInstruction *> 9200 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, 9201 VPlan &Plan) { 9202 auto *MiddleVPBB = Plan.getMiddleBlock(); 9203 SetVector<VPIRInstruction *> ExitUsersToFix; 9204 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { 9205 for (VPRecipeBase &R : *ExitVPBB) { 9206 auto *ExitIRI = dyn_cast<VPIRInstruction>(&R); 9207 if (!ExitIRI) 9208 continue; 9209 auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction()); 9210 if (!ExitPhi) 9211 break; 9212 for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) { 9213 BasicBlock *ExitingBB = OrigLoop->getLoopLatch(); 9214 if (PredVPBB != MiddleVPBB) { 9215 SmallVector<BasicBlock *> ExitingBlocks; 9216 OrigLoop->getExitingBlocks(ExitingBlocks); 9217 assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks"); 9218 ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1] 9219 : ExitingBlocks[0]; 9220 } 9221 Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); 9222 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); 9223 // Exit values for inductions are computed and updated outside of VPlan 9224 // and independent of induction recipes. 9225 // TODO: Compute induction exit values in VPlan. 9226 if (isOptimizableIVOrUse(V) && 9227 ExitVPBB->getSinglePredecessor() == MiddleVPBB) 9228 continue; 9229 ExitUsersToFix.insert(ExitIRI); 9230 ExitIRI->addOperand(V); 9231 } 9232 } 9233 } 9234 return ExitUsersToFix; 9235 } 9236 9237 // Add exit values to \p Plan. 
Extracts are added for each entry in \p 9238 // ExitUsersToFix if needed and their operands are updated. Returns true if all 9239 // exit users can be handled, otherwise return false. 9240 static bool 9241 addUsersInExitBlocks(VPlan &Plan, 9242 const SetVector<VPIRInstruction *> &ExitUsersToFix) { 9243 if (ExitUsersToFix.empty()) 9244 return true; 9245 9246 auto *MiddleVPBB = Plan.getMiddleBlock(); 9247 VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9248 9249 // Introduce extract for exiting values and update the VPIRInstructions 9250 // modeling the corresponding LCSSA phis. 9251 for (VPIRInstruction *ExitIRI : ExitUsersToFix) { 9252 for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) { 9253 // Pass live-in values used by exit phis directly through to their users 9254 // in the exit block. 9255 if (Op->isLiveIn()) 9256 continue; 9257 9258 // Currently only live-ins can be used by exit values from blocks not 9259 // exiting via the vector latch through to the middle block. 9260 if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB) 9261 return false; 9262 9263 LLVMContext &Ctx = ExitIRI->getInstruction().getContext(); 9264 VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd, 9265 {Op, Plan.getOrAddLiveIn(ConstantInt::get( 9266 IntegerType::get(Ctx, 32), 1))}); 9267 ExitIRI->setOperand(Idx, Ext); 9268 } 9269 } 9270 return true; 9271 } 9272 9273 /// Handle users in the exit block for first order reductions in the original 9274 /// exit block. The penultimate value of recurrences is fed to their LCSSA phi 9275 /// users in the original exit block using the VPIRInstruction wrapping to the 9276 /// LCSSA phi. 9277 static void addExitUsersForFirstOrderRecurrences( 9278 VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) { 9279 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); 9280 auto *ScalarPHVPBB = Plan.getScalarPreheader(); 9281 auto *MiddleVPBB = Plan.getMiddleBlock(); 9282 VPBuilder ScalarPHBuilder(ScalarPHVPBB); 9283 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9284 VPValue *TwoVPV = Plan.getOrAddLiveIn( 9285 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2)); 9286 9287 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) { 9288 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi); 9289 if (!FOR) 9290 continue; 9291 9292 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && 9293 "Cannot handle loops with uncountable early exits"); 9294 9295 // This is the second phase of vectorizing first-order recurrences, creating 9296 // extract for users outside the loop. An overview of the transformation is 9297 // described below. Suppose we have the following loop with some use after 9298 // the loop of the last a[i-1], 9299 // 9300 // for (int i = 0; i < n; ++i) { 9301 // t = a[i - 1]; 9302 // b[i] = a[i] - t; 9303 // } 9304 // use t; 9305 // 9306 // There is a first-order recurrence on "a". For this loop, the shorthand 9307 // scalar IR looks like: 9308 // 9309 // scalar.ph: 9310 // s.init = a[-1] 9311 // br scalar.body 9312 // 9313 // scalar.body: 9314 // i = phi [0, scalar.ph], [i+1, scalar.body] 9315 // s1 = phi [s.init, scalar.ph], [s2, scalar.body] 9316 // s2 = a[i] 9317 // b[i] = s2 - s1 9318 // br cond, scalar.body, exit.block 9319 // 9320 // exit.block: 9321 // use = lcssa.phi [s1, scalar.body] 9322 // 9323 // In this example, s1 is a recurrence because it's value depends on the 9324 // previous iteration. 
In the first phase of vectorization, we created a 9325 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts 9326 // for users in the scalar preheader and exit block. 9327 // 9328 // vector.ph: 9329 // v_init = vector(..., ..., ..., a[-1]) 9330 // br vector.body 9331 // 9332 // vector.body 9333 // i = phi [0, vector.ph], [i+4, vector.body] 9334 // v1 = phi [v_init, vector.ph], [v2, vector.body] 9335 // v2 = a[i, i+1, i+2, i+3] 9336 // b[i] = v2 - v1 9337 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) 9338 // b[i, i+1, i+2, i+3] = v2 - v1 9339 // br cond, vector.body, middle.block 9340 // 9341 // middle.block: 9342 // vector.recur.extract.for.phi = v2(2) 9343 // vector.recur.extract = v2(3) 9344 // br cond, scalar.ph, exit.block 9345 // 9346 // scalar.ph: 9347 // scalar.recur.init = phi [vector.recur.extract, middle.block], 9348 // [s.init, otherwise] 9349 // br scalar.body 9350 // 9351 // scalar.body: 9352 // i = phi [0, scalar.ph], [i+1, scalar.body] 9353 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body] 9354 // s2 = a[i] 9355 // b[i] = s2 - s1 9356 // br cond, scalar.body, exit.block 9357 // 9358 // exit.block: 9359 // lo = lcssa.phi [s1, scalar.body], 9360 // [vector.recur.extract.for.phi, middle.block] 9361 // 9362 // Now update VPIRInstructions modeling LCSSA phis in the exit block. 9363 // Extract the penultimate value of the recurrence and use it as operand for 9364 // the VPIRInstruction modeling the phi. 9365 for (VPIRInstruction *ExitIRI : ExitUsersToFix) { 9366 if (ExitIRI->getOperand(0) != FOR) 9367 continue; 9368 VPValue *PenultimateElement = MiddleBuilder.createNaryOp( 9369 VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, 9370 "vector.recur.extract.for.phi"); 9371 ExitIRI->setOperand(0, PenultimateElement); 9372 ExitUsersToFix.remove(ExitIRI); 9373 } 9374 } 9375 } 9376 9377 VPlanPtr 9378 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { 9379 9380 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9381 9382 // --------------------------------------------------------------------------- 9383 // Build initial VPlan: Scan the body of the loop in a topological order to 9384 // visit each basic block after having visited its predecessor basic blocks. 9385 // --------------------------------------------------------------------------- 9386 9387 // Create initial VPlan skeleton, having a basic block for the pre-header 9388 // which contains SCEV expansions that need to happen before the CFG is 9389 // modified; a basic block for the vector pre-header, followed by a region for 9390 // the vector loop, followed by the middle basic block. The skeleton vector 9391 // loop region contains a header and latch basic blocks. 9392 9393 bool RequiresScalarEpilogueCheck = 9394 LoopVectorizationPlanner::getDecisionAndClampRange( 9395 [this](ElementCount VF) { 9396 return !CM.requiresScalarEpilogue(VF.isVector()); 9397 }, 9398 Range); 9399 VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), 9400 PSE, RequiresScalarEpilogueCheck, 9401 CM.foldTailByMasking(), OrigLoop); 9402 9403 // Don't use getDecisionAndClampRange here, because we don't know the UF 9404 // so this function is better to be conservative, rather than to split 9405 // it up into different VPlans. 9406 // TODO: Consider using getDecisionAndClampRange here to split up VPlans. 
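// First check, across all VFs in Range, whether the canonical IV update may
// overflow; the result feeds both the tail-folding style and whether the IV
// increment can be marked NUW below.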
9407 bool IVUpdateMayOverflow = false;
9408 for (ElementCount VF : Range)
9409 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9410
9411 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9412 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9413 // Use NUW for the induction increment if we proved that it won't overflow in
9414 // the vector loop or when not folding the tail. In the latter case, we know
9415 // that the canonical induction increment will not overflow as the vector trip
9416 // count is >= increment and a multiple of the increment.
9417 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9418 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9419
9420 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9421 Builder);
9422
9423 // ---------------------------------------------------------------------------
9424 // Pre-construction: record ingredients whose recipes we'll need to further
9425 // process after constructing the initial VPlan.
9426 // ---------------------------------------------------------------------------
9427
9428 // For each interleave group which is relevant for this (possibly trimmed)
9429 // Range, add it to the set of groups to be later applied to the VPlan and add
9430 // placeholders for its members' Recipes which we'll be replacing with a
9431 // single VPInterleaveRecipe.
9432 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9433 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9434 bool Result = (VF.isVector() && // Query is illegal for VF == 1
9435 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9436 LoopVectorizationCostModel::CM_Interleave);
9437 // For scalable vectors, the only interleave factor currently supported
9438 // is 2 since we require the (de)interleave2 intrinsics instead of
9439 // shufflevectors.
9440 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9441 "Unsupported interleave factor for scalable vectors");
9442 return Result;
9443 };
9444 if (!getDecisionAndClampRange(ApplyIG, Range))
9445 continue;
9446 InterleaveGroups.insert(IG);
9447 }
9448
9449 // ---------------------------------------------------------------------------
9450 // Construct recipes for the instructions in the loop
9451 // ---------------------------------------------------------------------------
9452
9453 // Scan the body of the loop in a topological order to visit each basic block
9454 // after having visited its predecessor basic blocks.
9455 LoopBlocksDFS DFS(OrigLoop);
9456 DFS.perform(LI);
9457
9458 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9459 VPBasicBlock *VPBB = HeaderVPBB;
9460 BasicBlock *HeaderBB = OrigLoop->getHeader();
9461 bool NeedsMasks =
9462 CM.foldTailByMasking() ||
9463 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9464 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9465 return Legal->blockNeedsPredication(BB) || NeedsBlends;
9466 });
9467
9468 RecipeBuilder.collectScaledReductions(Range);
9469
9470 auto *MiddleVPBB = Plan->getMiddleBlock();
9471 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9472 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9473 // Relevant instructions from basic block BB will be grouped into VPRecipe
9474 // ingredients and fill a new VPBasicBlock.
9475 if (VPBB != HeaderVPBB) 9476 VPBB->setName(BB->getName()); 9477 Builder.setInsertPoint(VPBB); 9478 9479 if (VPBB == HeaderVPBB) 9480 RecipeBuilder.createHeaderMask(); 9481 else if (NeedsMasks) 9482 RecipeBuilder.createBlockInMask(BB); 9483 9484 // Introduce each ingredient into VPlan. 9485 // TODO: Model and preserve debug intrinsics in VPlan. 9486 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { 9487 Instruction *Instr = &I; 9488 SmallVector<VPValue *, 4> Operands; 9489 auto *Phi = dyn_cast<PHINode>(Instr); 9490 if (Phi && Phi->getParent() == HeaderBB) { 9491 Operands.push_back(Plan->getOrAddLiveIn( 9492 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9493 } else { 9494 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands()); 9495 Operands = {OpRange.begin(), OpRange.end()}; 9496 } 9497 9498 // The stores with invariant address inside the loop will be deleted, and 9499 // in the exit block, a uniform store recipe will be created for the final 9500 // invariant store of the reduction. 9501 StoreInst *SI; 9502 if ((SI = dyn_cast<StoreInst>(&I)) && 9503 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { 9504 // Only create recipe for the final invariant store of the reduction. 9505 if (!Legal->isInvariantStoreOfReduction(SI)) 9506 continue; 9507 auto *Recipe = new VPReplicateRecipe( 9508 SI, RecipeBuilder.mapToVPValues(Instr->operands()), 9509 true /* IsUniform */); 9510 Recipe->insertBefore(*MiddleVPBB, MBIP); 9511 continue; 9512 } 9513 9514 VPRecipeBase *Recipe = 9515 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB); 9516 if (!Recipe) 9517 Recipe = RecipeBuilder.handleReplication(Instr, Range); 9518 9519 RecipeBuilder.setRecipe(Instr, Recipe); 9520 if (isa<VPHeaderPHIRecipe>(Recipe)) { 9521 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In 9522 // the following cases, VPHeaderPHIRecipes may be created after non-phi 9523 // recipes and need to be moved to the phi section of HeaderVPBB: 9524 // * tail-folding (non-phi recipes computing the header mask are 9525 // introduced earlier than regular header phi recipes, and should appear 9526 // after them) 9527 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 9528 9529 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || 9530 CM.foldTailByMasking() || isa<TruncInst>(Instr)) && 9531 "unexpected recipe needs moving"); 9532 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9533 } else 9534 VPBB->appendRecipe(Recipe); 9535 } 9536 9537 VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB); 9538 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9539 } 9540 9541 // After here, VPBB should not be used. 9542 VPBB = nullptr; 9543 9544 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 9545 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 9546 "entry block must be set to a VPRegionBlock having a non-empty entry " 9547 "VPBasicBlock"); 9548 RecipeBuilder.fixHeaderPhis(); 9549 9550 // Update wide induction increments to use the same step as the corresponding 9551 // wide induction. This enables detecting induction increments directly in 9552 // VPlan and removes redundant splats. 
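// For illustration: an increment like 'i.next = add i, 1' may initially refer
// to its own live-in constant 1; rewiring its second operand to the wide IV's
// step value makes both uses share the same VPValue.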
9553 for (const auto &[Phi, ID] : Legal->getInductionVars()) { 9554 auto *IVInc = cast<Instruction>( 9555 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 9556 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add) 9557 continue; 9558 VPWidenInductionRecipe *WideIV = 9559 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi)); 9560 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc); 9561 R->setOperand(1, WideIV->getStepValue()); 9562 } 9563 9564 if (auto *UncountableExitingBlock = 9565 Legal->getUncountableEarlyExitingBlock()) { 9566 VPlanTransforms::handleUncountableEarlyExit( 9567 *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); 9568 } 9569 addScalarResumePhis(RecipeBuilder, *Plan); 9570 SetVector<VPIRInstruction *> ExitUsersToFix = 9571 collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan); 9572 addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); 9573 if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) { 9574 reportVectorizationFailure( 9575 "Some exit values in loop with uncountable exit not supported yet", 9576 "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop); 9577 return nullptr; 9578 } 9579 9580 // --------------------------------------------------------------------------- 9581 // Transform initial VPlan: Apply previously taken decisions, in order, to 9582 // bring the VPlan to its final state. 9583 // --------------------------------------------------------------------------- 9584 9585 // Adjust the recipes for any inloop reductions. 9586 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); 9587 9588 // Interleave memory: for each Interleave Group we marked earlier as relevant 9589 // for this VPlan, replace the Recipes widening its memory instructions with a 9590 // single VPInterleaveRecipe at its insertion point. 9591 VPlanTransforms::createInterleaveGroups( 9592 *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); 9593 9594 for (ElementCount VF : Range) 9595 Plan->addVF(VF); 9596 Plan->setName("Initial VPlan"); 9597 9598 // Replace VPValues for known constant strides guaranteed by predicate scalar 9599 // evolution. 9600 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { 9601 auto *R = cast<VPRecipeBase>(&U); 9602 return R->getParent()->getParent() || 9603 R->getParent() == 9604 Plan->getVectorLoopRegion()->getSinglePredecessor(); 9605 }; 9606 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 9607 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 9608 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 9609 // Only handle constant strides for now. 9610 if (!ScevStride) 9611 continue; 9612 9613 auto *CI = Plan->getOrAddLiveIn( 9614 ConstantInt::get(Stride->getType(), ScevStride->getAPInt())); 9615 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV)) 9616 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); 9617 9618 // The versioned value may not be used in the loop directly but through a 9619 // sext/zext. Add new live-ins in those cases. 9620 for (Value *U : StrideV->users()) { 9621 if (!isa<SExtInst, ZExtInst>(U)) 9622 continue; 9623 VPValue *StrideVPV = Plan->getLiveIn(U); 9624 if (!StrideVPV) 9625 continue; 9626 unsigned BW = U->getType()->getScalarSizeInBits(); 9627 APInt C = isa<SExtInst>(U) ? 
ScevStride->getAPInt().sext(BW) 9628 : ScevStride->getAPInt().zext(BW); 9629 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C)); 9630 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); 9631 } 9632 } 9633 9634 VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) { 9635 return Legal->blockNeedsPredication(BB); 9636 }); 9637 9638 // Sink users of fixed-order recurrence past the recipe defining the previous 9639 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 9640 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) 9641 return nullptr; 9642 9643 if (useActiveLaneMask(Style)) { 9644 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once 9645 // TailFoldingStyle is visible there. 9646 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); 9647 bool WithoutRuntimeCheck = 9648 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 9649 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, 9650 WithoutRuntimeCheck); 9651 } 9652 9653 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 9654 return Plan; 9655 } 9656 9657 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9658 // Outer loop handling: They may require CFG and instruction level 9659 // transformations before even evaluating whether vectorization is profitable. 9660 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9661 // the vectorization pipeline. 9662 assert(!OrigLoop->isInnermost()); 9663 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9664 9665 // Create new empty VPlan 9666 auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE, 9667 true, false, OrigLoop); 9668 9669 // Build hierarchical CFG 9670 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9671 HCFGBuilder.buildHierarchicalCFG(); 9672 9673 for (ElementCount VF : Range) 9674 Plan->addVF(VF); 9675 9676 VPlanTransforms::VPInstructionsToVPRecipes( 9677 Plan, 9678 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9679 *PSE.getSE(), *TLI); 9680 9681 // Remove the existing terminator of the exiting block of the top-most region. 9682 // A BranchOnCount will be added instead when adding the canonical IV recipes. 9683 auto *Term = 9684 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); 9685 Term->eraseFromParent(); 9686 9687 // Tail folding is not supported for outer loops, so the induction increment 9688 // is guaranteed to not wrap. 9689 bool HasNUW = true; 9690 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, 9691 DebugLoc()); 9692 9693 // Collect mapping of IR header phis to header phi recipes, to be used in 9694 // addScalarResumePhis. 9695 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, 9696 Builder); 9697 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9698 if (isa<VPCanonicalIVPHIRecipe>(&R)) 9699 continue; 9700 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R); 9701 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR); 9702 } 9703 addScalarResumePhis(RecipeBuilder, *Plan); 9704 9705 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 9706 return Plan; 9707 } 9708 9709 // Adjust the recipes for reductions. For in-loop reductions the chain of 9710 // instructions leading from the loop exit instr to the phi need to be converted 9711 // to reductions, with one operand being vector and the other being the scalar 9712 // reduction chain. 
For other reductions, a select is introduced between the phi 9713 // and users outside the vector region when folding the tail. 9714 // 9715 // A ComputeReductionResult recipe is added to the middle block, also for 9716 // in-loop reductions which compute their result in-loop, because generating 9717 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 9718 // 9719 // Adjust AnyOf reductions; replace the reduction phi for the selected value 9720 // with a boolean reduction phi node to check if the condition is true in any 9721 // iteration. The final value is selected by the final ComputeReductionResult. 9722 void LoopVectorizationPlanner::adjustRecipesForReductions( 9723 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { 9724 using namespace VPlanPatternMatch; 9725 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); 9726 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); 9727 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock(); 9728 SmallVector<VPRecipeBase *> ToDelete; 9729 9730 for (VPRecipeBase &R : Header->phis()) { 9731 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9732 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) 9733 continue; 9734 9735 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9736 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9737 assert( 9738 !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 9739 !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) && 9740 "AnyOf and FindLast reductions are not allowed for in-loop reductions"); 9741 9742 // Collect the chain of "link" recipes for the reduction starting at PhiR. 9743 SetVector<VPSingleDefRecipe *> Worklist; 9744 Worklist.insert(PhiR); 9745 for (unsigned I = 0; I != Worklist.size(); ++I) { 9746 VPSingleDefRecipe *Cur = Worklist[I]; 9747 for (VPUser *U : Cur->users()) { 9748 auto *UserRecipe = cast<VPSingleDefRecipe>(U); 9749 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { 9750 assert((UserRecipe->getParent() == MiddleVPBB || 9751 UserRecipe->getParent() == Plan->getScalarPreheader()) && 9752 "U must be either in the loop region, the middle block or the " 9753 "scalar preheader."); 9754 continue; 9755 } 9756 Worklist.insert(UserRecipe); 9757 } 9758 } 9759 9760 // Visit operation "Links" along the reduction chain top-down starting from 9761 // the phi until LoopExitValue. We keep track of the previous item 9762 // (PreviousLink) to tell which of the two operands of a Link will remain 9763 // scalar and which will be reduced. For minmax by select(cmp), Link will be 9764 // the select instructions. Blend recipes of in-loop reduction phi's will 9765 // get folded to their non-phi operand, as the reduction recipe handles the 9766 // condition directly. 9767 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. 9768 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { 9769 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); 9770 9771 // Index of the first operand which holds a non-mask vector operand. 9772 unsigned IndexOfFirstOperand; 9773 // Recognize a call to the llvm.fmuladd intrinsic. 
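// Illustrative example (hypothetical IR): for an in-loop reduction such as
// `%acc.next = call float @llvm.fmuladd.f32(float %a, float %b, float %acc)`,
// the code below splits the call into an FMul VPInstruction on %a and %b and
// feeds that product to the fadd reduction recipe as its vector operand.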
9774 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9775 VPValue *VecOp; 9776 VPBasicBlock *LinkVPBB = CurrentLink->getParent(); 9777 if (IsFMulAdd) { 9778 assert( 9779 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && 9780 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9781 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || 9782 isa<VPWidenIntrinsicRecipe>(CurrentLink)) && 9783 CurrentLink->getOperand(2) == PreviousLink && 9784 "expected a call where the previous link is the added operand"); 9785 9786 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9787 // need to create an fmul recipe (multiplying the first two operands of 9788 // the fmuladd together) to use as the vector operand for the fadd 9789 // reduction. 9790 VPInstruction *FMulRecipe = new VPInstruction( 9791 Instruction::FMul, 9792 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, 9793 CurrentLinkI->getFastMathFlags()); 9794 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); 9795 VecOp = FMulRecipe; 9796 } else { 9797 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink); 9798 if (PhiR->isInLoop() && Blend) { 9799 assert(Blend->getNumIncomingValues() == 2 && 9800 "Blend must have 2 incoming values"); 9801 if (Blend->getIncomingValue(0) == PhiR) 9802 Blend->replaceAllUsesWith(Blend->getIncomingValue(1)); 9803 else { 9804 assert(Blend->getIncomingValue(1) == PhiR && 9805 "PhiR must be an operand of the blend"); 9806 Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); 9807 } 9808 continue; 9809 } 9810 9811 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9812 if (isa<VPWidenRecipe>(CurrentLink)) { 9813 assert(isa<CmpInst>(CurrentLinkI) && 9814 "need to have the compare of the select"); 9815 continue; 9816 } 9817 assert(isa<VPWidenSelectRecipe>(CurrentLink) && 9818 "must be a select recipe"); 9819 IndexOfFirstOperand = 1; 9820 } else { 9821 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && 9822 "Expected to replace a VPWidenSC"); 9823 IndexOfFirstOperand = 0; 9824 } 9825 // Note that for non-commutable operands (cmp-selects), the semantics of 9826 // the cmp-select are captured in the recurrence kind. 9827 unsigned VecOpId = 9828 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink 9829 ? IndexOfFirstOperand + 1 9830 : IndexOfFirstOperand; 9831 VecOp = CurrentLink->getOperand(VecOpId); 9832 assert(VecOp != PreviousLink && 9833 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - 9834 (VecOpId - IndexOfFirstOperand)) == 9835 PreviousLink && 9836 "PreviousLink must be the operand other than VecOp"); 9837 } 9838 9839 BasicBlock *BB = CurrentLinkI->getParent(); 9840 VPValue *CondOp = nullptr; 9841 if (CM.blockNeedsPredicationForAnyReason(BB)) 9842 CondOp = RecipeBuilder.getBlockInMask(BB); 9843 9844 auto *RedRecipe = new VPReductionRecipe( 9845 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp, 9846 CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc()); 9847 // Append the recipe to the end of the VPBasicBlock because we need to 9848 // ensure that it comes after all of its inputs, including CondOp. 9849 // Delete CurrentLink as it will be invalid if its operand is replaced 9850 // with a reduction defined at the bottom of the block in the next link.
9851 LinkVPBB->appendRecipe(RedRecipe); 9852 CurrentLink->replaceAllUsesWith(RedRecipe); 9853 ToDelete.push_back(CurrentLink); 9854 PreviousLink = RedRecipe; 9855 } 9856 } 9857 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); 9858 Builder.setInsertPoint(&*LatchVPBB->begin()); 9859 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); 9860 for (VPRecipeBase &R : 9861 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9862 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9863 if (!PhiR) 9864 continue; 9865 9866 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9867 // If tail is folded by masking, introduce selects between the phi 9868 // and the users outside the vector region of each reduction, at the 9869 // beginning of the dedicated latch block. 9870 auto *OrigExitingVPV = PhiR->getBackedgeValue(); 9871 auto *NewExitingVPV = PhiR->getBackedgeValue(); 9872 if (!PhiR->isInLoop() && CM.foldTailByMasking()) { 9873 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader()); 9874 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && 9875 "reduction recipe must be defined before latch"); 9876 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); 9877 std::optional<FastMathFlags> FMFs = 9878 PhiTy->isFloatingPointTy() 9879 ? std::make_optional(RdxDesc.getFastMathFlags()) 9880 : std::nullopt; 9881 NewExitingVPV = 9882 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); 9883 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { 9884 return isa<VPInstruction>(&U) && 9885 cast<VPInstruction>(&U)->getOpcode() == 9886 VPInstruction::ComputeReductionResult; 9887 }); 9888 if (CM.usePredicatedReductionSelect( 9889 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy)) 9890 PhiR->setOperand(1, NewExitingVPV); 9891 } 9892 9893 // If the vector reduction can be performed in a smaller type, we truncate 9894 // then extend the loop exit value to enable InstCombine to evaluate the 9895 // entire expression in the smaller type. 9896 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); 9897 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() && 9898 !RecurrenceDescriptor::isAnyOfRecurrenceKind( 9899 RdxDesc.getRecurrenceKind())) { 9900 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 9901 Type *RdxTy = RdxDesc.getRecurrenceType(); 9902 auto *Trunc = 9903 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); 9904 auto *Extnd = 9905 RdxDesc.isSigned() 9906 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) 9907 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); 9908 9909 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); 9910 Extnd->insertAfter(Trunc); 9911 if (PhiR->getOperand(1) == NewExitingVPV) 9912 PhiR->setOperand(1, Extnd->getVPSingleValue()); 9913 NewExitingVPV = Extnd; 9914 } 9915 9916 // We want code in the middle block to appear to execute on the location of 9917 // the scalar loop's latch terminator because: (a) it is all compiler 9918 // generated, (b) these instructions are always executed after evaluating 9919 // the latch conditional branch, and (c) other passes may add new 9920 // predecessors which terminate on this line. This is the easiest way to 9921 // ensure we don't accidentally cause an extra step back into the loop while 9922 // debugging. 
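// Illustrative note (sketch only): for e.g. an integer add reduction, the
// ComputeReductionResult created below is what later expands to a horizontal
// reduction (such as llvm.vector.reduce.add) of the final vector value in the
// middle block.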
9923 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); 9924 9925 // TODO: At the moment ComputeReductionResult also drives creation of the 9926 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here 9927 // even for in-loop reductions, until the reduction resume value handling is 9928 // also modeled in VPlan. 9929 auto *FinalReductionResult = new VPInstruction( 9930 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); 9931 // Update all users outside the vector region. 9932 OrigExitingVPV->replaceUsesWithIf( 9933 FinalReductionResult, [](VPUser &User, unsigned) { 9934 auto *Parent = cast<VPRecipeBase>(&User)->getParent(); 9935 return Parent && !Parent->getParent(); 9936 }); 9937 FinalReductionResult->insertBefore(*MiddleVPBB, IP); 9938 9939 // Adjust AnyOf reductions; replace the reduction phi for the selected value 9940 // with a boolean reduction phi node to check if the condition is true in 9941 // any iteration. The final value is selected by the final 9942 // ComputeReductionResult. 9943 if (RecurrenceDescriptor::isAnyOfRecurrenceKind( 9944 RdxDesc.getRecurrenceKind())) { 9945 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) { 9946 return isa<VPWidenSelectRecipe>(U) || 9947 (isa<VPReplicateRecipe>(U) && 9948 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() == 9949 Instruction::Select); 9950 })); 9951 VPValue *Cmp = Select->getOperand(0); 9952 // If the compare is checking the reduction PHI node, adjust it to check 9953 // the start value. 9954 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) { 9955 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I) 9956 if (CmpR->getOperand(I) == PhiR) 9957 CmpR->setOperand(I, PhiR->getStartValue()); 9958 } 9959 VPBuilder::InsertPointGuard Guard(Builder); 9960 Builder.setInsertPoint(Select); 9961 9962 // If the true value of the select is the reduction phi, the new value is 9963 // selected if the negated condition is true in any iteration. 9964 if (Select->getOperand(1) == PhiR) 9965 Cmp = Builder.createNot(Cmp); 9966 VPValue *Or = Builder.createOr(PhiR, Cmp); 9967 Select->getVPSingleValue()->replaceAllUsesWith(Or); 9968 // Delete Select now that it has invalid types. 9969 ToDelete.push_back(Select); 9970 9971 // Convert the reduction phi to operate on bools. 9972 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( 9973 OrigLoop->getHeader()->getContext()))); 9974 continue; 9975 } 9976 9977 if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( 9978 RdxDesc.getRecurrenceKind())) { 9979 // Adjust the start value for FindLastIV recurrences to use the sentinel 9980 // value after generating the ResumePhi recipe, which uses the original 9981 // start value. 9982 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue())); 9983 } 9984 } 9985 9986 VPlanTransforms::clearReductionWrapFlags(*Plan); 9987 for (VPRecipeBase *R : ToDelete) 9988 R->eraseFromParent(); 9989 } 9990 9991 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9992 assert(!State.Lane && "VPDerivedIVRecipe being replicated."); 9993 9994 // Fast-math-flags propagate from the original induction instruction. 
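// Conceptual sketch: for an integer induction the value computed below is
// roughly StartValue + Index * Step (emitTransformedIndex also handles
// pointer and floating-point inductions); only lane 0 of the step and index
// operands is read, and a single scalar result is produced.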
9995 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9996 if (FPBinOp) 9997 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); 9998 9999 Value *Step = State.get(getStepValue(), VPLane(0)); 10000 Value *Index = State.get(getOperand(1), VPLane(0)); 10001 Value *DerivedIV = emitTransformedIndex( 10002 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind, 10003 cast_if_present<BinaryOperator>(FPBinOp)); 10004 DerivedIV->setName(Name); 10005 // If index is the vector trip count, the concrete value will only be set in 10006 // prepareToExecute, leading to missed simplifications, e.g. if it is 0. 10007 // TODO: Remove the special case for the vector trip count once it is computed 10008 // in VPlan and can be used during VPlan simplification. 10009 assert((DerivedIV != Index || 10010 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && 10011 "IV didn't need transforming?"); 10012 State.set(this, DerivedIV, VPLane(0)); 10013 } 10014 10015 void VPReplicateRecipe::execute(VPTransformState &State) { 10016 Instruction *UI = getUnderlyingInstr(); 10017 if (State.Lane) { // Generate a single instance. 10018 assert((State.VF.isScalar() || !isUniform()) && 10019 "uniform recipe shouldn't be predicated"); 10020 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 10021 State.ILV->scalarizeInstruction(UI, this, *State.Lane, State); 10022 // Insert scalar instance packing it into a vector. 10023 if (State.VF.isVector() && shouldPack()) { 10024 // If we're constructing lane 0, initialize to start from poison. 10025 if (State.Lane->isFirstLane()) { 10026 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 10027 Value *Poison = PoisonValue::get( 10028 VectorType::get(UI->getType(), State.VF)); 10029 State.set(this, Poison); 10030 } 10031 State.packScalarIntoVectorValue(this, *State.Lane); 10032 } 10033 return; 10034 } 10035 10036 if (IsUniform) { 10037 // Uniform within VL means we need to generate lane 0. 10038 State.ILV->scalarizeInstruction(UI, this, VPLane(0), State); 10039 return; 10040 } 10041 10042 // A store of a loop varying value to a uniform address only needs the last 10043 // copy of the store. 10044 if (isa<StoreInst>(UI) && 10045 vputils::isUniformAfterVectorization(getOperand(1))) { 10046 auto Lane = VPLane::getLastLaneForVF(State.VF); 10047 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State); 10048 return; 10049 } 10050 10051 // Generate scalar instances for all VF lanes. 10052 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 10053 const unsigned EndLane = State.VF.getKnownMinValue(); 10054 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 10055 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State); 10056 } 10057 10058 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10059 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10060 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10061 // for predication. 10062 static ScalarEpilogueLowering getScalarEpilogueLowering( 10063 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10064 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10065 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 10066 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10067 // don't look at hints or options, and don't request a scalar epilogue. 
10068 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10069 // LoopAccessInfo (due to code dependency and not being able to reliably get 10070 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10071 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10072 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10073 // back to the old way and vectorize with versioning when forced. See D81345.) 10074 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10075 PGSOQueryType::IRPass) && 10076 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10077 return CM_ScalarEpilogueNotAllowedOptSize; 10078 10079 // 2) If set, obey the directives 10080 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10081 switch (PreferPredicateOverEpilogue) { 10082 case PreferPredicateTy::ScalarEpilogue: 10083 return CM_ScalarEpilogueAllowed; 10084 case PreferPredicateTy::PredicateElseScalarEpilogue: 10085 return CM_ScalarEpilogueNotNeededUsePredicate; 10086 case PreferPredicateTy::PredicateOrDontVectorize: 10087 return CM_ScalarEpilogueNotAllowedUsePredicate; 10088 }; 10089 } 10090 10091 // 3) If set, obey the hints 10092 switch (Hints.getPredicate()) { 10093 case LoopVectorizeHints::FK_Enabled: 10094 return CM_ScalarEpilogueNotNeededUsePredicate; 10095 case LoopVectorizeHints::FK_Disabled: 10096 return CM_ScalarEpilogueAllowed; 10097 }; 10098 10099 // 4) if the TTI hook indicates this is profitable, request predication. 10100 TailFoldingInfo TFI(TLI, &LVL, IAI); 10101 if (TTI->preferPredicateOverEpilogue(&TFI)) 10102 return CM_ScalarEpilogueNotNeededUsePredicate; 10103 10104 return CM_ScalarEpilogueAllowed; 10105 } 10106 10107 // Process the loop in the VPlan-native vectorization path. This path builds 10108 // VPlan upfront in the vectorization pipeline, which allows to apply 10109 // VPlan-to-VPlan transformations from the very beginning without modifying the 10110 // input LLVM IR. 10111 static bool processLoopInVPlanNativePath( 10112 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10113 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10114 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10115 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10116 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10117 LoopVectorizationRequirements &Requirements) { 10118 10119 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10120 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10121 return false; 10122 } 10123 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10124 Function *F = L->getHeader()->getParent(); 10125 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10126 10127 ScalarEpilogueLowering SEL = 10128 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 10129 10130 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10131 &Hints, IAI); 10132 // Use the planner for outer loop vectorization. 10133 // TODO: CM is not used at this point inside the planner. Turn CM into an 10134 // optional argument if we don't need it in the future. 10135 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, 10136 ORE); 10137 10138 // Get user vectorization factor. 10139 ElementCount UserVF = Hints.getWidth(); 10140 10141 CM.collectElementTypesForWidening(); 10142 10143 // Plan how to best vectorize, return the best VF and its cost. 
10144 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10145 10146 // If we are stress testing VPlan builds, do not attempt to generate vector 10147 // code. Masked vector code generation support will follow soon. 10148 // Also, do not attempt to vectorize if no vector code will be produced. 10149 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 10150 return false; 10151 10152 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 10153 10154 { 10155 bool AddBranchWeights = 10156 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 10157 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), 10158 AddBranchWeights); 10159 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10160 VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan); 10161 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10162 << L->getHeader()->getParent()->getName() << "\"\n"); 10163 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 10164 } 10165 10166 reportVectorization(ORE, L, VF, 1); 10167 10168 // Mark the loop as already vectorized to avoid vectorizing again. 10169 Hints.setAlreadyVectorized(); 10170 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10171 return true; 10172 } 10173 10174 // Emit a remark if there are stores to floats that required a floating point 10175 // extension. If the vectorized loop was generated with floating point, there 10176 // will be a performance penalty from the conversion overhead and the change in 10177 // the vector width. 10178 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10179 SmallVector<Instruction *, 4> Worklist; 10180 for (BasicBlock *BB : L->getBlocks()) { 10181 for (Instruction &Inst : *BB) { 10182 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10183 if (S->getValueOperand()->getType()->isFloatTy()) 10184 Worklist.push_back(S); 10185 } 10186 } 10187 } 10188 10189 // Traverse the floating point stores upwards, searching for floating point 10190 // conversions. 10191 SmallPtrSet<const Instruction *, 4> Visited; 10192 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10193 while (!Worklist.empty()) { 10194 auto *I = Worklist.pop_back_val(); 10195 if (!L->contains(I)) 10196 continue; 10197 if (!Visited.insert(I).second) 10198 continue; 10199 10200 // Emit a remark if the floating point store required a floating 10201 // point conversion. 10202 // TODO: More work could be done to identify the root cause such as a 10203 // constant or a function return type and point the user to it. 10204 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10205 ORE->emit([&]() { 10206 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10207 I->getDebugLoc(), L->getHeader()) 10208 << "floating point conversion changes vector width. " 10209 << "Mixed floating point precision requires an up/down " 10210 << "cast that will negatively impact performance."; 10211 }); 10212 10213 for (Use &Op : I->operands()) 10214 if (auto *OpI = dyn_cast<Instruction>(Op)) 10215 Worklist.push_back(OpI); 10216 } 10217 } 10218 10219 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 10220 VectorizationFactor &VF, Loop *L, 10221 const TargetTransformInfo &TTI, 10222 PredicatedScalarEvolution &PSE, 10223 ScalarEpilogueLowering SEL) { 10224 InstructionCost CheckCost = Checks.getCost(); 10225 if (!CheckCost.isValid()) 10226 return false; 10227 10228 // When interleaving only, scalar and vector cost will be equal, which in turn 10229 // would lead to a divide by 0. Fall back to hard threshold.
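// Connecting note (sketch): in the interleave-only case VF is 1, so the
// denominator ScalarC * VF - VecC used further below would be roughly zero;
// the check cost is therefore compared directly against
// VectorizeMemoryCheckThreshold here instead of the ratio-based bound.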
10230 if (VF.Width.isScalar()) { 10231 if (CheckCost > VectorizeMemoryCheckThreshold) { 10232 LLVM_DEBUG( 10233 dbgs() 10234 << "LV: Interleaving only is not profitable due to runtime checks\n"); 10235 return false; 10236 } 10237 return true; 10238 } 10239 10240 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 10241 uint64_t ScalarC = *VF.ScalarCost.getValue(); 10242 if (ScalarC == 0) 10243 return true; 10244 10245 // First, compute the minimum iteration count required so that the vector 10246 // loop outperforms the scalar loop. 10247 // The total cost of the scalar loop is 10248 // ScalarC * TC 10249 // where 10250 // * TC is the actual trip count of the loop. 10251 // * ScalarC is the cost of a single scalar iteration. 10252 // 10253 // The total cost of the vector loop is 10254 // RtC + VecC * (TC / VF) + EpiC 10255 // where 10256 // * RtC is the cost of the generated runtime checks 10257 // * VecC is the cost of a single vector iteration. 10258 // * TC is the actual trip count of the loop 10259 // * VF is the vectorization factor 10260 // * EpiCost is the cost of the generated epilogue, including the cost 10261 // of the remaining scalar operations. 10262 // 10263 // Vectorization is profitable once the total vector cost is less than the 10264 // total scalar cost: 10265 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 10266 // 10267 // Now we can compute the minimum required trip count TC as 10268 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC 10269 // 10270 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 10271 // the computations are performed on doubles, not integers and the result 10272 // is rounded up, hence we get an upper estimate of the TC. 10273 unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width); 10274 uint64_t RtC = *CheckCost.getValue(); 10275 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); 10276 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); 10277 10278 // Second, compute a minimum iteration count so that the cost of the 10279 // runtime checks is only a fraction of the total scalar loop cost. This 10280 // adds a loop-dependent bound on the overhead incurred if the runtime 10281 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 10282 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 10283 // cost, compute 10284 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 10285 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC); 10286 10287 // Now pick the larger minimum. If it is not a multiple of VF and a scalar 10288 // epilogue is allowed, choose the next closest multiple of VF. This should 10289 // partly compensate for ignoring the epilogue cost. 10290 uint64_t MinTC = std::max(MinTC1, MinTC2); 10291 if (SEL == CM_ScalarEpilogueAllowed) 10292 MinTC = alignTo(MinTC, IntVF); 10293 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); 10294 10295 LLVM_DEBUG( 10296 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 10297 << VF.MinProfitableTripCount << "\n"); 10298 10299 // Skip vectorization if the expected trip count is less than the minimum 10300 // required trip count. 
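// Worked example with made-up costs (illustration only, not measured data):
// with ScalarC = 4, VecC = 10, RtC = 28 and an estimated VF of 4,
// MinTC1 = ceil(28 * 4 / (4 * 4 - 10)) = 19 and MinTC2 = ceil(28 * 10 / 4) = 70,
// so the minimum profitable trip count is 70, rounded up to 72 (the next
// multiple of VF) when a scalar epilogue is allowed.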
10301 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) { 10302 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10303 VF.MinProfitableTripCount)) { 10304 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10305 "trip count < minimum profitable VF (" 10306 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10307 << ")\n"); 10308 10309 return false; 10310 } 10311 } 10312 return true; 10313 } 10314 10315 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10316 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10317 !EnableLoopInterleaving), 10318 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10319 !EnableLoopVectorization) {} 10320 10321 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue 10322 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that 10323 /// don't have a corresponding wide induction in \p EpiPlan. 10324 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { 10325 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those 10326 // will need their resume-values computed in the main vector loop. Others 10327 // can be removed from the main VPlan. 10328 SmallPtrSet<PHINode *, 2> EpiWidenedPhis; 10329 for (VPRecipeBase &R : 10330 EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 10331 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10332 continue; 10333 EpiWidenedPhis.insert( 10334 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue())); 10335 } 10336 for (VPRecipeBase &R : make_early_inc_range( 10337 *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) { 10338 auto *VPIRInst = cast<VPIRInstruction>(&R); 10339 auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction()); 10340 if (!IRI) 10341 break; 10342 if (EpiWidenedPhis.contains(IRI)) 10343 continue; 10344 // There is no corresponding wide induction in the epilogue plan that would 10345 // need a resume value. Remove the VPIRInst wrapping the scalar header phi 10346 // together with the corresponding ResumePhi. The resume values for the 10347 // scalar loop will be created during execution of EpiPlan. 10348 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe(); 10349 VPIRInst->eraseFromParent(); 10350 ResumePhi->eraseFromParent(); 10351 } 10352 VPlanTransforms::removeDeadRecipes(MainPlan); 10353 10354 using namespace VPlanPatternMatch; 10355 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader(); 10356 VPValue *VectorTC = &MainPlan.getVectorTripCount(); 10357 // If there is a suitable resume value for the canonical induction in the 10358 // scalar (which will become vector) epilogue loop we are done. Otherwise 10359 // create it below. 10360 if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) { 10361 return match(&R, m_VPInstruction<VPInstruction::ResumePhi>( 10362 m_Specific(VectorTC), m_SpecificInt(0))); 10363 })) 10364 return; 10365 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin()); 10366 ScalarPHBuilder.createNaryOp( 10367 VPInstruction::ResumePhi, 10368 {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {}, 10369 "vec.epilog.resume.val"); 10370 } 10371 10372 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded 10373 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. 
10374 static void 10375 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, 10376 const SCEV2ValueTy &ExpandedSCEVs, 10377 const EpilogueLoopVectorizationInfo &EPI) { 10378 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); 10379 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10380 Header->setName("vec.epilog.vector.body"); 10381 10382 // Re-use the trip count and steps expanded for the main loop, as 10383 // skeleton creation needs it as a value that dominates both the scalar 10384 // and vector epilogue loops 10385 // TODO: This is a workaround needed for epilogue vectorization and it 10386 // should be removed once induction resume value creation is done 10387 // directly in VPlan. 10388 for (auto &R : make_early_inc_range(*Plan.getEntry())) { 10389 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R); 10390 if (!ExpandR) 10391 continue; 10392 auto *ExpandedVal = 10393 Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second); 10394 ExpandR->replaceAllUsesWith(ExpandedVal); 10395 if (Plan.getTripCount() == ExpandR) 10396 Plan.resetTripCount(ExpandedVal); 10397 ExpandR->eraseFromParent(); 10398 } 10399 10400 // Ensure that the start values for all header phi recipes are updated before 10401 // vectorizing the epilogue loop. 10402 for (VPRecipeBase &R : Header->phis()) { 10403 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) { 10404 // When vectorizing the epilogue loop, the canonical induction start 10405 // value needs to be changed from zero to the value after the main 10406 // vector loop. Find the resume value created during execution of the main 10407 // VPlan. 10408 // FIXME: Improve modeling for canonical IV start values in the epilogue 10409 // loop. 10410 BasicBlock *MainMiddle = find_singleton<BasicBlock>( 10411 predecessors(L->getLoopPreheader()), 10412 [&EPI](BasicBlock *BB, bool) -> BasicBlock * { 10413 if (BB != EPI.MainLoopIterationCountCheck && 10414 BB != EPI.EpilogueIterationCountCheck && 10415 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck) 10416 return BB; 10417 return nullptr; 10418 }); 10419 using namespace llvm::PatternMatch; 10420 Type *IdxTy = IV->getScalarType(); 10421 PHINode *EPResumeVal = find_singleton<PHINode>( 10422 L->getLoopPreheader()->phis(), 10423 [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * { 10424 if (P.getType() == IdxTy && 10425 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount && 10426 match( 10427 P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck), 10428 m_SpecificInt(0))) 10429 return &P; 10430 return nullptr; 10431 }); 10432 assert(EPResumeVal && "must have a resume value for the canonical IV"); 10433 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); 10434 assert(all_of(IV->users(), 10435 [](const VPUser *U) { 10436 return isa<VPScalarIVStepsRecipe>(U) || 10437 isa<VPScalarCastRecipe>(U) || 10438 isa<VPDerivedIVRecipe>(U) || 10439 cast<VPInstruction>(U)->getOpcode() == 10440 Instruction::Add; 10441 }) && 10442 "the canonical IV should only be used by its increment or " 10443 "ScalarIVSteps when resetting the start value"); 10444 IV->setOperand(0, VPV); 10445 continue; 10446 } 10447 10448 Value *ResumeV = nullptr; 10449 // TODO: Move setting of resume values to prepareToExecute. 
10450 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10451 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr()) 10452 ->getIncomingValueForBlock(L->getLoopPreheader()); 10453 const RecurrenceDescriptor &RdxDesc = 10454 ReductionPhi->getRecurrenceDescriptor(); 10455 RecurKind RK = RdxDesc.getRecurrenceKind(); 10456 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 10457 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as 10458 // start value; compare the final value from the main vector loop 10459 // to the start value. 10460 IRBuilder<> Builder( 10461 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI()); 10462 ResumeV = 10463 Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); 10464 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { 10465 // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment 10466 // to the resume value. The resume value is adjusted to the sentinel 10467 // value when the final value from the main vector loop equals the start 10468 // value. This ensures correctness when the start value might not be 10469 // less than the minimum value of a monotonically increasing induction 10470 // variable. 10471 IRBuilder<> Builder( 10472 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI()); 10473 Value *Cmp = 10474 Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue()); 10475 ResumeV = 10476 Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV); 10477 } 10478 } else { 10479 // Retrieve the induction resume values for wide inductions from 10480 // their original phi nodes in the scalar loop. 10481 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode(); 10482 // Hook up to the PHINode generated by a ResumePhi recipe of main 10483 // loop VPlan, which feeds the scalar loop. 10484 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader()); 10485 } 10486 assert(ResumeV && "Must have a resume value"); 10487 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV); 10488 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10489 } 10490 } 10491 10492 bool LoopVectorizePass::processLoop(Loop *L) { 10493 assert((EnableVPlanNativePath || L->isInnermost()) && 10494 "VPlan-native path is not enabled. Only process inner loops."); 10495 10496 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10497 << L->getHeader()->getParent()->getName() << "' from " 10498 << L->getLocStr() << "\n"); 10499 10500 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10501 10502 LLVM_DEBUG( 10503 dbgs() << "LV: Loop hints:" 10504 << " force=" 10505 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10506 ? "disabled" 10507 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10508 ? "enabled" 10509 : "?")) 10510 << " width=" << Hints.getWidth() 10511 << " interleave=" << Hints.getInterleave() << "\n"); 10512 10513 // Function containing loop 10514 Function *F = L->getHeader()->getParent(); 10515 10516 // Looking at the diagnostic output is the only way to determine if a loop 10517 // was vectorized (other than looking at the IR or machine code), so it 10518 // is important to generate an optimization remark for each loop. Most of 10519 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10520 // generated as OptimizationRemark and OptimizationRemarkMissed are 10521 // less verbose reporting vectorized loops and unvectorized loops that may 10522 // benefit from vectorization, respectively. 
10523 10524 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10525 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10526 return false; 10527 } 10528 10529 PredicatedScalarEvolution PSE(*SE, *L); 10530 10531 // Check if it is legal to vectorize the loop. 10532 LoopVectorizationRequirements Requirements; 10533 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 10534 &Requirements, &Hints, DB, AC, BFI, PSI); 10535 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10536 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10537 Hints.emitRemarkWithHints(); 10538 return false; 10539 } 10540 10541 if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { 10542 reportVectorizationFailure("Auto-vectorization of loops with uncountable " 10543 "early exit is not enabled", 10544 "UncountableEarlyExitLoopsDisabled", ORE, L); 10545 return false; 10546 } 10547 10548 if (LVL.hasStructVectorCall()) { 10549 reportVectorizationFailure("Auto-vectorization of calls that return struct " 10550 "types is not yet supported", 10551 "StructCallVectorizationUnsupported", ORE, L); 10552 return false; 10553 } 10554 10555 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10556 // here. They may require CFG and instruction level transformations before 10557 // even evaluating whether vectorization is profitable. Since we cannot modify 10558 // the incoming IR, we need to build VPlan upfront in the vectorization 10559 // pipeline. 10560 if (!L->isInnermost()) 10561 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10562 ORE, BFI, PSI, Hints, Requirements); 10563 10564 assert(L->isInnermost() && "Inner loop expected."); 10565 10566 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10567 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10568 10569 // If an override option has been passed in for interleaved accesses, use it. 10570 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10571 UseInterleaved = EnableInterleavedMemAccesses; 10572 10573 // Analyze interleaved memory accesses. 10574 if (UseInterleaved) 10575 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10576 10577 if (LVL.hasUncountableEarlyExit()) { 10578 BasicBlock *LoopLatch = L->getLoopLatch(); 10579 if (IAI.requiresScalarEpilogue() || 10580 any_of(LVL.getCountableExitingBlocks(), 10581 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) { 10582 reportVectorizationFailure("Auto-vectorization of early exit loops " 10583 "requiring a scalar epilogue is unsupported", 10584 "UncountableEarlyExitUnsupported", ORE, L); 10585 return false; 10586 } 10587 } 10588 10589 // Check the function attributes and profiles to find out if this function 10590 // should be optimized for size. 10591 ScalarEpilogueLowering SEL = 10592 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); 10593 10594 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10595 // count by optimizing for size, to minimize overheads. 10596 auto ExpectedTC = getSmallBestKnownTC(PSE, L); 10597 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10598 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10599 << "This loop is worth vectorizing only if no scalar " 10600 << "iteration overheads are incurred."); 10601 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10602 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10603 else { 10604 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { 10605 LLVM_DEBUG(dbgs() << "\n"); 10606 // Predicated, tail-folded loops are efficient even when the loop 10607 // iteration count is low. However, setting the epilogue policy to 10608 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops 10609 // with runtime checks. It's more effective to let 10610 // `areRuntimeChecksProfitable` determine if vectorization is beneficial 10611 // for the loop. 10612 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) 10613 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10614 } else { 10615 LLVM_DEBUG(dbgs() << " But the target considers the trip count too " 10616 "small to consider vectorizing.\n"); 10617 reportVectorizationFailure( 10618 "The trip count is below the minimal threshold value.", 10619 "loop trip count is too low, avoiding vectorization", 10620 "LowTripCount", ORE, L); 10621 Hints.emitRemarkWithHints(); 10622 return false; 10623 } 10624 } 10625 } 10626 10627 // Check the function attributes to see if implicit floats or vectors are 10628 // allowed. 10629 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10630 reportVectorizationFailure( 10631 "Can't vectorize when the NoImplicitFloat attribute is used", 10632 "loop not vectorized due to NoImplicitFloat attribute", 10633 "NoImplicitFloat", ORE, L); 10634 Hints.emitRemarkWithHints(); 10635 return false; 10636 } 10637 10638 // Check if the target supports potentially unsafe FP vectorization. 10639 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10640 // for the target we're vectorizing for, to make sure none of the 10641 // additional fp-math flags can help. 10642 if (Hints.isPotentiallyUnsafe() && 10643 TTI->isFPVectorizationPotentiallyUnsafe()) { 10644 reportVectorizationFailure( 10645 "Potentially unsafe FP op prevents vectorization", 10646 "loop not vectorized due to unsafe FP support.", 10647 "UnsafeFP", ORE, L); 10648 Hints.emitRemarkWithHints(); 10649 return false; 10650 } 10651 10652 bool AllowOrderedReductions; 10653 // If the flag is set, use that instead and override the TTI behaviour. 10654 if (ForceOrderedReductions.getNumOccurrences() > 0) 10655 AllowOrderedReductions = ForceOrderedReductions; 10656 else 10657 AllowOrderedReductions = TTI->enableOrderedReductions(); 10658 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10659 ORE->emit([&]() { 10660 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10661 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10662 ExactFPMathInst->getDebugLoc(), 10663 ExactFPMathInst->getParent()) 10664 << "loop not vectorized: cannot prove it is safe to reorder " 10665 "floating-point operations"; 10666 }); 10667 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10668 "reorder floating-point operations\n"); 10669 Hints.emitRemarkWithHints(); 10670 return false; 10671 } 10672 10673 // Use the cost model. 10674 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10675 F, &Hints, IAI); 10676 // Use the planner for vectorization. 10677 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, 10678 ORE); 10679 10680 // Get user vectorization factor and interleave count.
10681 ElementCount UserVF = Hints.getWidth(); 10682 unsigned UserIC = Hints.getInterleave(); 10683 10684 // Plan how to best vectorize. 10685 LVP.plan(UserVF, UserIC); 10686 VectorizationFactor VF = LVP.computeBestVF(); 10687 unsigned IC = 1; 10688 10689 if (ORE->allowExtraAnalysis(LV_NAME)) 10690 LVP.emitInvalidCostRemarks(ORE); 10691 10692 bool AddBranchWeights = 10693 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 10694 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), 10695 AddBranchWeights); 10696 if (LVP.hasPlanWithVF(VF.Width)) { 10697 // Select the interleave count. 10698 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 10699 10700 unsigned SelectedIC = std::max(IC, UserIC); 10701 // Optimistically generate runtime checks if they are needed. Drop them if 10702 // they turn out to not be profitable. 10703 if (VF.Width.isVector() || SelectedIC > 1) 10704 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10705 10706 // Check if it is profitable to vectorize with runtime checks. 10707 bool ForceVectorization = 10708 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10709 if (!ForceVectorization && 10710 !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) { 10711 ORE->emit([&]() { 10712 return OptimizationRemarkAnalysisAliasing( 10713 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10714 L->getHeader()) 10715 << "loop not vectorized: cannot prove it is safe to reorder " 10716 "memory operations"; 10717 }); 10718 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10719 Hints.emitRemarkWithHints(); 10720 return false; 10721 } 10722 } 10723 10724 // Identify the diagnostic messages that should be produced. 10725 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10726 bool VectorizeLoop = true, InterleaveLoop = true; 10727 if (VF.Width.isScalar()) { 10728 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10729 VecDiagMsg = std::make_pair( 10730 "VectorizationNotBeneficial", 10731 "the cost-model indicates that vectorization is not beneficial"); 10732 VectorizeLoop = false; 10733 } 10734 10735 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { 10736 // Tell the user interleaving was avoided up-front, despite being explicitly 10737 // requested. 10738 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10739 "interleaving should be avoided up front\n"); 10740 IntDiagMsg = std::make_pair( 10741 "InterleavingAvoided", 10742 "Ignoring UserIC, because interleaving was avoided up front"); 10743 InterleaveLoop = false; 10744 } else if (IC == 1 && UserIC <= 1) { 10745 // Tell the user interleaving is not beneficial. 10746 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10747 IntDiagMsg = std::make_pair( 10748 "InterleavingNotBeneficial", 10749 "the cost-model indicates that interleaving is not beneficial"); 10750 InterleaveLoop = false; 10751 if (UserIC == 1) { 10752 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10753 IntDiagMsg.second += 10754 " and is explicitly disabled or interleave count is set to 1"; 10755 } 10756 } else if (IC > 1 && UserIC == 1) { 10757 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10758 LLVM_DEBUG( 10759 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10760 IntDiagMsg = std::make_pair( 10761 "InterleavingBeneficialButDisabled", 10762 "the cost-model indicates that interleaving is beneficial " 10763 "but is explicitly disabled or interleave count is set to 1"); 10764 InterleaveLoop = false; 10765 } 10766 10767 // If there is a histogram in the loop, do not just interleave without 10768 // vectorizing. The order of operations will be incorrect without the 10769 // histogram intrinsics, which are only used for recipes with VF > 1. 10770 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) { 10771 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due " 10772 << "to histogram operations.\n"); 10773 IntDiagMsg = std::make_pair( 10774 "HistogramPreventsScalarInterleaving", 10775 "Unable to interleave without vectorization due to constraints on " 10776 "the order of histogram operations"); 10777 InterleaveLoop = false; 10778 } 10779 10780 // Override IC if user provided an interleave count. 10781 IC = UserIC > 0 ? UserIC : IC; 10782 10783 // Emit diagnostic messages, if any. 10784 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10785 if (!VectorizeLoop && !InterleaveLoop) { 10786 // Do not vectorize or interleave the loop. 10787 ORE->emit([&]() { 10788 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10789 L->getStartLoc(), L->getHeader()) 10790 << VecDiagMsg.second; 10791 }); 10792 ORE->emit([&]() { 10793 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10794 L->getStartLoc(), L->getHeader()) 10795 << IntDiagMsg.second; 10796 }); 10797 return false; 10798 } 10799 10800 if (!VectorizeLoop && InterleaveLoop) { 10801 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10802 ORE->emit([&]() { 10803 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10804 L->getStartLoc(), L->getHeader()) 10805 << VecDiagMsg.second; 10806 }); 10807 } else if (VectorizeLoop && !InterleaveLoop) { 10808 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10809 << ") in " << L->getLocStr() << '\n'); 10810 ORE->emit([&]() { 10811 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10812 L->getStartLoc(), L->getHeader()) 10813 << IntDiagMsg.second; 10814 }); 10815 } else if (VectorizeLoop && InterleaveLoop) { 10816 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10817 << ") in " << L->getLocStr() << '\n'); 10818 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10819 } 10820 10821 bool DisableRuntimeUnroll = false; 10822 MDNode *OrigLoopID = L->getLoopID(); 10823 { 10824 using namespace ore; 10825 if (!VectorizeLoop) { 10826 assert(IC > 1 && "interleave count should not be 1 or 0"); 10827 // If we decided that it is not legal to vectorize the loop, then 10828 // interleave it. 10829 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 10830 InnerLoopVectorizer Unroller( 10831 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), 10832 ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan); 10833 10834 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10835 10836 ORE->emit([&]() { 10837 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10838 L->getHeader()) 10839 << "interleaved loop (interleaved count: " 10840 << NV("InterleaveCount", IC) << ")"; 10841 }); 10842 } else { 10843 // If we decided that it is *legal* to vectorize the loop, then do it.
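// Sketch of the flow below: the main loop is vectorized at VF.Width; if a
// profitable epilogue VF is found, the remainder is vectorized a second time
// at that smaller VF (epilogue vectorization), otherwise a single
// InnerLoopVectorizer run handles the loop with a scalar remainder.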

      VPlan &BestPlan = LVP.getPlanFor(VF.Width);
      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
      if (EpilogueVF.Width.isVector()) {
        std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
        preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
                                          BestEpiPlan);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks,
                                           *BestMainPlan);
        auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
                                             *BestMainPlan, MainILV, DT, false);
        ++LoopsVectorized;

        // The second pass vectorizes the epilogue and adjusts the control-flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks, BestEpiPlan);
        EpilogILV.setTripCount(MainILV.getTripCount());
        preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);

        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT, true, &ExpandedSCEVs);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                               VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                               PSI, Checks, BestPlan);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      reportVectorization(ORE, L, VF, IC);
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
         "DT not preserved correctly");

  std::optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID) {
    L->setLoopID(*RemainderLoopID);
  } else {
    if (DisableRuntimeUnroll)
      addRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
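  // For example, a target that reports zero vector registers but a scalar
  // max interleave factor of 2 or more still goes through the pass so that
  // interleaving can expose instruction-level parallelism.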
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

    if (Changed) {
      LAIs->clear();

#ifndef NDEBUG
      if (VerifySCEV)
        SE->verify();
#endif
    }
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  LI = &AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI->empty())
    return PreservedAnalyses::all();
  SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  TTI = &AM.getResult<TargetIRAnalysis>(F);
  DT = &AM.getResult<DominatorTreeAnalysis>(F);
  TLI = &AM.getResult<TargetLibraryAnalysis>(F);
  AC = &AM.getResult<AssumptionAnalysis>(F);
  DB = &AM.getResult<DemandedBitsAnalysis>(F);
  ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  LAIs = &AM.getResult<LoopAccessAnalysis>(F);

  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BFI = nullptr;
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result = runImpl(F);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<ScalarEvolutionAnalysis>();
  PA.preserve<LoopAccessAnalysis>();

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
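    // Requesting ShouldRunExtraVectorPasses caches this marker analysis for
    // the function; pipelines that wrap clean-up passes in a manager keyed on
    // it (e.g. ExtraVectorPassManager in the default pipelines) will then run
    // those passes for this function.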
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << '>';
}
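
// Illustrative usage: with the new pass manager, the options printed above
// correspond roughly to a textual pipeline such as
//   opt -passes='function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>)'
// which is what printPipeline reproduces when a constructed pipeline is
// printed back.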