1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanAnalysis.h" 61 #include "VPlanHCFGBuilder.h" 62 #include "VPlanPatternMatch.h" 63 #include "VPlanTransforms.h" 64 #include "VPlanUtils.h" 65 #include "VPlanVerifier.h" 66 #include "llvm/ADT/APInt.h" 67 #include "llvm/ADT/ArrayRef.h" 68 #include "llvm/ADT/DenseMap.h" 69 #include "llvm/ADT/DenseMapInfo.h" 70 #include "llvm/ADT/Hashing.h" 71 #include "llvm/ADT/MapVector.h" 72 #include "llvm/ADT/STLExtras.h" 73 #include "llvm/ADT/SmallPtrSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/TypeSwitch.h" 79 #include "llvm/ADT/iterator_range.h" 80 #include "llvm/Analysis/AssumptionCache.h" 81 #include "llvm/Analysis/BasicAliasAnalysis.h" 82 #include "llvm/Analysis/BlockFrequencyInfo.h" 83 #include "llvm/Analysis/CFG.h" 84 #include "llvm/Analysis/CodeMetrics.h" 85 #include "llvm/Analysis/DemandedBits.h" 86 #include "llvm/Analysis/GlobalsModRef.h" 87 #include "llvm/Analysis/LoopAccessAnalysis.h" 88 #include "llvm/Analysis/LoopAnalysisManager.h" 89 #include "llvm/Analysis/LoopInfo.h" 90 #include "llvm/Analysis/LoopIterator.h" 91 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 92 #include "llvm/Analysis/ProfileSummaryInfo.h" 93 #include "llvm/Analysis/ScalarEvolution.h" 94 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 95 #include "llvm/Analysis/TargetLibraryInfo.h" 96 #include "llvm/Analysis/TargetTransformInfo.h" 97 #include "llvm/Analysis/ValueTracking.h" 98 #include "llvm/Analysis/VectorUtils.h" 99 #include "llvm/IR/Attributes.h" 100 #include "llvm/IR/BasicBlock.h" 101 #include "llvm/IR/CFG.h" 102 #include "llvm/IR/Constant.h" 103 #include "llvm/IR/Constants.h" 104 #include "llvm/IR/DataLayout.h" 105 #include "llvm/IR/DebugInfo.h" 106 #include "llvm/IR/DebugLoc.h" 107 #include "llvm/IR/DerivedTypes.h" 108 #include "llvm/IR/DiagnosticInfo.h" 109 #include "llvm/IR/Dominators.h" 110 #include "llvm/IR/Function.h" 111 #include "llvm/IR/IRBuilder.h" 112 #include "llvm/IR/InstrTypes.h" 113 #include "llvm/IR/Instruction.h" 114 #include "llvm/IR/Instructions.h" 115 #include "llvm/IR/IntrinsicInst.h" 116 #include "llvm/IR/Intrinsics.h" 117 #include "llvm/IR/MDBuilder.h" 118 #include "llvm/IR/Metadata.h" 119 #include "llvm/IR/Module.h" 120 #include "llvm/IR/Operator.h" 121 #include "llvm/IR/PatternMatch.h" 122 #include "llvm/IR/ProfDataUtils.h" 123 #include "llvm/IR/Type.h" 124 #include "llvm/IR/Use.h" 125 #include "llvm/IR/User.h" 126 #include "llvm/IR/Value.h" 127 #include "llvm/IR/Verifier.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Debug.h" 131 #include "llvm/Support/ErrorHandling.h" 132 #include "llvm/Support/InstructionCost.h" 133 #include "llvm/Support/MathExtras.h" 134 #include "llvm/Support/NativeFormatting.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/Local.h" 139 #include "llvm/Transforms/Utils/LoopSimplify.h" 140 #include "llvm/Transforms/Utils/LoopUtils.h" 141 #include "llvm/Transforms/Utils/LoopVersioning.h" 142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 143 #include 
"llvm/Transforms/Utils/SizeOpts.h" 144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 145 #include <algorithm> 146 #include <cassert> 147 #include <cstdint> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold( 202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks")); 204 205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 206 // that predication is preferred, and this lists all options. I.e., the 207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 208 // and predicate the instructions accordingly. 
If tail-folding fails, there are 209 // different fallback strategies depending on these values: 210 namespace PreferPredicateTy { 211 enum Option { 212 ScalarEpilogue = 0, 213 PredicateElseScalarEpilogue, 214 PredicateOrDontVectorize 215 }; 216 } // namespace PreferPredicateTy 217 218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 219 "prefer-predicate-over-epilogue", 220 cl::init(PreferPredicateTy::ScalarEpilogue), 221 cl::Hidden, 222 cl::desc("Tail-folding and predication preferences over creating a scalar " 223 "epilogue loop."), 224 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 225 "scalar-epilogue", 226 "Don't tail-predicate loops, create scalar epilogue"), 227 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 228 "predicate-else-scalar-epilogue", 229 "prefer tail-folding, create scalar epilogue if tail " 230 "folding fails."), 231 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 232 "predicate-dont-vectorize", 233 "prefers tail-folding, don't attempt vectorization if " 234 "tail-folding fails."))); 235 236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( 237 "force-tail-folding-style", cl::desc("Force the tail folding style"), 238 cl::init(TailFoldingStyle::None), 239 cl::values( 240 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), 241 clEnumValN( 242 TailFoldingStyle::Data, "data", 243 "Create lane mask for data only, using active.lane.mask intrinsic"), 244 clEnumValN(TailFoldingStyle::DataWithoutLaneMask, 245 "data-without-lane-mask", 246 "Create lane mask with compare/stepvector"), 247 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", 248 "Create lane mask using active.lane.mask intrinsic, and use " 249 "it for both data and control flow"), 250 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, 251 "data-and-control-without-rt-check", 252 "Similar to data-and-control, but remove the runtime check"), 253 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", 254 "Use predicated EVL instructions for tail folding. If EVL " 255 "is unsupported, fallback to data-without-lane-mask."))); 256 257 static cl::opt<bool> MaximizeBandwidth( 258 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 259 cl::desc("Maximize bandwidth when selecting vectorization factor which " 260 "will be determined by the smallest type in loop.")); 261 262 static cl::opt<bool> EnableInterleavedMemAccesses( 263 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 264 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 265 266 /// An interleave-group may need masking if it resides in a block that needs 267 /// predication, or in order to mask away gaps. 
268 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 269 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 270 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 271 272 static cl::opt<unsigned> ForceTargetNumScalarRegs( 273 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 274 cl::desc("A flag that overrides the target's number of scalar registers.")); 275 276 static cl::opt<unsigned> ForceTargetNumVectorRegs( 277 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 278 cl::desc("A flag that overrides the target's number of vector registers.")); 279 280 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 281 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 282 cl::desc("A flag that overrides the target's max interleave factor for " 283 "scalar loops.")); 284 285 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 286 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 287 cl::desc("A flag that overrides the target's max interleave factor for " 288 "vectorized loops.")); 289 290 cl::opt<unsigned> ForceTargetInstructionCost( 291 "force-target-instruction-cost", cl::init(0), cl::Hidden, 292 cl::desc("A flag that overrides the target's expected cost for " 293 "an instruction to a single constant value. Mostly " 294 "useful for getting consistent testing.")); 295 296 static cl::opt<bool> ForceTargetSupportsScalableVectors( 297 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 298 cl::desc( 299 "Pretend that scalable vectors are supported, even if the target does " 300 "not support them. This flag should only be used for testing.")); 301 302 static cl::opt<unsigned> SmallLoopCost( 303 "small-loop-cost", cl::init(20), cl::Hidden, 304 cl::desc( 305 "The cost of a loop that is considered 'small' by the interleaver.")); 306 307 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 308 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 309 cl::desc("Enable the use of the block frequency analysis to access PGO " 310 "heuristics minimizing code growth in cold regions and being more " 311 "aggressive in hot regions.")); 312 313 // Runtime interleave loops for load/store throughput. 314 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 315 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 316 cl::desc( 317 "Enable runtime interleaving until load/store ports are saturated")); 318 319 /// The number of stores in a loop that are allowed to need predication. 
320 static cl::opt<unsigned> NumberOfStoresToPredicate( 321 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 322 cl::desc("Max number of stores to be predicated behind an if.")); 323 324 static cl::opt<bool> EnableIndVarRegisterHeur( 325 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 326 cl::desc("Count the induction variable only once when interleaving")); 327 328 static cl::opt<bool> EnableCondStoresVectorization( 329 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 330 cl::desc("Enable if predication of stores during vectorization.")); 331 332 static cl::opt<unsigned> MaxNestedScalarReductionIC( 333 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 334 cl::desc("The maximum interleave count to use when interleaving a scalar " 335 "reduction in a nested loop.")); 336 337 static cl::opt<bool> 338 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 339 cl::Hidden, 340 cl::desc("Prefer in-loop vector reductions, " 341 "overriding the targets preference.")); 342 343 static cl::opt<bool> ForceOrderedReductions( 344 "force-ordered-reductions", cl::init(false), cl::Hidden, 345 cl::desc("Enable the vectorisation of loops with in-order (strict) " 346 "FP reductions")); 347 348 static cl::opt<bool> PreferPredicatedReductionSelect( 349 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 350 cl::desc( 351 "Prefer predicating a reduction operation over an after loop select.")); 352 353 namespace llvm { 354 cl::opt<bool> EnableVPlanNativePath( 355 "enable-vplan-native-path", cl::Hidden, 356 cl::desc("Enable VPlan-native vectorization path with " 357 "support for outer loop vectorization.")); 358 } // namespace llvm 359 360 // This flag enables the stress testing of the VPlan H-CFG construction in the 361 // VPlan-native vectorization path. It must be used in conjuction with 362 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 363 // verification of the H-CFGs built. 364 static cl::opt<bool> VPlanBuildStressTest( 365 "vplan-build-stress-test", cl::init(false), cl::Hidden, 366 cl::desc( 367 "Build VPlan for every supported loop nest in the function and bail " 368 "out right after the build (stress test the VPlan H-CFG construction " 369 "in the VPlan-native vectorization path).")); 370 371 cl::opt<bool> llvm::EnableLoopInterleaving( 372 "interleave-loops", cl::init(true), cl::Hidden, 373 cl::desc("Enable loop interleaving in Loop vectorization passes")); 374 cl::opt<bool> llvm::EnableLoopVectorization( 375 "vectorize-loops", cl::init(true), cl::Hidden, 376 cl::desc("Run the Loop vectorization passes")); 377 378 static cl::opt<cl::boolOrDefault> ForceSafeDivisor( 379 "force-widen-divrem-via-safe-divisor", cl::Hidden, 380 cl::desc( 381 "Override cost based safe divisor widening for div/rem instructions")); 382 383 static cl::opt<bool> UseWiderVFIfCallVariantsPresent( 384 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), 385 cl::Hidden, 386 cl::desc("Try wider VFs if they enable the use of vector variants")); 387 388 static cl::opt<bool> EnableEarlyExitVectorization( 389 "enable-early-exit-vectorization", cl::init(false), cl::Hidden, 390 cl::desc( 391 "Enable vectorization of early exit loops with uncountable exits.")); 392 393 // Likelyhood of bypassing the vectorized loop because assumptions about SCEV 394 // variables not overflowing do not hold. See `emitSCEVChecks`. 
395 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; 396 // Likelyhood of bypassing the vectorized loop because pointers overlap. See 397 // `emitMemRuntimeChecks`. 398 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; 399 // Likelyhood of bypassing the vectorized loop because there are zero trips left 400 // after prolog. See `emitIterationCountCheck`. 401 static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; 402 403 /// A helper function that returns true if the given type is irregular. The 404 /// type is irregular if its allocated size doesn't equal the store size of an 405 /// element of the corresponding vector type. 406 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 407 // Determine if an array of N elements of type Ty is "bitcast compatible" 408 // with a <N x Ty> vector. 409 // This is only true if there is no padding between the array elements. 410 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 411 } 412 413 /// Returns "best known" trip count for the specified loop \p L as defined by 414 /// the following procedure: 415 /// 1) Returns exact trip count if it is known. 416 /// 2) Returns expected trip count according to profile data if any. 417 /// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax. 418 /// 4) Returns std::nullopt if all of the above failed. 419 static std::optional<unsigned> 420 getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, 421 bool CanUseConstantMax = true) { 422 // Check if exact trip count is known. 423 if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L)) 424 return ExpectedTC; 425 426 // Check if there is an expected trip count available from profile data. 427 if (LoopVectorizeWithBlockFrequency) 428 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 429 return *EstimatedTC; 430 431 if (!CanUseConstantMax) 432 return std::nullopt; 433 434 // Check if upper bound estimate is known. 435 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount()) 436 return ExpectedTC; 437 438 return std::nullopt; 439 } 440 441 namespace { 442 // Forward declare GeneratedRTChecks. 443 class GeneratedRTChecks; 444 445 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; 446 } // namespace 447 448 namespace llvm { 449 450 AnalysisKey ShouldRunExtraVectorPasses::Key; 451 452 /// InnerLoopVectorizer vectorizes loops which contain only one basic 453 /// block to a specified vectorization factor (VF). 454 /// This class performs the widening of scalars into vectors, or multiple 455 /// scalars. This class also implements the following features: 456 /// * It inserts an epilogue loop for handling loops that don't have iteration 457 /// counts that are known to be a multiple of the vectorization factor. 458 /// * It handles the code generation for reduction variables. 459 /// * Scalarization (implementation using scalars) of un-vectorizable 460 /// instructions. 461 /// InnerLoopVectorizer does not perform any vectorization-legality 462 /// checks, and relies on the caller to check for the different legality 463 /// aspects. The InnerLoopVectorizer relies on the 464 /// LoopVectorizationLegality class to provide information about the induction 465 /// and reduction variables that were found to a given vectorization factor. 
466 class InnerLoopVectorizer { 467 public: 468 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 469 LoopInfo *LI, DominatorTree *DT, 470 const TargetLibraryInfo *TLI, 471 const TargetTransformInfo *TTI, AssumptionCache *AC, 472 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 473 ElementCount MinProfitableTripCount, 474 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 475 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 476 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, 477 VPlan &Plan) 478 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 479 AC(AC), ORE(ORE), VF(VecWidth), 480 MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), 481 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 482 PSI(PSI), RTChecks(RTChecks), Plan(Plan), 483 VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) { 484 // Query this against the original loop and save it here because the profile 485 // of the original loop header may change as the transformation happens. 486 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 487 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 488 } 489 490 virtual ~InnerLoopVectorizer() = default; 491 492 /// Create a new empty loop that will contain vectorized instructions later 493 /// on, while the old loop will be used as the scalar remainder. Control flow 494 /// is generated around the vectorized (and scalar epilogue) loops consisting 495 /// of various checks and bypasses. Return the pre-header block of the new 496 /// loop. In the case of epilogue vectorization, this function is overriden to 497 /// handle the more complex control flow around the loops. \p ExpandedSCEVs is 498 /// used to look up SCEV expansions for expressions needed during skeleton 499 /// creation. 500 virtual BasicBlock * 501 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); 502 503 /// Fix the vectorized code, taking care of header phi's, and more. 504 void fixVectorizedLoop(VPTransformState &State); 505 506 // Return true if any runtime check is added. 507 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 508 509 /// A helper function to scalarize a single Instruction in the innermost loop. 510 /// Generates a sequence of scalar instances for each lane between \p MinLane 511 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 512 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 513 /// Instr's operands. 514 void scalarizeInstruction(const Instruction *Instr, 515 VPReplicateRecipe *RepRecipe, const VPLane &Lane, 516 VPTransformState &State); 517 518 /// Fix the non-induction PHIs in \p Plan. 519 void fixNonInductionPHIs(VPTransformState &State); 520 521 /// Returns the original loop trip count. 522 Value *getTripCount() const { return TripCount; } 523 524 /// Used to set the trip count after ILV's construction and after the 525 /// preheader block has been executed. Note that this always holds the trip 526 /// count of the original loop for both main loop and epilogue vectorization. 527 void setTripCount(Value *TC) { TripCount = TC; } 528 529 // Retrieve the additional bypass value associated with an original 530 /// induction header phi. 531 Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const { 532 return Induction2AdditionalBypassValue.at(OrigPhi); 533 } 534 535 /// Return the additional bypass block which targets the scalar loop by 536 /// skipping the epilogue loop after completing the main loop. 
537 BasicBlock *getAdditionalBypassBlock() const { 538 assert(AdditionalBypassBlock && 539 "Trying to access AdditionalBypassBlock but it has not been set"); 540 return AdditionalBypassBlock; 541 } 542 543 protected: 544 friend class LoopVectorizationPlanner; 545 546 /// Set up the values of the IVs correctly when exiting the vector loop. 547 virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 548 Value *VectorTripCount, BasicBlock *MiddleBlock, 549 VPTransformState &State); 550 551 /// Iteratively sink the scalarized operands of a predicated instruction into 552 /// the block that was created for it. 553 void sinkScalarOperands(Instruction *PredInst); 554 555 /// Returns (and creates if needed) the trip count of the widened loop. 556 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); 557 558 /// Emit a bypass check to see if the vector trip count is zero, including if 559 /// it overflows. 560 void emitIterationCountCheck(BasicBlock *Bypass); 561 562 /// Emit a bypass check to see if all of the SCEV assumptions we've 563 /// had to make are correct. Returns the block containing the checks or 564 /// nullptr if no checks have been added. 565 BasicBlock *emitSCEVChecks(BasicBlock *Bypass); 566 567 /// Emit bypass checks to check any memory assumptions we may have made. 568 /// Returns the block containing the checks or nullptr if no checks have been 569 /// added. 570 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); 571 572 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 573 /// vector loop preheader, middle block and scalar preheader. 574 void createVectorLoopSkeleton(StringRef Prefix); 575 576 /// Create and record the values for induction variables to resume coming from 577 /// the additional bypass block. 578 void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs, 579 Value *MainVectorTripCount); 580 581 /// Allow subclasses to override and print debug traces before/after vplan 582 /// execution, when trace information is requested. 583 virtual void printDebugTracesAtStart() {} 584 virtual void printDebugTracesAtEnd() {} 585 586 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the 587 /// vector preheader and its predecessor, also connecting the new block to the 588 /// scalar preheader. 589 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); 590 591 /// The original loop. 592 Loop *OrigLoop; 593 594 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 595 /// dynamic knowledge to simplify SCEV expressions and converts them to a 596 /// more usable form. 597 PredicatedScalarEvolution &PSE; 598 599 /// Loop Info. 600 LoopInfo *LI; 601 602 /// Dominator Tree. 603 DominatorTree *DT; 604 605 /// Target Library Info. 606 const TargetLibraryInfo *TLI; 607 608 /// Target Transform Info. 609 const TargetTransformInfo *TTI; 610 611 /// Assumption Cache. 612 AssumptionCache *AC; 613 614 /// Interface to emit optimization remarks. 615 OptimizationRemarkEmitter *ORE; 616 617 /// The vectorization SIMD factor to use. Each vector will have this many 618 /// vector elements. 619 ElementCount VF; 620 621 ElementCount MinProfitableTripCount; 622 623 /// The vectorization unroll factor to use. Each scalar is vectorized to this 624 /// many different vector instructions. 625 unsigned UF; 626 627 /// The builder that we use 628 IRBuilder<> Builder; 629 630 // --- Vectorization state --- 631 632 /// The vector-loop preheader. 
633 BasicBlock *LoopVectorPreHeader; 634 635 /// The scalar-loop preheader. 636 BasicBlock *LoopScalarPreHeader; 637 638 /// Middle Block between the vector and the scalar. 639 BasicBlock *LoopMiddleBlock; 640 641 /// A list of all bypass blocks. The first block is the entry of the loop. 642 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 643 644 /// Store instructions that were predicated. 645 SmallVector<Instruction *, 4> PredicatedInstructions; 646 647 /// Trip count of the original loop. 648 Value *TripCount = nullptr; 649 650 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 651 Value *VectorTripCount = nullptr; 652 653 /// The legality analysis. 654 LoopVectorizationLegality *Legal; 655 656 /// The profitablity analysis. 657 LoopVectorizationCostModel *Cost; 658 659 // Record whether runtime checks are added. 660 bool AddedSafetyChecks = false; 661 662 /// BFI and PSI are used to check for profile guided size optimizations. 663 BlockFrequencyInfo *BFI; 664 ProfileSummaryInfo *PSI; 665 666 // Whether this loop should be optimized for size based on profile guided size 667 // optimizatios. 668 bool OptForSizeBasedOnProfile; 669 670 /// Structure to hold information about generated runtime checks, responsible 671 /// for cleaning the checks, if vectorization turns out unprofitable. 672 GeneratedRTChecks &RTChecks; 673 674 /// Mapping of induction phis to their additional bypass values. They 675 /// need to be added as operands to phi nodes in the scalar loop preheader 676 /// after the epilogue skeleton has been created. 677 DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue; 678 679 /// The additional bypass block which conditionally skips over the epilogue 680 /// loop after executing the main loop. Needed to resume inductions and 681 /// reductions during epilogue vectorization. 682 BasicBlock *AdditionalBypassBlock = nullptr; 683 684 VPlan &Plan; 685 686 /// The vector preheader block of \p Plan, used as target for check blocks 687 /// introduced during skeleton creation. 688 VPBlockBase *VectorPHVPB; 689 }; 690 691 /// Encapsulate information regarding vectorization of a loop and its epilogue. 692 /// This information is meant to be updated and used across two stages of 693 /// epilogue vectorization. 694 struct EpilogueLoopVectorizationInfo { 695 ElementCount MainLoopVF = ElementCount::getFixed(0); 696 unsigned MainLoopUF = 0; 697 ElementCount EpilogueVF = ElementCount::getFixed(0); 698 unsigned EpilogueUF = 0; 699 BasicBlock *MainLoopIterationCountCheck = nullptr; 700 BasicBlock *EpilogueIterationCountCheck = nullptr; 701 BasicBlock *SCEVSafetyCheck = nullptr; 702 BasicBlock *MemSafetyCheck = nullptr; 703 Value *TripCount = nullptr; 704 Value *VectorTripCount = nullptr; 705 VPlan &EpiloguePlan; 706 707 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, 708 ElementCount EVF, unsigned EUF, 709 VPlan &EpiloguePlan) 710 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF), 711 EpiloguePlan(EpiloguePlan) { 712 assert(EUF == 1 && 713 "A high UF for the epilogue loop is likely not beneficial."); 714 } 715 }; 716 717 /// An extension of the inner loop vectorizer that creates a skeleton for a 718 /// vectorized loop that has its epilogue (residual) also vectorized. 719 /// The idea is to run the vplan on a given loop twice, firstly to setup the 720 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 721 /// from the first step and vectorize the epilogue. 
This is achieved by 722 /// deriving two concrete strategy classes from this base class and invoking 723 /// them in succession from the loop vectorizer planner. 724 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 725 public: 726 InnerLoopAndEpilogueVectorizer( 727 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 728 DominatorTree *DT, const TargetLibraryInfo *TLI, 729 const TargetTransformInfo *TTI, AssumptionCache *AC, 730 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 731 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 732 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 733 GeneratedRTChecks &Checks, VPlan &Plan) 734 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 735 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, 736 CM, BFI, PSI, Checks, Plan), 737 EPI(EPI) {} 738 739 // Override this function to handle the more complex control flow around the 740 // three loops. 741 BasicBlock * 742 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final { 743 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); 744 } 745 746 /// The interface for creating a vectorized skeleton using one of two 747 /// different strategies, each corresponding to one execution of the vplan 748 /// as described above. 749 virtual BasicBlock * 750 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0; 751 752 /// Holds and updates state information required to vectorize the main loop 753 /// and its epilogue in two separate passes. This setup helps us avoid 754 /// regenerating and recomputing runtime safety checks. It also helps us to 755 /// shorten the iteration-count-check path length for the cases where the 756 /// iteration count of the loop is so small that the main vector loop is 757 /// completely skipped. 758 EpilogueLoopVectorizationInfo &EPI; 759 }; 760 761 /// A specialized derived class of inner loop vectorizer that performs 762 /// vectorization of *main* loops in the process of vectorizing loops and their 763 /// epilogues. 764 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 765 public: 766 EpilogueVectorizerMainLoop( 767 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 768 DominatorTree *DT, const TargetLibraryInfo *TLI, 769 const TargetTransformInfo *TTI, AssumptionCache *AC, 770 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 771 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 772 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 773 GeneratedRTChecks &Check, VPlan &Plan) 774 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 775 EPI, LVL, CM, BFI, PSI, Check, Plan) {} 776 /// Implements the interface for creating a vectorized skeleton using the 777 /// *main loop* strategy (ie the first pass of vplan execution). 778 BasicBlock * 779 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; 780 781 protected: 782 /// Emits an iteration count bypass check once for the main loop (when \p 783 /// ForEpilogue is false) and once for the epilogue loop (when \p 784 /// ForEpilogue is true). 
785 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); 786 void printDebugTracesAtStart() override; 787 void printDebugTracesAtEnd() override; 788 789 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 790 Value *VectorTripCount, BasicBlock *MiddleBlock, 791 VPTransformState &State) override {}; 792 }; 793 794 // A specialized derived class of inner loop vectorizer that performs 795 // vectorization of *epilogue* loops in the process of vectorizing loops and 796 // their epilogues. 797 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 798 public: 799 EpilogueVectorizerEpilogueLoop( 800 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 801 DominatorTree *DT, const TargetLibraryInfo *TLI, 802 const TargetTransformInfo *TTI, AssumptionCache *AC, 803 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 804 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 805 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 806 GeneratedRTChecks &Checks, VPlan &Plan) 807 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 808 EPI, LVL, CM, BFI, PSI, Checks, Plan) { 809 TripCount = EPI.TripCount; 810 } 811 /// Implements the interface for creating a vectorized skeleton using the 812 /// *epilogue loop* strategy (ie the second pass of vplan execution). 813 BasicBlock * 814 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; 815 816 protected: 817 /// Emits an iteration count bypass check after the main vector loop has 818 /// finished to see if there are any iterations left to execute by either 819 /// the vector epilogue or the scalar epilogue. 820 BasicBlock *emitMinimumVectorEpilogueIterCountCheck( 821 BasicBlock *Bypass, 822 BasicBlock *Insert); 823 void printDebugTracesAtStart() override; 824 void printDebugTracesAtEnd() override; 825 }; 826 } // end namespace llvm 827 828 /// Look for a meaningful debug location on the instruction or its operands. 829 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { 830 if (!I) 831 return DebugLoc(); 832 833 DebugLoc Empty; 834 if (I->getDebugLoc() != Empty) 835 return I->getDebugLoc(); 836 837 for (Use &Op : I->operands()) { 838 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 839 if (OpInst->getDebugLoc() != Empty) 840 return OpInst->getDebugLoc(); 841 } 842 843 return I->getDebugLoc(); 844 } 845 846 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 847 /// is passed, the message relates to that particular instruction. 848 #ifndef NDEBUG 849 static void debugVectorizationMessage(const StringRef Prefix, 850 const StringRef DebugMsg, 851 Instruction *I) { 852 dbgs() << "LV: " << Prefix << DebugMsg; 853 if (I != nullptr) 854 dbgs() << " " << *I; 855 else 856 dbgs() << '.'; 857 dbgs() << '\n'; 858 } 859 #endif 860 861 /// Create an analysis remark that explains why vectorization failed 862 /// 863 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 864 /// RemarkName is the identifier for the remark. If \p I is passed it is an 865 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 866 /// the location of the remark. If \p DL is passed, use it as debug location for 867 /// the remark. \return the remark object that can be streamed to. 868 static OptimizationRemarkAnalysis 869 createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, 870 Instruction *I, DebugLoc DL = {}) { 871 Value *CodeRegion = I ? 
I->getParent() : TheLoop->getHeader(); 872 // If debug location is attached to the instruction, use it. Otherwise if DL 873 // was not provided, use the loop's. 874 if (I && I->getDebugLoc()) 875 DL = I->getDebugLoc(); 876 else if (!DL) 877 DL = TheLoop->getStartLoc(); 878 879 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 880 } 881 882 namespace llvm { 883 884 /// Return a value for Step multiplied by VF. 885 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, 886 int64_t Step) { 887 assert(Ty->isIntegerTy() && "Expected an integer step"); 888 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); 889 } 890 891 /// Return the runtime value for VF. 892 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { 893 return B.CreateElementCount(Ty, VF); 894 } 895 896 void reportVectorizationFailure(const StringRef DebugMsg, 897 const StringRef OREMsg, const StringRef ORETag, 898 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 899 Instruction *I) { 900 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 901 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 902 ORE->emit( 903 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 904 << "loop not vectorized: " << OREMsg); 905 } 906 907 /// Reports an informative message: print \p Msg for debugging purposes as well 908 /// as an optimization remark. Uses either \p I as location of the remark, or 909 /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the 910 /// remark. If \p DL is passed, use it as debug location for the remark. 911 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 912 OptimizationRemarkEmitter *ORE, 913 Loop *TheLoop, Instruction *I = nullptr, 914 DebugLoc DL = {}) { 915 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 916 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 917 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, 918 I, DL) 919 << Msg); 920 } 921 922 /// Report successful vectorization of the loop. In case an outer loop is 923 /// vectorized, prepend "outer" to the vectorization remark. 924 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, 925 VectorizationFactor VF, unsigned IC) { 926 LLVM_DEBUG(debugVectorizationMessage( 927 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop", 928 nullptr)); 929 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer "; 930 ORE->emit([&]() { 931 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(), 932 TheLoop->getHeader()) 933 << "vectorized " << LoopType << "loop (vectorization width: " 934 << ore::NV("VectorizationFactor", VF.Width) 935 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")"; 936 }); 937 } 938 939 } // end namespace llvm 940 941 namespace llvm { 942 943 // Loop vectorization cost-model hints how the scalar epilogue loop should be 944 // lowered. 945 enum ScalarEpilogueLowering { 946 947 // The default: allowing scalar epilogues. 948 CM_ScalarEpilogueAllowed, 949 950 // Vectorization with OptForSize: don't allow epilogues. 951 CM_ScalarEpilogueNotAllowedOptSize, 952 953 // A special case of vectorisation with OptForSize: loops with a very small 954 // trip count are considered for vectorization under OptForSize, thereby 955 // making sure the cost of their loop body is dominant, free of runtime 956 // guards and scalar iteration overheads. 
957 CM_ScalarEpilogueNotAllowedLowTripLoop, 958 959 // Loop hint predicate indicating an epilogue is undesired. 960 CM_ScalarEpilogueNotNeededUsePredicate, 961 962 // Directive indicating we must either tail fold or not vectorize 963 CM_ScalarEpilogueNotAllowedUsePredicate 964 }; 965 966 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 967 968 /// LoopVectorizationCostModel - estimates the expected speedups due to 969 /// vectorization. 970 /// In many cases vectorization is not profitable. This can happen because of 971 /// a number of reasons. In this class we mainly attempt to predict the 972 /// expected speedup/slowdowns due to the supported instruction set. We use the 973 /// TargetTransformInfo to query the different backends for the cost of 974 /// different operations. 975 class LoopVectorizationCostModel { 976 friend class LoopVectorizationPlanner; 977 978 public: 979 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 980 PredicatedScalarEvolution &PSE, LoopInfo *LI, 981 LoopVectorizationLegality *Legal, 982 const TargetTransformInfo &TTI, 983 const TargetLibraryInfo *TLI, DemandedBits *DB, 984 AssumptionCache *AC, 985 OptimizationRemarkEmitter *ORE, const Function *F, 986 const LoopVectorizeHints *Hints, 987 InterleavedAccessInfo &IAI) 988 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 989 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 990 Hints(Hints), InterleaveInfo(IAI) {} 991 992 /// \return An upper bound for the vectorization factors (both fixed and 993 /// scalable). If the factors are 0, vectorization and interleaving should be 994 /// avoided up front. 995 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 996 997 /// \return True if runtime checks are required for vectorization, and false 998 /// otherwise. 999 bool runtimeChecksRequired(); 1000 1001 /// Setup cost-based decisions for user vectorization factor. 1002 /// \return true if the UserVF is a feasible VF to be chosen. 1003 bool selectUserVectorizationFactor(ElementCount UserVF) { 1004 collectUniformsAndScalars(UserVF); 1005 collectInstsToScalarize(UserVF); 1006 return expectedCost(UserVF).isValid(); 1007 } 1008 1009 /// \return The size (in bits) of the smallest and widest types in the code 1010 /// that needs to be vectorized. We ignore values that remain scalar such as 1011 /// 64 bit loop indices. 1012 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1013 1014 /// \return The desired interleave count. 1015 /// If interleave count has been specified by metadata it will be returned. 1016 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1017 /// are the selected vectorization factor and the cost of the selected VF. 1018 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); 1019 1020 /// Memory access instruction may be vectorized in more than one way. 1021 /// Form of instruction after vectorization depends on cost. 1022 /// This function takes cost-based decisions for Load/Store instructions 1023 /// and collects them in a map. This decisions map is used for building 1024 /// the lists of loop-uniform and loop-scalar instructions. 1025 /// The calculated cost is saved with widening decision in order to 1026 /// avoid redundant calculations. 1027 void setCostBasedWideningDecision(ElementCount VF); 1028 1029 /// A call may be vectorized in different ways depending on whether we have 1030 /// vectorized variants available and whether the target supports masking. 
1031 /// This function analyzes all calls in the function at the supplied VF, 1032 /// makes a decision based on the costs of available options, and stores that 1033 /// decision in a map for use in planning and plan execution. 1034 void setVectorizedCallDecision(ElementCount VF); 1035 1036 /// A struct that represents some properties of the register usage 1037 /// of a loop. 1038 struct RegisterUsage { 1039 /// Holds the number of loop invariant values that are used in the loop. 1040 /// The key is ClassID of target-provided register class. 1041 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1042 /// Holds the maximum number of concurrent live intervals in the loop. 1043 /// The key is ClassID of target-provided register class. 1044 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1045 }; 1046 1047 /// \return Returns information about the register usages of the loop for the 1048 /// given vectorization factors. 1049 SmallVector<RegisterUsage, 8> 1050 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1051 1052 /// Collect values we want to ignore in the cost model. 1053 void collectValuesToIgnore(); 1054 1055 /// Collect all element types in the loop for which widening is needed. 1056 void collectElementTypesForWidening(); 1057 1058 /// Split reductions into those that happen in the loop, and those that happen 1059 /// outside. In loop reductions are collected into InLoopReductions. 1060 void collectInLoopReductions(); 1061 1062 /// Returns true if we should use strict in-order reductions for the given 1063 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1064 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1065 /// of FP operations. 1066 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { 1067 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1068 } 1069 1070 /// \returns The smallest bitwidth each instruction can be represented with. 1071 /// The vector equivalents of these instructions should be truncated to this 1072 /// type. 1073 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1074 return MinBWs; 1075 } 1076 1077 /// \returns True if it is more profitable to scalarize instruction \p I for 1078 /// vectorization factor \p VF. 1079 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1080 assert(VF.isVector() && 1081 "Profitable to scalarize relevant only for VF > 1."); 1082 assert( 1083 TheLoop->isInnermost() && 1084 "cost-model should not be used for outer loops (in VPlan-native path)"); 1085 1086 auto Scalars = InstsToScalarize.find(VF); 1087 assert(Scalars != InstsToScalarize.end() && 1088 "VF not yet analyzed for scalarization profitability"); 1089 return Scalars->second.contains(I); 1090 } 1091 1092 /// Returns true if \p I is known to be uniform after vectorization. 1093 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1094 assert( 1095 TheLoop->isInnermost() && 1096 "cost-model should not be used for outer loops (in VPlan-native path)"); 1097 // Pseudo probe needs to be duplicated for each unrolled iteration and 1098 // vector lane so that profiled loop trip count can be accurately 1099 // accumulated instead of being under counted. 
1100 if (isa<PseudoProbeInst>(I)) 1101 return false; 1102 1103 if (VF.isScalar()) 1104 return true; 1105 1106 auto UniformsPerVF = Uniforms.find(VF); 1107 assert(UniformsPerVF != Uniforms.end() && 1108 "VF not yet analyzed for uniformity"); 1109 return UniformsPerVF->second.count(I); 1110 } 1111 1112 /// Returns true if \p I is known to be scalar after vectorization. 1113 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1114 assert( 1115 TheLoop->isInnermost() && 1116 "cost-model should not be used for outer loops (in VPlan-native path)"); 1117 if (VF.isScalar()) 1118 return true; 1119 1120 auto ScalarsPerVF = Scalars.find(VF); 1121 assert(ScalarsPerVF != Scalars.end() && 1122 "Scalar values are not calculated for VF"); 1123 return ScalarsPerVF->second.count(I); 1124 } 1125 1126 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1127 /// for vectorization factor \p VF. 1128 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1129 return VF.isVector() && MinBWs.contains(I) && 1130 !isProfitableToScalarize(I, VF) && 1131 !isScalarAfterVectorization(I, VF); 1132 } 1133 1134 /// Decision that was taken during cost calculation for memory instruction. 1135 enum InstWidening { 1136 CM_Unknown, 1137 CM_Widen, // For consecutive accesses with stride +1. 1138 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1139 CM_Interleave, 1140 CM_GatherScatter, 1141 CM_Scalarize, 1142 CM_VectorCall, 1143 CM_IntrinsicCall 1144 }; 1145 1146 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1147 /// instruction \p I and vector width \p VF. 1148 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1149 InstructionCost Cost) { 1150 assert(VF.isVector() && "Expected VF >=2"); 1151 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1152 } 1153 1154 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1155 /// interleaving group \p Grp and vector width \p VF. 1156 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1157 ElementCount VF, InstWidening W, 1158 InstructionCost Cost) { 1159 assert(VF.isVector() && "Expected VF >=2"); 1160 /// Broadcast this decicion to all instructions inside the group. 1161 /// When interleaving, the cost will only be assigned one instruction, the 1162 /// insert position. For other cases, add the appropriate fraction of the 1163 /// total cost to each instruction. This ensures accurate costs are used, 1164 /// even if the insert position instruction is not used. 1165 InstructionCost InsertPosCost = Cost; 1166 InstructionCost OtherMemberCost = 0; 1167 if (W != CM_Interleave) 1168 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers(); 1169 ; 1170 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) { 1171 if (auto *I = Grp->getMember(Idx)) { 1172 if (Grp->getInsertPos() == I) 1173 WideningDecisions[std::make_pair(I, VF)] = 1174 std::make_pair(W, InsertPosCost); 1175 else 1176 WideningDecisions[std::make_pair(I, VF)] = 1177 std::make_pair(W, OtherMemberCost); 1178 } 1179 } 1180 } 1181 1182 /// Return the cost model decision for the given instruction \p I and vector 1183 /// width \p VF. Return CM_Unknown if this instruction did not pass 1184 /// through the cost modeling. 
1185 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1186 assert(VF.isVector() && "Expected VF to be a vector VF"); 1187 assert( 1188 TheLoop->isInnermost() && 1189 "cost-model should not be used for outer loops (in VPlan-native path)"); 1190 1191 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1192 auto Itr = WideningDecisions.find(InstOnVF); 1193 if (Itr == WideningDecisions.end()) 1194 return CM_Unknown; 1195 return Itr->second.first; 1196 } 1197 1198 /// Return the vectorization cost for the given instruction \p I and vector 1199 /// width \p VF. 1200 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1201 assert(VF.isVector() && "Expected VF >=2"); 1202 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1203 assert(WideningDecisions.contains(InstOnVF) && 1204 "The cost is not calculated"); 1205 return WideningDecisions[InstOnVF].second; 1206 } 1207 1208 struct CallWideningDecision { 1209 InstWidening Kind; 1210 Function *Variant; 1211 Intrinsic::ID IID; 1212 std::optional<unsigned> MaskPos; 1213 InstructionCost Cost; 1214 }; 1215 1216 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, 1217 Function *Variant, Intrinsic::ID IID, 1218 std::optional<unsigned> MaskPos, 1219 InstructionCost Cost) { 1220 assert(!VF.isScalar() && "Expected vector VF"); 1221 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID, 1222 MaskPos, Cost}; 1223 } 1224 1225 CallWideningDecision getCallWideningDecision(CallInst *CI, 1226 ElementCount VF) const { 1227 assert(!VF.isScalar() && "Expected vector VF"); 1228 return CallWideningDecisions.at(std::make_pair(CI, VF)); 1229 } 1230 1231 /// Return True if instruction \p I is an optimizable truncate whose operand 1232 /// is an induction variable. Such a truncate will be removed by adding a new 1233 /// induction variable with the destination type. 1234 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1235 // If the instruction is not a truncate, return false. 1236 auto *Trunc = dyn_cast<TruncInst>(I); 1237 if (!Trunc) 1238 return false; 1239 1240 // Get the source and destination types of the truncate. 1241 Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1242 Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1243 1244 // If the truncate is free for the given types, return false. Replacing a 1245 // free truncate with an induction variable would add an induction variable 1246 // update instruction to each iteration of the loop. We exclude from this 1247 // check the primary induction variable since it will need an update 1248 // instruction regardless. 1249 Value *Op = Trunc->getOperand(0); 1250 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1251 return false; 1252 1253 // If the truncated value is not an induction variable, return false. 1254 return Legal->isInductionPhi(Op); 1255 } 1256 1257 /// Collects the instructions to scalarize for each predicated instruction in 1258 /// the loop. 1259 void collectInstsToScalarize(ElementCount VF); 1260 1261 /// Collect Uniform and Scalar values for the given \p VF. 1262 /// The sets depend on CM decision for Load/Store instructions 1263 /// that may be vectorized as interleave, gather-scatter or scalarized. 1264 /// Also make a decision on what to do about call instructions in the loop 1265 /// at that VF -- scalarize, call a known vector routine, or call a 1266 /// vector intrinsic. 
1267 void collectUniformsAndScalars(ElementCount VF) { 1268 // Do the analysis once. 1269 if (VF.isScalar() || Uniforms.contains(VF)) 1270 return; 1271 setCostBasedWideningDecision(VF); 1272 collectLoopUniforms(VF); 1273 setVectorizedCallDecision(VF); 1274 collectLoopScalars(VF); 1275 } 1276 1277 /// Returns true if the target machine supports masked store operation 1278 /// for the given \p DataType and kind of access to \p Ptr. 1279 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1280 return Legal->isConsecutivePtr(DataType, Ptr) && 1281 TTI.isLegalMaskedStore(DataType, Alignment); 1282 } 1283 1284 /// Returns true if the target machine supports masked load operation 1285 /// for the given \p DataType and kind of access to \p Ptr. 1286 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1287 return Legal->isConsecutivePtr(DataType, Ptr) && 1288 TTI.isLegalMaskedLoad(DataType, Alignment); 1289 } 1290 1291 /// Returns true if the target machine can represent \p V as a masked gather 1292 /// or scatter operation. 1293 bool isLegalGatherOrScatter(Value *V, ElementCount VF) { 1294 bool LI = isa<LoadInst>(V); 1295 bool SI = isa<StoreInst>(V); 1296 if (!LI && !SI) 1297 return false; 1298 auto *Ty = getLoadStoreType(V); 1299 Align Align = getLoadStoreAlignment(V); 1300 if (VF.isVector()) 1301 Ty = VectorType::get(Ty, VF); 1302 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1303 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1304 } 1305 1306 /// Returns true if the target machine supports all of the reduction 1307 /// variables found for the given VF. 1308 bool canVectorizeReductions(ElementCount VF) const { 1309 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1310 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1311 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1312 })); 1313 } 1314 1315 /// Given costs for both strategies, return true if the scalar predication 1316 /// lowering should be used for div/rem. This incorporates an override 1317 /// option so it is not simply a cost comparison. 1318 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1319 InstructionCost SafeDivisorCost) const { 1320 switch (ForceSafeDivisor) { 1321 case cl::BOU_UNSET: 1322 return ScalarCost < SafeDivisorCost; 1323 case cl::BOU_TRUE: 1324 return false; 1325 case cl::BOU_FALSE: 1326 return true; 1327 } 1328 llvm_unreachable("impossible case value"); 1329 } 1330 1331 /// Returns true if \p I is an instruction which requires predication and 1332 /// for which our chosen predication strategy is scalarization (i.e. we 1333 /// don't have an alternate strategy such as masking available). 1334 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1335 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1336 1337 /// Returns true if \p I is an instruction that needs to be predicated 1338 /// at runtime. The result is independent of the predication mechanism. 1339 /// Superset of instructions that return true for isScalarWithPredication. 1340 bool isPredicatedInst(Instruction *I) const; 1341 1342 /// Return the costs for our two available strategies for lowering a 1343 /// div/rem operation which requires speculating at least one lane. 1344 /// First result is for scalarization (will be invalid for scalable 1345 /// vectors); second is for the safe-divisor strategy. 
1346 std::pair<InstructionCost, InstructionCost> 1347 getDivRemSpeculationCost(Instruction *I, 1348 ElementCount VF) const; 1349 1350 /// Returns true if \p I is a memory instruction with consecutive memory 1351 /// access that can be widened. 1352 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1353 1354 /// Returns true if \p I is a memory instruction in an interleaved-group 1355 /// of memory accesses that can be vectorized with wide vector loads/stores 1356 /// and shuffles. 1357 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const; 1358 1359 /// Check if \p Instr belongs to any interleaved access group. 1360 bool isAccessInterleaved(Instruction *Instr) const { 1361 return InterleaveInfo.isInterleaved(Instr); 1362 } 1363 1364 /// Get the interleaved access group that \p Instr belongs to. 1365 const InterleaveGroup<Instruction> * 1366 getInterleavedAccessGroup(Instruction *Instr) const { 1367 return InterleaveInfo.getInterleaveGroup(Instr); 1368 } 1369 1370 /// Returns true if we're required to use a scalar epilogue for at least 1371 /// the final iteration of the original loop. 1372 bool requiresScalarEpilogue(bool IsVectorizing) const { 1373 if (!isScalarEpilogueAllowed()) { 1374 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); 1375 return false; 1376 } 1377 // If we might exit from anywhere but the latch and early exit vectorization 1378 // is disabled, we must run the exiting iteration in scalar form. 1379 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && 1380 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) { 1381 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting " 1382 "from latch block\n"); 1383 return true; 1384 } 1385 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { 1386 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " 1387 "interleaved group requires scalar epilogue\n"); 1388 return true; 1389 } 1390 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); 1391 return false; 1392 } 1393 1394 /// Returns true if we're required to use a scalar epilogue for at least 1395 /// the final iteration of the original loop for all VFs in \p Range. 1396 /// A scalar epilogue must either be required for all VFs in \p Range or for 1397 /// none. 1398 bool requiresScalarEpilogue(VFRange Range) const { 1399 auto RequiresScalarEpilogue = [this](ElementCount VF) { 1400 return requiresScalarEpilogue(VF.isVector()); 1401 }; 1402 bool IsRequired = all_of(Range, RequiresScalarEpilogue); 1403 assert( 1404 (IsRequired || none_of(Range, RequiresScalarEpilogue)) && 1405 "all VFs in range must agree on whether a scalar epilogue is required"); 1406 return IsRequired; 1407 } 1408 1409 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1410 /// loop hint annotation. 1411 bool isScalarEpilogueAllowed() const { 1412 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1413 } 1414 1415 /// Returns the TailFoldingStyle that is best for the current loop. 1416 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { 1417 if (!ChosenTailFoldingStyle) 1418 return TailFoldingStyle::None; 1419 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first 1420 : ChosenTailFoldingStyle->second; 1421 } 1422 1423 /// Selects and saves TailFoldingStyle for 2 options - if IV update may 1424 /// overflow or not. 1425 /// \param IsScalableVF true if scalable vector factors enabled. 
1426 /// \param UserIC User specific interleave count. 1427 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) { 1428 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet."); 1429 if (!Legal->canFoldTailByMasking()) { 1430 ChosenTailFoldingStyle = 1431 std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); 1432 return; 1433 } 1434 1435 if (!ForceTailFoldingStyle.getNumOccurrences()) { 1436 ChosenTailFoldingStyle = std::make_pair( 1437 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true), 1438 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)); 1439 return; 1440 } 1441 1442 // Set styles when forced. 1443 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(), 1444 ForceTailFoldingStyle.getValue()); 1445 if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) 1446 return; 1447 // Override forced styles if needed. 1448 // FIXME: use actual opcode/data type for analysis here. 1449 // FIXME: Investigate opportunity for fixed vector factor. 1450 bool EVLIsLegal = UserIC <= 1 && 1451 TTI.hasActiveVectorLength(0, nullptr, Align()) && 1452 !EnableVPlanNativePath; 1453 if (!EVLIsLegal) { 1454 // If for some reason EVL mode is unsupported, fallback to 1455 // DataWithoutLaneMask to try to vectorize the loop with folded tail 1456 // in a generic way. 1457 ChosenTailFoldingStyle = 1458 std::make_pair(TailFoldingStyle::DataWithoutLaneMask, 1459 TailFoldingStyle::DataWithoutLaneMask); 1460 LLVM_DEBUG( 1461 dbgs() 1462 << "LV: Preference for VP intrinsics indicated. Will " 1463 "not try to generate VP Intrinsics " 1464 << (UserIC > 1 1465 ? "since interleave count specified is greater than 1.\n" 1466 : "due to non-interleaving reasons.\n")); 1467 } 1468 } 1469 1470 /// Returns true if all loop blocks should be masked to fold tail loop. 1471 bool foldTailByMasking() const { 1472 // TODO: check if it is possible to check for None style independent of 1473 // IVUpdateMayOverflow flag in getTailFoldingStyle. 1474 return getTailFoldingStyle() != TailFoldingStyle::None; 1475 } 1476 1477 /// Return maximum safe number of elements to be processed per vector 1478 /// iteration, which do not prevent store-load forwarding and are safe with 1479 /// regard to the memory dependencies. Required for EVL-based VPlans to 1480 /// correctly calculate AVL (application vector length) as min(remaining AVL, 1481 /// MaxSafeElements). 1482 /// TODO: need to consider adjusting cost model to use this value as a 1483 /// vectorization factor for EVL-based vectorization. 1484 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; } 1485 1486 /// Returns true if the instructions in this block requires predication 1487 /// for any reason, e.g. because tail folding now requires a predicate 1488 /// or because the block in the original loop was predicated. 1489 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1490 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1491 } 1492 1493 /// Returns true if VP intrinsics with explicit vector length support should 1494 /// be generated in the tail folded loop. 1495 bool foldTailWithEVL() const { 1496 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL; 1497 } 1498 1499 /// Returns true if the Phi is part of an inloop reduction. 
1500 bool isInLoopReduction(PHINode *Phi) const {
1501 return InLoopReductions.contains(Phi);
1502 }
1503
1504 /// Returns true if the predicated reduction select should be used to set the
1505 /// incoming value for the reduction phi.
1506 bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1507 // Force use of the predicated reduction select, since the EVL of the
1508 // second-to-last iteration might not be VF*UF.
1509 if (foldTailWithEVL())
1510 return true;
1511 return PreferPredicatedReductionSelect ||
1512 TTI.preferPredicatedReductionSelect(
1513 Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1514 }
1515
1516 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1517 /// with factor VF. Return the cost of the instruction, including
1518 /// scalarization overhead if it's needed.
1519 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1520
1521 /// Estimate cost of a call instruction CI if it were vectorized with factor
1522 /// VF. Return the cost of the instruction, including scalarization overhead
1523 /// if it's needed.
1524 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1525
1526 /// Invalidates decisions already taken by the cost model.
1527 void invalidateCostModelingDecisions() {
1528 WideningDecisions.clear();
1529 CallWideningDecisions.clear();
1530 Uniforms.clear();
1531 Scalars.clear();
1532 }
1533
1534 /// Returns the expected execution cost. The unit of the cost does
1535 /// not matter because we use the 'cost' units to compare different
1536 /// vector widths. The cost that is returned is *not* normalized by
1537 /// the factor width.
1538 InstructionCost expectedCost(ElementCount VF);
1539
1540 bool hasPredStores() const { return NumPredStores > 0; }
1541
1542 /// Returns true if epilogue vectorization is considered profitable, and
1543 /// false otherwise.
1544 /// \p VF is the vectorization factor chosen for the original loop.
1545 /// \p IC is an additional scaling factor applied to VF before
1546 /// comparing to EpilogueVectorizationMinVF.
1547 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1548 const unsigned IC) const;
1549
1550 /// Returns the execution time cost of an instruction for a given vector
1551 /// width. Vector width of one means scalar.
1552 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1553
1554 /// Return the cost of instructions in an inloop reduction pattern, if \p I is
1555 /// part of that pattern.
1556 std::optional<InstructionCost>
1557 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1558 TTI::TargetCostKind CostKind) const;
1559
1560 /// Returns true if \p Op should be considered invariant and if it is
1561 /// trivially hoistable.
1562 bool shouldConsiderInvariant(Value *Op);
1563
1564 private:
1565 unsigned NumPredStores = 0;
1566
1567 /// \return An upper bound for the vectorization factors for both
1568 /// fixed and scalable vectorization, where the minimum-known number of
1569 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1570 /// disabled or unsupported, then the scalable part will be equal to
1571 /// ElementCount::getScalable(0).
1572 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1573 ElementCount UserVF,
1574 bool FoldTailByMasking);
1575
1576 /// \return the maximized element count based on the target's vector
1577 /// registers and the loop trip-count, but limited to a maximum safe VF.
1578 /// This is a helper function of computeFeasibleMaxVF. 1579 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, 1580 unsigned SmallestType, 1581 unsigned WidestType, 1582 ElementCount MaxSafeVF, 1583 bool FoldTailByMasking); 1584 1585 /// Checks if scalable vectorization is supported and enabled. Caches the 1586 /// result to avoid repeated debug dumps for repeated queries. 1587 bool isScalableVectorizationAllowed(); 1588 1589 /// \return the maximum legal scalable VF, based on the safe max number 1590 /// of elements. 1591 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1592 1593 /// Calculate vectorization cost of memory instruction \p I. 1594 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1595 1596 /// The cost computation for scalarized memory instruction. 1597 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1598 1599 /// The cost computation for interleaving group of memory instructions. 1600 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1601 1602 /// The cost computation for Gather/Scatter instruction. 1603 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1604 1605 /// The cost computation for widening instruction \p I with consecutive 1606 /// memory access. 1607 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1608 1609 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1610 /// Load: scalar load + broadcast. 1611 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1612 /// element) 1613 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1614 1615 /// Estimate the overhead of scalarizing an instruction. This is a 1616 /// convenience wrapper for the type-based getScalarizationOverhead API. 1617 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, 1618 TTI::TargetCostKind CostKind) const; 1619 1620 /// Returns true if an artificially high cost for emulated masked memrefs 1621 /// should be used. 1622 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1623 1624 /// Map of scalar integer values to the smallest bitwidth they can be legally 1625 /// represented as. The vector equivalents of these values should be truncated 1626 /// to this type. 1627 MapVector<Instruction *, uint64_t> MinBWs; 1628 1629 /// A type representing the costs for instructions if they were to be 1630 /// scalarized rather than vectorized. The entries are Instruction-Cost 1631 /// pairs. 1632 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1633 1634 /// A set containing all BasicBlocks that are known to present after 1635 /// vectorization as a predicated block. 1636 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> 1637 PredicatedBBsAfterVectorization; 1638 1639 /// Records whether it is allowed to have the original scalar loop execute at 1640 /// least once. This may be needed as a fallback loop in case runtime 1641 /// aliasing/dependence checks fail, or to handle the tail/remainder 1642 /// iterations when the trip count is unknown or doesn't divide by the VF, 1643 /// or as a peel-loop to handle gaps in interleave-groups. 1644 /// Under optsize and when the trip count is very small we don't allow any 1645 /// iterations to execute in the scalar loop. 1646 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1647 1648 /// Control finally chosen tail folding style. 
The first element is used if
1649 /// the IV update may overflow, and the second element if it does not.
1650 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1651 ChosenTailFoldingStyle;
1652
1653 /// True if scalable vectorization is supported and enabled.
1654 std::optional<bool> IsScalableVectorizationAllowed;
1655
1656 /// Maximum safe number of elements to be processed per vector iteration,
1657 /// which do not prevent store-load forwarding and are safe with regard to the
1658 /// memory dependencies. Required for EVL-based vectorization, where this
1659 /// value is used as the upper bound of the safe AVL.
1660 std::optional<unsigned> MaxSafeElements;
1661
1662 /// A map holding scalar costs for different vectorization factors. The
1663 /// presence of a cost for an instruction in the mapping indicates that the
1664 /// instruction will be scalarized when vectorizing with the associated
1665 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1666 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1667
1668 /// Holds the instructions known to be uniform after vectorization.
1669 /// The data is collected per VF.
1670 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1671
1672 /// Holds the instructions known to be scalar after vectorization.
1673 /// The data is collected per VF.
1674 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1675
1676 /// Holds the instructions (address computations) that are forced to be
1677 /// scalarized.
1678 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1679
1680 /// PHINodes of the reductions that should be expanded in-loop.
1681 SmallPtrSet<PHINode *, 4> InLoopReductions;
1682
1683 /// A map of inloop reduction operations and their immediate chain operand.
1684 /// FIXME: This can be removed once reductions can be costed correctly in
1685 /// VPlan. This was added to allow quick lookup of the inloop operations.
1686 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1687
1688 /// Returns the expected difference in cost from scalarizing the expression
1689 /// feeding a predicated instruction \p PredInst. The instructions to
1690 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1691 /// non-negative return value implies the expression will be scalarized.
1692 /// Currently, only single-use chains are considered for scalarization.
1693 InstructionCost computePredInstDiscount(Instruction *PredInst,
1694 ScalarCostsTy &ScalarCosts,
1695 ElementCount VF);
1696
1697 /// Collect the instructions that are uniform after vectorization. An
1698 /// instruction is uniform if we represent it with a single scalar value in
1699 /// the vectorized loop corresponding to each vector iteration. Examples of
1700 /// uniform instructions include pointer operands of consecutive or
1701 /// interleaved memory accesses. Note that although uniformity implies an
1702 /// instruction will be scalar, the reverse is not true. In general, a
1703 /// scalarized instruction will be represented by VF scalar values in the
1704 /// vectorized loop, each corresponding to an iteration of the original
1705 /// scalar loop.
1706 void collectLoopUniforms(ElementCount VF);
1707
1708 /// Collect the instructions that are scalar after vectorization. An
1709 /// instruction is scalar if it is known to be uniform or will be scalarized
1710 /// during vectorization.
collectLoopScalars should only add non-uniform nodes 1711 /// to the list if they are used by a load/store instruction that is marked as 1712 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1713 /// VF values in the vectorized loop, each corresponding to an iteration of 1714 /// the original scalar loop. 1715 void collectLoopScalars(ElementCount VF); 1716 1717 /// Keeps cost model vectorization decision and cost for instructions. 1718 /// Right now it is used for memory instructions only. 1719 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1720 std::pair<InstWidening, InstructionCost>>; 1721 1722 DecisionList WideningDecisions; 1723 1724 using CallDecisionList = 1725 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; 1726 1727 CallDecisionList CallWideningDecisions; 1728 1729 /// Returns true if \p V is expected to be vectorized and it needs to be 1730 /// extracted. 1731 bool needsExtract(Value *V, ElementCount VF) const { 1732 Instruction *I = dyn_cast<Instruction>(V); 1733 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1734 TheLoop->isLoopInvariant(I) || 1735 getWideningDecision(I, VF) == CM_Scalarize) 1736 return false; 1737 1738 // Assume we can vectorize V (and hence we need extraction) if the 1739 // scalars are not computed yet. This can happen, because it is called 1740 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1741 // the scalars are collected. That should be a safe assumption in most 1742 // cases, because we check if the operands have vectorizable types 1743 // beforehand in LoopVectorizationLegality. 1744 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); 1745 }; 1746 1747 /// Returns a range containing only operands needing to be extracted. 1748 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1749 ElementCount VF) const { 1750 return SmallVector<Value *, 4>(make_filter_range( 1751 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1752 } 1753 1754 public: 1755 /// The loop that we evaluate. 1756 Loop *TheLoop; 1757 1758 /// Predicated scalar evolution analysis. 1759 PredicatedScalarEvolution &PSE; 1760 1761 /// Loop Info analysis. 1762 LoopInfo *LI; 1763 1764 /// Vectorization legality. 1765 LoopVectorizationLegality *Legal; 1766 1767 /// Vector target information. 1768 const TargetTransformInfo &TTI; 1769 1770 /// Target Library Info. 1771 const TargetLibraryInfo *TLI; 1772 1773 /// Demanded bits analysis. 1774 DemandedBits *DB; 1775 1776 /// Assumption cache. 1777 AssumptionCache *AC; 1778 1779 /// Interface to emit optimization remarks. 1780 OptimizationRemarkEmitter *ORE; 1781 1782 const Function *TheFunction; 1783 1784 /// Loop Vectorize Hint. 1785 const LoopVectorizeHints *Hints; 1786 1787 /// The interleave access information contains groups of interleaved accesses 1788 /// with the same stride and close to each other. 1789 InterleavedAccessInfo &InterleaveInfo; 1790 1791 /// Values to ignore in the cost model. 1792 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1793 1794 /// Values to ignore in the cost model when VF > 1. 1795 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1796 1797 /// All element types found in the loop. 1798 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1799 }; 1800 } // end namespace llvm 1801 1802 namespace { 1803 /// Helper struct to manage generating runtime checks for vectorization. 
1804 /// 1805 /// The runtime checks are created up-front in temporary blocks to allow better 1806 /// estimating the cost and un-linked from the existing IR. After deciding to 1807 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1808 /// temporary blocks are completely removed. 1809 class GeneratedRTChecks { 1810 /// Basic block which contains the generated SCEV checks, if any. 1811 BasicBlock *SCEVCheckBlock = nullptr; 1812 1813 /// The value representing the result of the generated SCEV checks. If it is 1814 /// nullptr, either no SCEV checks have been generated or they have been used. 1815 Value *SCEVCheckCond = nullptr; 1816 1817 /// Basic block which contains the generated memory runtime checks, if any. 1818 BasicBlock *MemCheckBlock = nullptr; 1819 1820 /// The value representing the result of the generated memory runtime checks. 1821 /// If it is nullptr, either no memory runtime checks have been generated or 1822 /// they have been used. 1823 Value *MemRuntimeCheckCond = nullptr; 1824 1825 DominatorTree *DT; 1826 LoopInfo *LI; 1827 TargetTransformInfo *TTI; 1828 1829 SCEVExpander SCEVExp; 1830 SCEVExpander MemCheckExp; 1831 1832 bool CostTooHigh = false; 1833 const bool AddBranchWeights; 1834 1835 Loop *OuterLoop = nullptr; 1836 1837 PredicatedScalarEvolution &PSE; 1838 1839 public: 1840 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, 1841 LoopInfo *LI, TargetTransformInfo *TTI, 1842 const DataLayout &DL, bool AddBranchWeights) 1843 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), 1844 MemCheckExp(*PSE.getSE(), DL, "scev.check"), 1845 AddBranchWeights(AddBranchWeights), PSE(PSE) {} 1846 1847 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1848 /// accurately estimate the cost of the runtime checks. The blocks are 1849 /// un-linked from the IR and are added back during vector code generation. If 1850 /// there is no vector code generation, the check blocks are removed 1851 /// completely. 1852 void create(Loop *L, const LoopAccessInfo &LAI, 1853 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1854 1855 // Hard cutoff to limit compile-time increase in case a very large number of 1856 // runtime checks needs to be generated. 1857 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1858 // profile info. 1859 CostTooHigh = 1860 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1861 if (CostTooHigh) 1862 return; 1863 1864 BasicBlock *LoopHeader = L->getHeader(); 1865 BasicBlock *Preheader = L->getLoopPreheader(); 1866 1867 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1868 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1869 // may be used by SCEVExpander. The blocks will be un-linked from their 1870 // predecessors and removed from LI & DT at the end of the function. 1871 if (!UnionPred.isAlwaysTrue()) { 1872 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1873 nullptr, "vector.scevcheck"); 1874 1875 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1876 &UnionPred, SCEVCheckBlock->getTerminator()); 1877 } 1878 1879 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1880 if (RtPtrChecking.Need) { 1881 auto *Pred = SCEVCheckBlock ? 
SCEVCheckBlock : Preheader; 1882 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1883 "vector.memcheck"); 1884 1885 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1886 if (DiffChecks) { 1887 Value *RuntimeVF = nullptr; 1888 MemRuntimeCheckCond = addDiffRuntimeChecks( 1889 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 1890 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 1891 if (!RuntimeVF) 1892 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 1893 return RuntimeVF; 1894 }, 1895 IC); 1896 } else { 1897 MemRuntimeCheckCond = addRuntimeChecks( 1898 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), 1899 MemCheckExp, VectorizerParams::HoistRuntimeChecks); 1900 } 1901 assert(MemRuntimeCheckCond && 1902 "no RT checks generated although RtPtrChecking " 1903 "claimed checks are required"); 1904 } 1905 1906 if (!MemCheckBlock && !SCEVCheckBlock) 1907 return; 1908 1909 // Unhook the temporary block with the checks, update various places 1910 // accordingly. 1911 if (SCEVCheckBlock) 1912 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1913 if (MemCheckBlock) 1914 MemCheckBlock->replaceAllUsesWith(Preheader); 1915 1916 if (SCEVCheckBlock) { 1917 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1918 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1919 Preheader->getTerminator()->eraseFromParent(); 1920 } 1921 if (MemCheckBlock) { 1922 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1923 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1924 Preheader->getTerminator()->eraseFromParent(); 1925 } 1926 1927 DT->changeImmediateDominator(LoopHeader, Preheader); 1928 if (MemCheckBlock) { 1929 DT->eraseNode(MemCheckBlock); 1930 LI->removeBlock(MemCheckBlock); 1931 } 1932 if (SCEVCheckBlock) { 1933 DT->eraseNode(SCEVCheckBlock); 1934 LI->removeBlock(SCEVCheckBlock); 1935 } 1936 1937 // Outer loop is used as part of the later cost calculations. 1938 OuterLoop = L->getParentLoop(); 1939 } 1940 1941 InstructionCost getCost() { 1942 if (SCEVCheckBlock || MemCheckBlock) 1943 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 1944 1945 if (CostTooHigh) { 1946 InstructionCost Cost; 1947 Cost.setInvalid(); 1948 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 1949 return Cost; 1950 } 1951 1952 InstructionCost RTCheckCost = 0; 1953 if (SCEVCheckBlock) 1954 for (Instruction &I : *SCEVCheckBlock) { 1955 if (SCEVCheckBlock->getTerminator() == &I) 1956 continue; 1957 InstructionCost C = 1958 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1959 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1960 RTCheckCost += C; 1961 } 1962 if (MemCheckBlock) { 1963 InstructionCost MemCheckCost = 0; 1964 for (Instruction &I : *MemCheckBlock) { 1965 if (MemCheckBlock->getTerminator() == &I) 1966 continue; 1967 InstructionCost C = 1968 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1969 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1970 MemCheckCost += C; 1971 } 1972 1973 // If the runtime memory checks are being created inside an outer loop 1974 // we should find out if these checks are outer loop invariant. If so, 1975 // the checks will likely be hoisted out and so the effective cost will 1976 // reduce according to the outer loop trip count. 
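// A small illustration of the adjustment below (values are made up): when the
// check condition is invariant in the outer loop, a MemCheckCost of 8 with a
// best known outer-loop trip count of 4 is charged as 8 / 4 = 2, and the
// result is clamped so the charged cost is never less than 1.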
1977 if (OuterLoop) { 1978 ScalarEvolution *SE = MemCheckExp.getSE(); 1979 // TODO: If profitable, we could refine this further by analysing every 1980 // individual memory check, since there could be a mixture of loop 1981 // variant and invariant checks that mean the final condition is 1982 // variant. 1983 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond); 1984 if (SE->isLoopInvariant(Cond, OuterLoop)) { 1985 // It seems reasonable to assume that we can reduce the effective 1986 // cost of the checks even when we know nothing about the trip 1987 // count. Assume that the outer loop executes at least twice. 1988 unsigned BestTripCount = 2; 1989 1990 // Get the best known TC estimate. 1991 if (auto EstimatedTC = getSmallBestKnownTC( 1992 PSE, OuterLoop, /* CanUseConstantMax = */ false)) 1993 BestTripCount = *EstimatedTC; 1994 1995 BestTripCount = std::max(BestTripCount, 1U); 1996 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; 1997 1998 // Let's ensure the cost is always at least 1. 1999 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), 2000 (InstructionCost::CostType)1); 2001 2002 if (BestTripCount > 1) 2003 LLVM_DEBUG(dbgs() 2004 << "We expect runtime memory checks to be hoisted " 2005 << "out of the outer loop. Cost reduced from " 2006 << MemCheckCost << " to " << NewMemCheckCost << '\n'); 2007 2008 MemCheckCost = NewMemCheckCost; 2009 } 2010 } 2011 2012 RTCheckCost += MemCheckCost; 2013 } 2014 2015 if (SCEVCheckBlock || MemCheckBlock) 2016 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2017 << "\n"); 2018 2019 return RTCheckCost; 2020 } 2021 2022 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2023 /// unused. 2024 ~GeneratedRTChecks() { 2025 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2026 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2027 if (!SCEVCheckCond) 2028 SCEVCleaner.markResultUsed(); 2029 2030 if (!MemRuntimeCheckCond) 2031 MemCheckCleaner.markResultUsed(); 2032 2033 if (MemRuntimeCheckCond) { 2034 auto &SE = *MemCheckExp.getSE(); 2035 // Memory runtime check generation creates compares that use expanded 2036 // values. Remove them before running the SCEVExpanderCleaners. 2037 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2038 if (MemCheckExp.isInsertedInstruction(&I)) 2039 continue; 2040 SE.forgetValue(&I); 2041 I.eraseFromParent(); 2042 } 2043 } 2044 MemCheckCleaner.cleanup(); 2045 SCEVCleaner.cleanup(); 2046 2047 if (SCEVCheckCond) 2048 SCEVCheckBlock->eraseFromParent(); 2049 if (MemRuntimeCheckCond) 2050 MemCheckBlock->eraseFromParent(); 2051 } 2052 2053 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2054 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2055 /// depending on the generated condition. 2056 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2057 BasicBlock *LoopVectorPreHeader) { 2058 if (!SCEVCheckCond) 2059 return nullptr; 2060 2061 Value *Cond = SCEVCheckCond; 2062 // Mark the check as used, to prevent it from being removed during cleanup. 2063 SCEVCheckCond = nullptr; 2064 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2065 if (C->isZero()) 2066 return nullptr; 2067 2068 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2069 2070 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2071 // Create new preheader for vector loop. 
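// Sketch of the intended rewiring (assuming LoopVectorPreHeader has the single
// predecessor Pred queried above):
//   before:  Pred -> LoopVectorPreHeader
//   after:   Pred -> SCEVCheckBlock -> LoopVectorPreHeader   (checks pass)
//                                 \--> Bypass                 (checks fail)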
2072 if (OuterLoop) 2073 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2074 2075 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2076 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2077 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2078 SCEVCheckBlock); 2079 2080 DT->addNewBlock(SCEVCheckBlock, Pred); 2081 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2082 2083 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); 2084 if (AddBranchWeights) 2085 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false); 2086 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); 2087 return SCEVCheckBlock; 2088 } 2089 2090 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2091 /// the branches to branch to the vector preheader or \p Bypass, depending on 2092 /// the generated condition. 2093 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2094 BasicBlock *LoopVectorPreHeader) { 2095 // Check if we generated code that checks in runtime if arrays overlap. 2096 if (!MemRuntimeCheckCond) 2097 return nullptr; 2098 2099 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2100 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2101 MemCheckBlock); 2102 2103 DT->addNewBlock(MemCheckBlock, Pred); 2104 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2105 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2106 2107 if (OuterLoop) 2108 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); 2109 2110 BranchInst &BI = 2111 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); 2112 if (AddBranchWeights) { 2113 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false); 2114 } 2115 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); 2116 MemCheckBlock->getTerminator()->setDebugLoc( 2117 Pred->getTerminator()->getDebugLoc()); 2118 2119 // Mark the check as used, to prevent it from being removed during cleanup. 2120 MemRuntimeCheckCond = nullptr; 2121 return MemCheckBlock; 2122 } 2123 }; 2124 } // namespace 2125 2126 static bool useActiveLaneMask(TailFoldingStyle Style) { 2127 return Style == TailFoldingStyle::Data || 2128 Style == TailFoldingStyle::DataAndControlFlow || 2129 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2130 } 2131 2132 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2133 return Style == TailFoldingStyle::DataAndControlFlow || 2134 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2135 } 2136 2137 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2138 // vectorization. The loop needs to be annotated with #pragma omp simd 2139 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2140 // vector length information is not provided, vectorization is not considered 2141 // explicit. Interleave hints are not allowed either. These limitations will be 2142 // relaxed in the future. 2143 // Please, note that we are currently forced to abuse the pragma 'clang 2144 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2145 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2146 // provides *explicit vectorization hints* (LV can bypass legal checks and 2147 // assume that vectorization is legal). However, both hints are implemented 2148 // using the same metadata (llvm.loop.vectorize, processed by 2149 // LoopVectorizeHints). 
This will be fixed in the future when the native IR 2150 // representation for pragma 'omp simd' is introduced. 2151 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2152 OptimizationRemarkEmitter *ORE) { 2153 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2154 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2155 2156 // Only outer loops with an explicit vectorization hint are supported. 2157 // Unannotated outer loops are ignored. 2158 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2159 return false; 2160 2161 Function *Fn = OuterLp->getHeader()->getParent(); 2162 if (!Hints.allowVectorization(Fn, OuterLp, 2163 true /*VectorizeOnlyWhenForced*/)) { 2164 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2165 return false; 2166 } 2167 2168 if (Hints.getInterleave() > 1) { 2169 // TODO: Interleave support is future work. 2170 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2171 "outer loops.\n"); 2172 Hints.emitRemarkWithHints(); 2173 return false; 2174 } 2175 2176 return true; 2177 } 2178 2179 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2180 OptimizationRemarkEmitter *ORE, 2181 SmallVectorImpl<Loop *> &V) { 2182 // Collect inner loops and outer loops without irreducible control flow. For 2183 // now, only collect outer loops that have explicit vectorization hints. If we 2184 // are stress testing the VPlan H-CFG construction, we collect the outermost 2185 // loop of every loop nest. 2186 if (L.isInnermost() || VPlanBuildStressTest || 2187 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2188 LoopBlocksRPO RPOT(&L); 2189 RPOT.perform(LI); 2190 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2191 V.push_back(&L); 2192 // TODO: Collect inner loops inside marked outer loops in case 2193 // vectorization fails for the outer loop. Do not invoke 2194 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2195 // already known to be reducible. We can use an inherited attribute for 2196 // that. 2197 return; 2198 } 2199 } 2200 for (Loop *InnerL : L) 2201 collectSupportedLoops(*InnerL, LI, ORE, V); 2202 } 2203 2204 //===----------------------------------------------------------------------===// 2205 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2206 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2207 //===----------------------------------------------------------------------===// 2208 2209 /// Compute the transformed value of Index at offset StartValue using step 2210 /// StepValue. 2211 /// For integer induction, returns StartValue + Index * StepValue. 2212 /// For pointer induction, returns StartValue[Index * StepValue]. 2213 /// FIXME: The newly created binary instructions should contain nsw/nuw 2214 /// flags, which can be found from the original scalar operations. 2215 static Value * 2216 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, 2217 Value *Step, 2218 InductionDescriptor::InductionKind InductionKind, 2219 const BinaryOperator *InductionBinOp) { 2220 Type *StepTy = Step->getType(); 2221 Value *CastedIndex = StepTy->isIntegerTy() 2222 ? B.CreateSExtOrTrunc(Index, StepTy) 2223 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2224 if (CastedIndex != Index) { 2225 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2226 Index = CastedIndex; 2227 } 2228 2229 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2230 // SCEV and then expand it, hoping that SCEV's simplification will give us 2231 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2232 // lead to various SCEV crashes. So all we can do is to use builder and rely 2233 // on InstCombine for future simplifications. Here we handle some trivial 2234 // cases only. 2235 auto CreateAdd = [&B](Value *X, Value *Y) { 2236 assert(X->getType() == Y->getType() && "Types don't match!"); 2237 if (auto *CX = dyn_cast<ConstantInt>(X)) 2238 if (CX->isZero()) 2239 return Y; 2240 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2241 if (CY->isZero()) 2242 return X; 2243 return B.CreateAdd(X, Y); 2244 }; 2245 2246 // We allow X to be a vector type, in which case Y will potentially be 2247 // splatted into a vector with the same element count. 2248 auto CreateMul = [&B](Value *X, Value *Y) { 2249 assert(X->getType()->getScalarType() == Y->getType() && 2250 "Types don't match!"); 2251 if (auto *CX = dyn_cast<ConstantInt>(X)) 2252 if (CX->isOne()) 2253 return Y; 2254 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2255 if (CY->isOne()) 2256 return X; 2257 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2258 if (XVTy && !isa<VectorType>(Y->getType())) 2259 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2260 return B.CreateMul(X, Y); 2261 }; 2262 2263 switch (InductionKind) { 2264 case InductionDescriptor::IK_IntInduction: { 2265 assert(!isa<VectorType>(Index->getType()) && 2266 "Vector indices not supported for integer inductions yet"); 2267 assert(Index->getType() == StartValue->getType() && 2268 "Index type does not match StartValue type"); 2269 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2270 return B.CreateSub(StartValue, Index); 2271 auto *Offset = CreateMul(Index, Step); 2272 return CreateAdd(StartValue, Offset); 2273 } 2274 case InductionDescriptor::IK_PtrInduction: 2275 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step)); 2276 case InductionDescriptor::IK_FpInduction: { 2277 assert(!isa<VectorType>(Index->getType()) && 2278 "Vector indices not supported for FP inductions yet"); 2279 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2280 assert(InductionBinOp && 2281 (InductionBinOp->getOpcode() == Instruction::FAdd || 2282 InductionBinOp->getOpcode() == Instruction::FSub) && 2283 "Original bin op should be defined for FP induction"); 2284 2285 Value *MulExp = B.CreateFMul(Step, Index); 2286 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2287 "induction"); 2288 } 2289 case InductionDescriptor::IK_NoInduction: 2290 return nullptr; 2291 } 2292 llvm_unreachable("invalid enum"); 2293 } 2294 2295 std::optional<unsigned> getMaxVScale(const Function &F, 2296 const TargetTransformInfo &TTI) { 2297 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) 2298 return MaxVScale; 2299 2300 if (F.hasFnAttribute(Attribute::VScaleRange)) 2301 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2302 2303 return std::nullopt; 2304 } 2305 2306 /// For the given VF and UF and maximum trip count computed for the loop, return 2307 /// whether the induction variable might overflow in the vectorized loop. If not, 2308 /// then we know a runtime overflow check always evaluates to false and can be 2309 /// removed. 
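/// For example (illustrative values): with an i32 induction variable, a known
/// maximum trip count of 1000, a fixed VF of 4 and UF of 2, the sum
/// 1000 + 4 * 2 is far below the point where an i32 IV would wrap, so the
/// overflow check is known false. For scalable VFs the known maximum vscale is
/// factored in as well.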
2310 static bool isIndvarOverflowCheckKnownFalse( 2311 const LoopVectorizationCostModel *Cost, 2312 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2313 // Always be conservative if we don't know the exact unroll factor. 2314 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); 2315 2316 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2317 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2318 2319 // We know the runtime overflow check is known false iff the (max) trip-count 2320 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2321 // the vector loop induction variable. 2322 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) { 2323 uint64_t MaxVF = VF.getKnownMinValue(); 2324 if (VF.isScalable()) { 2325 std::optional<unsigned> MaxVScale = 2326 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2327 if (!MaxVScale) 2328 return false; 2329 MaxVF *= *MaxVScale; 2330 } 2331 2332 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2333 } 2334 2335 return false; 2336 } 2337 2338 // Return whether we allow using masked interleave-groups (for dealing with 2339 // strided loads/stores that reside in predicated blocks, or for dealing 2340 // with gaps). 2341 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2342 // If an override option has been passed in for interleaved accesses, use it. 2343 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2344 return EnableMaskedInterleavedMemAccesses; 2345 2346 return TTI.enableMaskedInterleavedAccessVectorization(); 2347 } 2348 2349 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2350 VPReplicateRecipe *RepRecipe, 2351 const VPLane &Lane, 2352 VPTransformState &State) { 2353 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2354 2355 // Does this instruction return a value ? 2356 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2357 2358 Instruction *Cloned = Instr->clone(); 2359 if (!IsVoidRetTy) { 2360 Cloned->setName(Instr->getName() + ".cloned"); 2361 #if !defined(NDEBUG) 2362 // Verify that VPlan type inference results agree with the type of the 2363 // generated values. 2364 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && 2365 "inferred type and type from generated instructions do not match"); 2366 #endif 2367 } 2368 2369 RepRecipe->setFlags(Cloned); 2370 2371 if (auto DL = Instr->getDebugLoc()) 2372 State.setDebugLocFrom(DL); 2373 2374 // Replace the operands of the cloned instructions with their scalar 2375 // equivalents in the new loop. 2376 for (const auto &I : enumerate(RepRecipe->operands())) { 2377 auto InputLane = Lane; 2378 VPValue *Operand = I.value(); 2379 if (vputils::isUniformAfterVectorization(Operand)) 2380 InputLane = VPLane::getFirstLane(); 2381 Cloned->setOperand(I.index(), State.get(Operand, InputLane)); 2382 } 2383 State.addNewMetadata(Cloned, Instr); 2384 2385 // Place the cloned scalar in the new loop. 2386 State.Builder.Insert(Cloned); 2387 2388 State.set(RepRecipe, Cloned, Lane); 2389 2390 // If we just cloned a new assumption, add it the assumption cache. 2391 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2392 AC->registerAssumption(II); 2393 2394 // End if-block. 2395 VPRegionBlock *Parent = RepRecipe->getParent()->getParent(); 2396 bool IfPredicateInstr = Parent ? 
Parent->isReplicator() : false; 2397 assert( 2398 (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || 2399 all_of(RepRecipe->operands(), 2400 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && 2401 "Expected a recipe is either within a region or all of its operands " 2402 "are defined outside the vectorized region."); 2403 if (IfPredicateInstr) 2404 PredicatedInstructions.push_back(Cloned); 2405 } 2406 2407 Value * 2408 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2409 if (VectorTripCount) 2410 return VectorTripCount; 2411 2412 Value *TC = getTripCount(); 2413 IRBuilder<> Builder(InsertBlock->getTerminator()); 2414 2415 Type *Ty = TC->getType(); 2416 // This is where we can make the step a runtime constant. 2417 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2418 2419 // If the tail is to be folded by masking, round the number of iterations N 2420 // up to a multiple of Step instead of rounding down. This is done by first 2421 // adding Step-1 and then rounding down. Note that it's ok if this addition 2422 // overflows: the vector induction variable will eventually wrap to zero given 2423 // that it starts at zero and its Step is a power of two; the loop will then 2424 // exit, with the last early-exit vector comparison also producing all-true. 2425 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2426 // is accounted for in emitIterationCountCheck that adds an overflow check. 2427 if (Cost->foldTailByMasking()) { 2428 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2429 "VF*UF must be a power of 2 when folding tail by masking"); 2430 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)), 2431 "n.rnd.up"); 2432 } 2433 2434 // Now we need to generate the expression for the part of the loop that the 2435 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2436 // iterations are not required for correctness, or N - Step, otherwise. Step 2437 // is equal to the vectorization factor (number of SIMD elements) times the 2438 // unroll factor (number of SIMD instructions). 2439 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2440 2441 // There are cases where we *must* run at least one iteration in the remainder 2442 // loop. See the cost model for when this can happen. If the step evenly 2443 // divides the trip count, we set the remainder to be equal to the step. If 2444 // the step does not evenly divide the trip count, no adjustment is necessary 2445 // since there will already be scalar iterations. Note that the minimum 2446 // iterations check ensures that N >= Step. 
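// A worked example of the computation below (illustrative values), with VF = 4
// and UF = 2 so Step = 8:
//   N = 17 -> R = 17 % 8 = 1 and the vector trip count is 17 - 1 = 16;
//   N = 16 with a required scalar epilogue -> R would be 0, so R is set to
//   Step and the vector trip count becomes 16 - 8 = 8, leaving 8 iterations
//   for the scalar remainder loop.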
2447 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2448 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2449 R = Builder.CreateSelect(IsZero, Step, R); 2450 } 2451 2452 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2453 2454 return VectorTripCount; 2455 } 2456 2457 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { 2458 VPBlockBase *ScalarPH = Plan.getScalarPreheader(); 2459 VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor(); 2460 if (PreVectorPH->getNumSuccessors() != 1) { 2461 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); 2462 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && 2463 "Unexpected successor"); 2464 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB); 2465 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB); 2466 PreVectorPH = CheckVPIRBB; 2467 } 2468 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); 2469 PreVectorPH->swapSuccessors(); 2470 } 2471 2472 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2473 Value *Count = getTripCount(); 2474 // Reuse existing vector loop preheader for TC checks. 2475 // Note that new preheader block is generated for vector loop. 2476 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2477 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2478 2479 // Generate code to check if the loop's trip count is less than VF * UF, or 2480 // equal to it in case a scalar epilogue is required; this implies that the 2481 // vector trip count is zero. This check also covers the case where adding one 2482 // to the backedge-taken count overflowed leading to an incorrect trip count 2483 // of zero. In this case we will also jump to the scalar loop. 2484 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2485 : ICmpInst::ICMP_ULT; 2486 2487 // If tail is to be folded, vector loop takes care of all iterations. 2488 Type *CountTy = Count->getType(); 2489 Value *CheckMinIters = Builder.getFalse(); 2490 auto CreateStep = [&]() -> Value * { 2491 // Create step with max(MinProTripCount, UF * VF). 2492 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2493 return createStepForVF(Builder, CountTy, VF, UF); 2494 2495 Value *MinProfTC = 2496 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2497 if (!VF.isScalable()) 2498 return MinProfTC; 2499 return Builder.CreateBinaryIntrinsic( 2500 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2501 }; 2502 2503 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2504 if (Style == TailFoldingStyle::None) { 2505 Value *Step = CreateStep(); 2506 ScalarEvolution &SE = *PSE.getSE(); 2507 // TODO: Emit unconditional branch to vector preheader instead of 2508 // conditional branch with known condition. 2509 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop); 2510 // Check if the trip count is < the step. 2511 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) { 2512 // TODO: Ensure step is at most the trip count when determining max VF and 2513 // UF, w/o tail folding. 2514 CheckMinIters = Builder.getTrue(); 2515 } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P), 2516 TripCountSCEV, SE.getSCEV(Step))) { 2517 // Generate the minimum iteration check only if we cannot prove the 2518 // check is known to be true, or known to be false. 
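// For instance (illustrative, assuming the minimum profitable trip count does
// not exceed VF * UF): with VF = 4, UF = 2 and an unknown trip count %n this
// emits
//   %min.iters.check = icmp ult i64 %n, 8
// with ule instead of ult when a scalar epilogue is required.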
2519 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2520 } // else step known to be < trip count, use CheckMinIters preset to false. 2521 } else if (VF.isScalable() && 2522 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2523 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2524 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2525 // an overflow to zero when updating induction variables and so an 2526 // additional overflow check is required before entering the vector loop. 2527 2528 // Get the maximum unsigned value for the type. 2529 Value *MaxUIntTripCount = 2530 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2531 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2532 2533 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2534 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2535 } 2536 2537 // Create new preheader for vector loop. 2538 LoopVectorPreHeader = 2539 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2540 "vector.ph"); 2541 2542 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2543 DT->getNode(Bypass)->getIDom()) && 2544 "TC check is expected to dominate Bypass"); 2545 2546 BranchInst &BI = 2547 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 2548 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 2549 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); 2550 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 2551 LoopBypassBlocks.push_back(TCCheckBlock); 2552 2553 // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. 2554 introduceCheckBlockInVPlan(TCCheckBlock); 2555 } 2556 2557 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2558 BasicBlock *const SCEVCheckBlock = 2559 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader); 2560 if (!SCEVCheckBlock) 2561 return nullptr; 2562 2563 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2564 (OptForSizeBasedOnProfile && 2565 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2566 "Cannot SCEV check stride or overflow when optimizing for size"); 2567 assert(!LoopBypassBlocks.empty() && 2568 "Should already be a bypass block due to iteration count check"); 2569 LoopBypassBlocks.push_back(SCEVCheckBlock); 2570 AddedSafetyChecks = true; 2571 2572 introduceCheckBlockInVPlan(SCEVCheckBlock); 2573 return SCEVCheckBlock; 2574 } 2575 2576 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2577 // VPlan-native path does not do any analysis for runtime checks currently. 2578 if (EnableVPlanNativePath) 2579 return nullptr; 2580 2581 BasicBlock *const MemCheckBlock = 2582 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2583 2584 // Check if we generated code that checks in runtime if arrays overlap. We put 2585 // the checks into a separate block to make the more common case of few 2586 // elements faster. 
2587 if (!MemCheckBlock) 2588 return nullptr; 2589 2590 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2591 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2592 "Cannot emit memory checks when optimizing for size, unless forced " 2593 "to vectorize."); 2594 ORE->emit([&]() { 2595 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2596 OrigLoop->getStartLoc(), 2597 OrigLoop->getHeader()) 2598 << "Code-size may be reduced by not forcing " 2599 "vectorization, or by source-code modifications " 2600 "eliminating the need for runtime checks " 2601 "(e.g., adding 'restrict')."; 2602 }); 2603 } 2604 2605 LoopBypassBlocks.push_back(MemCheckBlock); 2606 2607 AddedSafetyChecks = true; 2608 2609 introduceCheckBlockInVPlan(MemCheckBlock); 2610 return MemCheckBlock; 2611 } 2612 2613 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p 2614 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must 2615 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All 2616 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. 2617 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { 2618 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); 2619 for (auto &R : make_early_inc_range(*VPBB)) { 2620 assert(!R.isPhi() && "Tried to move phi recipe to end of block"); 2621 R.moveBefore(*IRVPBB, IRVPBB->end()); 2622 } 2623 2624 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); 2625 // VPBB is now dead and will be cleaned up when the plan gets destroyed. 2626 } 2627 2628 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 2629 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2630 assert(LoopVectorPreHeader && "Invalid loop structure"); 2631 assert((OrigLoop->getUniqueLatchExitBlock() || 2632 Cost->requiresScalarEpilogue(VF.isVector())) && 2633 "loops not exiting via the latch without required epilogue?"); 2634 2635 LoopMiddleBlock = 2636 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2637 LI, nullptr, Twine(Prefix) + "middle.block"); 2638 replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock); 2639 LoopScalarPreHeader = 2640 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2641 nullptr, Twine(Prefix) + "scalar.ph"); 2642 replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); 2643 } 2644 2645 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 2646 /// expansion results. 2647 static Value *getExpandedStep(const InductionDescriptor &ID, 2648 const SCEV2ValueTy &ExpandedSCEVs) { 2649 const SCEV *Step = ID.getStep(); 2650 if (auto *C = dyn_cast<SCEVConstant>(Step)) 2651 return C->getValue(); 2652 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 2653 return U->getValue(); 2654 auto I = ExpandedSCEVs.find(Step); 2655 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 2656 return I->second; 2657 } 2658 2659 /// Knowing that loop \p L executes a single vector iteration, add instructions 2660 /// that will get simplified and thus should not have any cost to \p 2661 /// InstsToIgnore. 
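/// For example, the latch compare and an induction-variable increment whose
/// only users are the induction phi and that compare fold away once the loop
/// is known to execute exactly one vector iteration, so they should not be
/// charged by the cost model.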
2662 static void addFullyUnrolledInstructionsToIgnore( 2663 Loop *L, const LoopVectorizationLegality::InductionList &IL, 2664 SmallPtrSetImpl<Instruction *> &InstsToIgnore) { 2665 auto *Cmp = L->getLatchCmpInst(); 2666 if (Cmp) 2667 InstsToIgnore.insert(Cmp); 2668 for (const auto &KV : IL) { 2669 // Extract the key by hand so that it can be used in the lambda below. Note 2670 // that captured structured bindings are a C++20 extension. 2671 const PHINode *IV = KV.first; 2672 2673 // Get next iteration value of the induction variable. 2674 Instruction *IVInst = 2675 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch())); 2676 if (all_of(IVInst->users(), 2677 [&](const User *U) { return U == IV || U == Cmp; })) 2678 InstsToIgnore.insert(IVInst); 2679 } 2680 } 2681 2682 void InnerLoopVectorizer::createInductionAdditionalBypassValues( 2683 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) { 2684 assert(MainVectorTripCount && "Must have bypass information"); 2685 2686 Instruction *OldInduction = Legal->getPrimaryInduction(); 2687 IRBuilder<> BypassBuilder(getAdditionalBypassBlock(), 2688 getAdditionalBypassBlock()->getFirstInsertionPt()); 2689 for (const auto &InductionEntry : Legal->getInductionVars()) { 2690 PHINode *OrigPhi = InductionEntry.first; 2691 const InductionDescriptor &II = InductionEntry.second; 2692 Value *Step = getExpandedStep(II, ExpandedSCEVs); 2693 // For the primary induction the additional bypass end value is known. 2694 // Otherwise it is computed. 2695 Value *EndValueFromAdditionalBypass = MainVectorTripCount; 2696 if (OrigPhi != OldInduction) { 2697 auto *BinOp = II.getInductionBinOp(); 2698 // Fast-math-flags propagate from the original induction instruction. 2699 if (isa_and_nonnull<FPMathOperator>(BinOp)) 2700 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags()); 2701 2702 // Compute the end value for the additional bypass. 2703 EndValueFromAdditionalBypass = 2704 emitTransformedIndex(BypassBuilder, MainVectorTripCount, 2705 II.getStartValue(), Step, II.getKind(), BinOp); 2706 EndValueFromAdditionalBypass->setName("ind.end"); 2707 } 2708 2709 // Store the bypass value here, as it needs to be added as operand to its 2710 // scalar preheader phi node after the epilogue skeleton has been created. 2711 // TODO: Directly add as extra operand to the VPResumePHI recipe. 2712 assert(!Induction2AdditionalBypassValue.contains(OrigPhi) && 2713 "entry for OrigPhi already exits"); 2714 Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass; 2715 } 2716 } 2717 2718 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton( 2719 const SCEV2ValueTy &ExpandedSCEVs) { 2720 /* 2721 In this function we generate a new loop. The new loop will contain 2722 the vectorized instructions while the old loop will continue to run the 2723 scalar remainder. 2724 2725 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 2726 / | preheader are expanded here. Eventually all required SCEV 2727 / | expansion should happen here. 2728 / v 2729 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2730 | / | 2731 | / v 2732 || [ ] <-- vector pre header. 2733 |/ | 2734 | v 2735 | [ ] \ 2736 | [ ]_| <-- vector loop (created during VPlan execution). 2737 | | 2738 | v 2739 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to 2740 | | successors created during VPlan execution) 2741 \/ | 2742 /\ v 2743 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). 
2744 | | 2745 (opt) v <-- edge from middle to exit iff epilogue is not required. 2746 | [ ] \ 2747 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header 2748 | | wrapped in VPIRBasicBlock). 2749 \ | 2750 \ v 2751 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) 2752 ... 2753 */ 2754 2755 // Create an empty vector loop, and prepare basic blocks for the runtime 2756 // checks. 2757 createVectorLoopSkeleton(""); 2758 2759 // Now, compare the new count to zero. If it is zero skip the vector loop and 2760 // jump to the scalar loop. This check also covers the case where the 2761 // backedge-taken count is uint##_max: adding one to it will overflow leading 2762 // to an incorrect trip count of zero. In this (rare) case we will also jump 2763 // to the scalar loop. 2764 emitIterationCountCheck(LoopScalarPreHeader); 2765 2766 // Generate the code to check any assumptions that we've made for SCEV 2767 // expressions. 2768 emitSCEVChecks(LoopScalarPreHeader); 2769 2770 // Generate the code that checks in runtime if arrays overlap. We put the 2771 // checks into a separate block to make the more common case of few elements 2772 // faster. 2773 emitMemRuntimeChecks(LoopScalarPreHeader); 2774 2775 return LoopVectorPreHeader; 2776 } 2777 2778 // Fix up external users of the induction variable. At this point, we are 2779 // in LCSSA form, with all external PHIs that use the IV having one input value, 2780 // coming from the remainder loop. We need those PHIs to also have a correct 2781 // value for the IV when arriving directly from the middle block. 2782 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 2783 const InductionDescriptor &II, 2784 Value *VectorTripCount, 2785 BasicBlock *MiddleBlock, 2786 VPTransformState &State) { 2787 // There are two kinds of external IV usages - those that use the value 2788 // computed in the last iteration (the PHI) and those that use the penultimate 2789 // value (the value that feeds into the phi from the loop latch). 2790 // We allow both, but they, obviously, have different values. 2791 2792 DenseMap<Value *, Value *> MissingVals; 2793 2794 Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock( 2795 OrigLoop->getLoopPreheader())) 2796 ->getIncomingValueForBlock(MiddleBlock); 2797 2798 // An external user of the last iteration's value should see the value that 2799 // the remainder loop uses to initialize its own IV. 2800 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 2801 for (User *U : PostInc->users()) { 2802 Instruction *UI = cast<Instruction>(U); 2803 if (!OrigLoop->contains(UI)) { 2804 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 2805 MissingVals[UI] = EndValue; 2806 } 2807 } 2808 2809 // An external user of the penultimate value need to see EndValue - Step. 2810 // The simplest way to get this is to recompute it from the constituent SCEVs, 2811 // that is Start + (Step * (CRD - 1)). 2812 for (User *U : OrigPhi->users()) { 2813 auto *UI = cast<Instruction>(U); 2814 if (!OrigLoop->contains(UI)) { 2815 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 2816 IRBuilder<> B(MiddleBlock->getTerminator()); 2817 2818 // Fast-math-flags propagate from the original induction instruction. 
2819 if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp())) 2820 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 2821 2822 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); 2823 assert(StepVPV && "step must have been expanded during VPlan execution"); 2824 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() 2825 : State.get(StepVPV, VPLane(0)); 2826 Value *Escape = nullptr; 2827 if (EndValue->getType()->isIntegerTy()) 2828 Escape = B.CreateSub(EndValue, Step); 2829 else if (EndValue->getType()->isPointerTy()) 2830 Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step)); 2831 else { 2832 assert(EndValue->getType()->isFloatingPointTy() && 2833 "Unexpected induction type"); 2834 Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() == 2835 Instruction::FAdd 2836 ? Instruction::FSub 2837 : Instruction::FAdd, 2838 EndValue, Step); 2839 } 2840 Escape->setName("ind.escape"); 2841 MissingVals[UI] = Escape; 2842 } 2843 } 2844 2845 assert((MissingVals.empty() || 2846 all_of(MissingVals, 2847 [MiddleBlock, this](const std::pair<Value *, Value *> &P) { 2848 return all_of( 2849 predecessors(cast<Instruction>(P.first)->getParent()), 2850 [MiddleBlock, this](BasicBlock *Pred) { 2851 return Pred == MiddleBlock || 2852 Pred == OrigLoop->getLoopLatch(); 2853 }); 2854 })) && 2855 "Expected escaping values from latch/middle.block only"); 2856 2857 for (auto &I : MissingVals) { 2858 PHINode *PHI = cast<PHINode>(I.first); 2859 // One corner case we have to handle is two IVs "chasing" each-other, 2860 // that is %IV2 = phi [...], [ %IV1, %latch ] 2861 // In this case, if IV1 has an external use, we need to avoid adding both 2862 // "last value of IV1" and "penultimate value of IV2". So, verify that we 2863 // don't already have an incoming value for the middle block. 2864 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 2865 PHI->addIncoming(I.second, MiddleBlock); 2866 } 2867 } 2868 2869 namespace { 2870 2871 struct CSEDenseMapInfo { 2872 static bool canHandle(const Instruction *I) { 2873 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 2874 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 2875 } 2876 2877 static inline Instruction *getEmptyKey() { 2878 return DenseMapInfo<Instruction *>::getEmptyKey(); 2879 } 2880 2881 static inline Instruction *getTombstoneKey() { 2882 return DenseMapInfo<Instruction *>::getTombstoneKey(); 2883 } 2884 2885 static unsigned getHashValue(const Instruction *I) { 2886 assert(canHandle(I) && "Unknown instruction!"); 2887 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 2888 I->value_op_end())); 2889 } 2890 2891 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 2892 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 2893 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 2894 return LHS == RHS; 2895 return LHS->isIdenticalTo(RHS); 2896 } 2897 }; 2898 2899 } // end anonymous namespace 2900 2901 ///Perform cse of induction variable instructions. 2902 static void cse(BasicBlock *BB) { 2903 // Perform simple cse. 2904 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 2905 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 2906 if (!CSEDenseMapInfo::canHandle(&In)) 2907 continue; 2908 2909 // Check if we can replace this instruction with any of the 2910 // visited instructions. 
2911 if (Instruction *V = CSEMap.lookup(&In)) { 2912 In.replaceAllUsesWith(V); 2913 In.eraseFromParent(); 2914 continue; 2915 } 2916 2917 CSEMap[&In] = &In; 2918 } 2919 } 2920 2921 InstructionCost 2922 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 2923 ElementCount VF) const { 2924 // We only need to calculate a cost if the VF is scalar; for actual vectors 2925 // we should already have a pre-calculated cost at each VF. 2926 if (!VF.isScalar()) 2927 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; 2928 2929 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2930 Type *RetTy = CI->getType(); 2931 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 2932 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) 2933 return *RedCost; 2934 2935 SmallVector<Type *, 4> Tys; 2936 for (auto &ArgOp : CI->args()) 2937 Tys.push_back(ArgOp->getType()); 2938 2939 InstructionCost ScalarCallCost = 2940 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); 2941 2942 // If this is an intrinsic we may have a lower cost for it. 2943 if (getVectorIntrinsicIDForCall(CI, TLI)) { 2944 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 2945 return std::min(ScalarCallCost, IntrinsicCost); 2946 } 2947 return ScalarCallCost; 2948 } 2949 2950 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) { 2951 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 2952 return Elt; 2953 return VectorType::get(Elt, VF); 2954 } 2955 2956 InstructionCost 2957 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 2958 ElementCount VF) const { 2959 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 2960 assert(ID && "Expected intrinsic call!"); 2961 Type *RetTy = maybeVectorizeType(CI->getType(), VF); 2962 FastMathFlags FMF; 2963 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 2964 FMF = FPMO->getFastMathFlags(); 2965 2966 SmallVector<const Value *> Arguments(CI->args()); 2967 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 2968 SmallVector<Type *> ParamTys; 2969 std::transform(FTy->param_begin(), FTy->param_end(), 2970 std::back_inserter(ParamTys), 2971 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); }); 2972 2973 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 2974 dyn_cast<IntrinsicInst>(CI)); 2975 return TTI.getIntrinsicInstrCost(CostAttrs, 2976 TargetTransformInfo::TCK_RecipThroughput); 2977 } 2978 2979 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 2980 // Fix widened non-induction PHIs by setting up the PHI operands. 2981 if (EnableVPlanNativePath) 2982 fixNonInductionPHIs(State); 2983 2984 // Forget the original basic block. 2985 PSE.getSE()->forgetLoop(OrigLoop); 2986 PSE.getSE()->forgetBlockAndLoopDispositions(); 2987 2988 // After vectorization, the exit blocks of the original loop will have 2989 // additional predecessors. Invalidate SCEVs for the exit phis in case SE 2990 // looked through single-entry phis. 2991 SmallVector<BasicBlock *> ExitBlocks; 2992 OrigLoop->getExitBlocks(ExitBlocks); 2993 for (BasicBlock *Exit : ExitBlocks) 2994 for (PHINode &PN : Exit->phis()) 2995 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); 2996 2997 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2998 // No edge from the middle block to the unique exit block has been inserted 2999 // and there is nothing to fix from vector loop; phis should have incoming 3000 // from scalar loop only. 
3001 } else { 3002 // TODO: Check in VPlan to see if IV users need fixing instead of checking 3003 // the cost model. 3004 3005 // If we inserted an edge from the middle block to the unique exit block, 3006 // update uses outside the loop (phis) to account for the newly inserted 3007 // edge. 3008 3009 // Fix-up external users of the induction variables. 3010 for (const auto &Entry : Legal->getInductionVars()) 3011 fixupIVUsers(Entry.first, Entry.second, 3012 getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State); 3013 } 3014 3015 // Don't apply optimizations below when no vector region remains, as they all 3016 // require a vector loop at the moment. 3017 if (!State.Plan->getVectorLoopRegion()) 3018 return; 3019 3020 for (Instruction *PI : PredicatedInstructions) 3021 sinkScalarOperands(&*PI); 3022 3023 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); 3024 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); 3025 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; 3026 3027 // Remove redundant induction instructions. 3028 cse(HeaderBB); 3029 3030 // Set/update profile weights for the vector and remainder loops as original 3031 // loop iterations are now distributed among them. Note that original loop 3032 // becomes the scalar remainder loop after vectorization. 3033 // 3034 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 3035 // end up getting a slightly roughened result but that should be OK since 3036 // profile is not inherently precise anyway. Note also that possible bypass of 3037 // vector code caused by legality checks is ignored, assigning all the weight 3038 // to the vector loop, optimistically. 3039 // 3040 // For scalable vectorization we can't know at compile time how many 3041 // iterations of the loop are handled in one vector iteration, so instead 3042 // assume a pessimistic vscale of '1'. 3043 Loop *VectorLoop = LI->getLoopFor(HeaderBB); 3044 setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, 3045 VF.getKnownMinValue() * UF); 3046 } 3047 3048 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3049 // The basic block and loop containing the predicated instruction. 3050 auto *PredBB = PredInst->getParent(); 3051 auto *VectorLoop = LI->getLoopFor(PredBB); 3052 3053 // Initialize a worklist with the operands of the predicated instruction. 3054 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3055 3056 // Holds instructions that we need to analyze again. An instruction may be 3057 // reanalyzed if we don't yet know if we can sink it or not. 3058 SmallVector<Instruction *, 8> InstsToReanalyze; 3059 3060 // Returns true if a given use occurs in the predicated block. Phi nodes use 3061 // their operands in their corresponding predecessor blocks. 3062 auto IsBlockOfUsePredicated = [&](Use &U) -> bool { 3063 auto *I = cast<Instruction>(U.getUser()); 3064 BasicBlock *BB = I->getParent(); 3065 if (auto *Phi = dyn_cast<PHINode>(I)) 3066 BB = Phi->getIncomingBlock( 3067 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3068 return BB == PredBB; 3069 }; 3070 3071 // Iteratively sink the scalarized operands of the predicated instruction 3072 // into the block we created for it. When an instruction is sunk, its 3073 // operands are then added to the worklist. The algorithm ends when a pass 3074 // through the worklist doesn't sink a single instruction.
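// Illustrative sketch (hypothetical IR, not taken from this file): if the
// predicated block contains a scalarized store
//   %gep = getelementptr inbounds i32, ptr %p, i64 %idx  ; defined outside
//   store i32 %val, ptr %gep                             ; predicated block
// and %gep has no users outside the predicated block, the loop below moves
// %gep into the block and then re-examines its operand %idx on the next
// round of the worklist.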
3075 bool Changed; 3076 do { 3077 // Add the instructions that need to be reanalyzed to the worklist, and 3078 // reset the changed indicator. 3079 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3080 InstsToReanalyze.clear(); 3081 Changed = false; 3082 3083 while (!Worklist.empty()) { 3084 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3085 3086 // We can't sink an instruction if it is a phi node, is not in the loop, 3087 // may have side effects or may read from memory. 3088 // TODO: Could do more granular checking to allow sinking 3089 // a load past non-store instructions. 3090 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 3091 I->mayHaveSideEffects() || I->mayReadFromMemory()) 3092 continue; 3093 3094 // If the instruction is already in PredBB, check if we can sink its 3095 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 3096 // sinking the scalar instruction I, hence it appears in PredBB; but it 3097 // may have failed to sink I's operands (recursively), which we try 3098 // (again) here. 3099 if (I->getParent() == PredBB) { 3100 Worklist.insert(I->op_begin(), I->op_end()); 3101 continue; 3102 } 3103 3104 // It's legal to sink the instruction if all its uses occur in the 3105 // predicated block. Otherwise, there's nothing to do yet, and we may 3106 // need to reanalyze the instruction. 3107 if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) { 3108 InstsToReanalyze.push_back(I); 3109 continue; 3110 } 3111 3112 // Move the instruction to the beginning of the predicated block, and add 3113 // it's operands to the worklist. 3114 I->moveBefore(&*PredBB->getFirstInsertionPt()); 3115 Worklist.insert(I->op_begin(), I->op_end()); 3116 3117 // The sinking may have enabled other instructions to be sunk, so we will 3118 // need to iterate. 3119 Changed = true; 3120 } 3121 } while (Changed); 3122 } 3123 3124 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 3125 auto Iter = vp_depth_first_deep(Plan.getEntry()); 3126 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 3127 for (VPRecipeBase &P : VPBB->phis()) { 3128 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 3129 if (!VPPhi) 3130 continue; 3131 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi)); 3132 // Make sure the builder has a valid insert point. 3133 Builder.SetInsertPoint(NewPhi); 3134 for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) { 3135 VPValue *Inc = VPPhi->getIncomingValue(Idx); 3136 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx); 3137 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]); 3138 } 3139 } 3140 } 3141 } 3142 3143 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 3144 // We should not collect Scalars more than once per VF. Right now, this 3145 // function is called from collectUniformsAndScalars(), which already does 3146 // this check. Collecting Scalars for VF=1 does not make any sense. 3147 assert(VF.isVector() && !Scalars.contains(VF) && 3148 "This function should not be visited twice for the same VF"); 3149 3150 // This avoids any chances of creating a REPLICATE recipe during planning 3151 // since that would result in generation of scalarized code during execution, 3152 // which is not supported for scalable vectors. 
3153 if (VF.isScalable()) { 3154 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3155 return; 3156 } 3157 3158 SmallSetVector<Instruction *, 8> Worklist; 3159 3160 // These sets are used to seed the analysis with pointers used by memory 3161 // accesses that will remain scalar. 3162 SmallSetVector<Instruction *, 8> ScalarPtrs; 3163 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 3164 auto *Latch = TheLoop->getLoopLatch(); 3165 3166 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 3167 // The pointer operands of loads and stores will be scalar as long as the 3168 // memory access is not a gather or scatter operation. The value operand of a 3169 // store will remain scalar if the store is scalarized. 3170 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 3171 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 3172 assert(WideningDecision != CM_Unknown && 3173 "Widening decision should be ready at this moment"); 3174 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 3175 if (Ptr == Store->getValueOperand()) 3176 return WideningDecision == CM_Scalarize; 3177 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 3178 "Ptr is neither a value nor a pointer operand"); 3179 return WideningDecision != CM_GatherScatter; 3180 }; 3181 3182 // A helper that returns true if the given value is a getelementptr 3183 // instruction contained in the loop. 3184 auto IsLoopVaryingGEP = [&](Value *V) { 3185 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V); 3186 }; 3187 3188 // A helper that evaluates a memory access's use of a pointer. If the use will 3189 // be a scalar use and the pointer is only used by memory accesses, we place 3190 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 3191 // PossibleNonScalarPtrs. 3192 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 3193 // We only care about bitcast and getelementptr instructions contained in 3194 // the loop. 3195 if (!IsLoopVaryingGEP(Ptr)) 3196 return; 3197 3198 // If the pointer has already been identified as scalar (e.g., if it was 3199 // also identified as uniform), there's nothing to do. 3200 auto *I = cast<Instruction>(Ptr); 3201 if (Worklist.count(I)) 3202 return; 3203 3204 // If the use of the pointer will be a scalar use, and all users of the 3205 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 3206 // place the pointer in PossibleNonScalarPtrs. 3207 if (IsScalarUse(MemAccess, Ptr) && 3208 all_of(I->users(), IsaPred<LoadInst, StoreInst>)) 3209 ScalarPtrs.insert(I); 3210 else 3211 PossibleNonScalarPtrs.insert(I); 3212 }; 3213 3214 // We seed the scalars analysis with two classes of instructions: (1) 3215 // instructions marked uniform-after-vectorization and (2) bitcast, 3216 // getelementptr and (pointer) phi instructions used by memory accesses 3217 // requiring a scalar use. 3218 // 3219 // (1) Add to the worklist all instructions that have been identified as 3220 // uniform-after-vectorization. 3221 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3222 3223 // (2) Add to the worklist all bitcast and getelementptr instructions used by 3224 // memory accesses requiring a scalar use. The pointer operands of loads and 3225 // stores will be scalar unless the operation is a gather or scatter. 3226 // The value operand of a store will remain scalar if the store is scalarized.
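// For illustration only (hypothetical loop, not from a test case): in
//   for (i = 0; i < n; ++i) A[i] = B[i] + 1;
// the GEPs computing the addresses of the consecutive load of B[i] and the
// store to A[i] are used solely by widened (non-gather/scatter) accesses, so
// they are seeded into ScalarPtrs below; a GEP that additionally fed vector
// arithmetic would land in PossibleNonScalarPtrs instead.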
3227 for (auto *BB : TheLoop->blocks()) 3228 for (auto &I : *BB) { 3229 if (auto *Load = dyn_cast<LoadInst>(&I)) { 3230 EvaluatePtrUse(Load, Load->getPointerOperand()); 3231 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 3232 EvaluatePtrUse(Store, Store->getPointerOperand()); 3233 EvaluatePtrUse(Store, Store->getValueOperand()); 3234 } 3235 } 3236 for (auto *I : ScalarPtrs) 3237 if (!PossibleNonScalarPtrs.count(I)) { 3238 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 3239 Worklist.insert(I); 3240 } 3241 3242 // Insert the forced scalars. 3243 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 3244 // induction variable when the PHI user is scalarized. 3245 auto ForcedScalar = ForcedScalars.find(VF); 3246 if (ForcedScalar != ForcedScalars.end()) 3247 for (auto *I : ForcedScalar->second) { 3248 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 3249 Worklist.insert(I); 3250 } 3251 3252 // Expand the worklist by looking through any bitcasts and getelementptr 3253 // instructions we've already identified as scalar. This is similar to the 3254 // expansion step in collectLoopUniforms(); however, here we're only 3255 // expanding to include additional bitcasts and getelementptr instructions. 3256 unsigned Idx = 0; 3257 while (Idx != Worklist.size()) { 3258 Instruction *Dst = Worklist[Idx++]; 3259 if (!IsLoopVaryingGEP(Dst->getOperand(0))) 3260 continue; 3261 auto *Src = cast<Instruction>(Dst->getOperand(0)); 3262 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 3263 auto *J = cast<Instruction>(U); 3264 return !TheLoop->contains(J) || Worklist.count(J) || 3265 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 3266 IsScalarUse(J, Src)); 3267 })) { 3268 Worklist.insert(Src); 3269 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 3270 } 3271 } 3272 3273 // An induction variable will remain scalar if all users of the induction 3274 // variable and induction variable update remain scalar. 3275 for (const auto &Induction : Legal->getInductionVars()) { 3276 auto *Ind = Induction.first; 3277 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3278 3279 // If tail-folding is applied, the primary induction variable will be used 3280 // to feed a vector compare. 3281 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 3282 continue; 3283 3284 // Returns true if \p Indvar is a pointer induction that is used directly by 3285 // load/store instruction \p I. 3286 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 3287 Instruction *I) { 3288 return Induction.second.getKind() == 3289 InductionDescriptor::IK_PtrInduction && 3290 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 3291 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar); 3292 }; 3293 3294 // Determine if all users of the induction variable are scalar after 3295 // vectorization. 3296 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { 3297 auto *I = cast<Instruction>(U); 3298 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3299 IsDirectLoadStoreFromPtrIndvar(Ind, I); 3300 }); 3301 if (!ScalarInd) 3302 continue; 3303 3304 // If the induction variable update is a fixed-order recurrence, neither the 3305 // induction variable nor its update should be marked scalar after 3306 // vectorization.
3307 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate); 3308 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi)) 3309 continue; 3310 3311 // Determine if all users of the induction variable update instruction are 3312 // scalar after vectorization. 3313 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { 3314 auto *I = cast<Instruction>(U); 3315 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 3316 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 3317 }); 3318 if (!ScalarIndUpdate) 3319 continue; 3320 3321 // The induction variable and its update instruction will remain scalar. 3322 Worklist.insert(Ind); 3323 Worklist.insert(IndUpdate); 3324 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 3325 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 3326 << "\n"); 3327 } 3328 3329 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 3330 } 3331 3332 bool LoopVectorizationCostModel::isScalarWithPredication( 3333 Instruction *I, ElementCount VF) const { 3334 if (!isPredicatedInst(I)) 3335 return false; 3336 3337 // Do we have a non-scalar lowering for this predicated 3338 // instruction? No - it is scalar with predication. 3339 switch(I->getOpcode()) { 3340 default: 3341 return true; 3342 case Instruction::Call: 3343 if (VF.isScalar()) 3344 return true; 3345 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF)) 3346 .Kind == CM_Scalarize; 3347 case Instruction::Load: 3348 case Instruction::Store: { 3349 auto *Ptr = getLoadStorePointerOperand(I); 3350 auto *Ty = getLoadStoreType(I); 3351 Type *VTy = Ty; 3352 if (VF.isVector()) 3353 VTy = VectorType::get(Ty, VF); 3354 const Align Alignment = getLoadStoreAlignment(I); 3355 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 3356 TTI.isLegalMaskedGather(VTy, Alignment)) 3357 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 3358 TTI.isLegalMaskedScatter(VTy, Alignment)); 3359 } 3360 case Instruction::UDiv: 3361 case Instruction::SDiv: 3362 case Instruction::SRem: 3363 case Instruction::URem: { 3364 // We have the option to use the safe-divisor idiom to avoid predication. 3365 // The cost based decision here will always select safe-divisor for 3366 // scalable vectors as scalarization isn't legal. 3367 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 3368 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); 3369 } 3370 } 3371 } 3372 3373 // TODO: Fold into LoopVectorizationLegality::isMaskRequired. 3374 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { 3375 // If predication is not needed, avoid it. 3376 // TODO: We can use the loop-preheader as context point here and get 3377 // context sensitive reasoning for isSafeToSpeculativelyExecute. 3378 if (!blockNeedsPredicationForAnyReason(I->getParent()) || 3379 isSafeToSpeculativelyExecute(I) || 3380 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) || 3381 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I)) 3382 return false; 3383 3384 // If the instruction was executed conditionally in the original scalar loop, 3385 // predication is needed with a mask whose lanes are all possibly inactive. 3386 if (Legal->blockNeedsPredication(I->getParent())) 3387 return true; 3388 3389 // All that remain are instructions with side-effects originally executed in 3390 // the loop unconditionally, but now execute under a tail-fold mask (only) 3391 // having at least one active lane (the first). 
If the side-effects of the 3392 // instruction are invariant, executing it w/o (the tail-folding) mask is safe 3393 // - it will cause the same side-effects as when masked. 3394 switch(I->getOpcode()) { 3395 default: 3396 llvm_unreachable( 3397 "instruction should have been considered by earlier checks"); 3398 case Instruction::Call: 3399 // Side-effects of a Call are assumed to be non-invariant, needing a 3400 // (fold-tail) mask. 3401 assert(Legal->isMaskRequired(I) && 3402 "should have returned earlier for calls not needing a mask"); 3403 return true; 3404 case Instruction::Load: 3405 // If the address is loop invariant no predication is needed. 3406 return !Legal->isInvariant(getLoadStorePointerOperand(I)); 3407 case Instruction::Store: { 3408 // For stores, we need to prove both speculation safety (which follows from 3409 // the same argument as loads) and that the value being stored 3410 // is correct. The easiest form of the latter is to require that all values 3411 // stored are the same. 3412 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) && 3413 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand())); 3414 } 3415 case Instruction::UDiv: 3416 case Instruction::SDiv: 3417 case Instruction::SRem: 3418 case Instruction::URem: 3419 // If the divisor is loop-invariant no predication is needed. 3420 return !TheLoop->isLoopInvariant(I->getOperand(1)); 3421 } 3422 } 3423 3424 std::pair<InstructionCost, InstructionCost> 3425 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 3426 ElementCount VF) const { 3427 assert(I->getOpcode() == Instruction::UDiv || 3428 I->getOpcode() == Instruction::SDiv || 3429 I->getOpcode() == Instruction::SRem || 3430 I->getOpcode() == Instruction::URem); 3431 assert(!isSafeToSpeculativelyExecute(I)); 3432 3433 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3434 3435 // Scalarization isn't legal for scalable vector types. 3436 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 3437 if (!VF.isScalable()) { 3438 // Get the scalarization cost and scale this amount by the probability of 3439 // executing the predicated block. If the instruction is not predicated, 3440 // we fall through to the next case. 3441 ScalarizationCost = 0; 3442 3443 // These instructions have a non-void type, so account for the phi nodes 3444 // that we will create. This cost is likely to be zero. The phi node 3445 // cost, if any, should be scaled by the block probability because it 3446 // models a copy at the end of each predicated block. 3447 ScalarizationCost += VF.getKnownMinValue() * 3448 TTI.getCFInstrCost(Instruction::PHI, CostKind); 3449 3450 // The cost of the non-predicated instruction. 3451 ScalarizationCost += VF.getKnownMinValue() * 3452 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 3453 3454 // The cost of insertelement and extractelement instructions needed for 3455 // scalarization. 3456 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); 3457 3458 // Scale the cost by the probability of executing the predicated blocks. 3459 // This assumes the predicated block for each vector lane is equally 3460 // likely. 3461 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 3462 } 3463 InstructionCost SafeDivisorCost = 0; 3464 3465 auto *VecTy = toVectorTy(I->getType(), VF); 3466 3467 // The cost of the select guard to ensure all lanes are well defined 3468 // after we speculate above any internal control flow.
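// Roughly, the safe-divisor idiom being costed here looks like the following
// (hypothetical IR for VF=4, not this pass's literal output):
//   %safe.div = select <4 x i1> %mask, <4 x i32> %d, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %quot     = udiv <4 x i32> %x, %safe.div
// Inactive lanes divide by 1, so the unpredicated divide cannot trap; the
// select is the extra cost accounted for in SafeDivisorCost.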
3469 SafeDivisorCost += 3470 TTI.getCmpSelInstrCost(Instruction::Select, VecTy, 3471 toVectorTy(Type::getInt1Ty(I->getContext()), VF), 3472 CmpInst::BAD_ICMP_PREDICATE, CostKind); 3473 3474 // Certain instructions can be cheaper to vectorize if they have a constant 3475 // second vector operand. One example of this are shifts on x86. 3476 Value *Op2 = I->getOperand(1); 3477 auto Op2Info = TTI.getOperandInfo(Op2); 3478 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 3479 Legal->isInvariant(Op2)) 3480 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 3481 3482 SmallVector<const Value *, 4> Operands(I->operand_values()); 3483 SafeDivisorCost += TTI.getArithmeticInstrCost( 3484 I->getOpcode(), VecTy, CostKind, 3485 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 3486 Op2Info, Operands, I); 3487 return {ScalarizationCost, SafeDivisorCost}; 3488 } 3489 3490 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 3491 Instruction *I, ElementCount VF) const { 3492 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 3493 assert(getWideningDecision(I, VF) == CM_Unknown && 3494 "Decision should not be set yet."); 3495 auto *Group = getInterleavedAccessGroup(I); 3496 assert(Group && "Must have a group."); 3497 unsigned InterleaveFactor = Group->getFactor(); 3498 3499 // If the instruction's allocated size doesn't equal its type size, it 3500 // requires padding and will be scalarized. 3501 auto &DL = I->getDataLayout(); 3502 auto *ScalarTy = getLoadStoreType(I); 3503 if (hasIrregularType(ScalarTy, DL)) 3504 return false; 3505 3506 // We currently only know how to emit interleave/deinterleave with 3507 // Factor=2 for scalable vectors. This is purely an implementation 3508 // limit. 3509 if (VF.isScalable() && InterleaveFactor != 2) 3510 return false; 3511 3512 // If the group involves a non-integral pointer, we may not be able to 3513 // losslessly cast all values to a common type. 3514 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 3515 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) { 3516 Instruction *Member = Group->getMember(Idx); 3517 if (!Member) 3518 continue; 3519 auto *MemberTy = getLoadStoreType(Member); 3520 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 3521 // Don't coerce non-integral pointers to integers or vice versa. 3522 if (MemberNI != ScalarNI) 3523 // TODO: Consider adding special nullptr value case here 3524 return false; 3525 if (MemberNI && ScalarNI && 3526 ScalarTy->getPointerAddressSpace() != 3527 MemberTy->getPointerAddressSpace()) 3528 return false; 3529 } 3530 3531 // Check if masking is required. 3532 // A Group may need masking for one of two reasons: it resides in a block that 3533 // needs predication, or it was decided to use masking to deal with gaps 3534 // (either a gap at the end of a load-access that may result in a speculative 3535 // load, or any gaps in a store-access). 
3536 bool PredicatedAccessRequiresMasking = 3537 blockNeedsPredicationForAnyReason(I->getParent()) && 3538 Legal->isMaskRequired(I); 3539 bool LoadAccessWithGapsRequiresEpilogMasking = 3540 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 3541 !isScalarEpilogueAllowed(); 3542 bool StoreAccessWithGapsRequiresMasking = 3543 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 3544 if (!PredicatedAccessRequiresMasking && 3545 !LoadAccessWithGapsRequiresEpilogMasking && 3546 !StoreAccessWithGapsRequiresMasking) 3547 return true; 3548 3549 // If masked interleaving is required, we expect that the user/target had 3550 // enabled it, because otherwise it either wouldn't have been created or 3551 // it should have been invalidated by the CostModel. 3552 assert(useMaskedInterleavedAccesses(TTI) && 3553 "Masked interleave-groups for predicated accesses are not enabled."); 3554 3555 if (Group->isReverse()) 3556 return false; 3557 3558 auto *Ty = getLoadStoreType(I); 3559 const Align Alignment = getLoadStoreAlignment(I); 3560 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 3561 : TTI.isLegalMaskedStore(Ty, Alignment); 3562 } 3563 3564 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 3565 Instruction *I, ElementCount VF) { 3566 // Get and ensure we have a valid memory instruction. 3567 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 3568 3569 auto *Ptr = getLoadStorePointerOperand(I); 3570 auto *ScalarTy = getLoadStoreType(I); 3571 3572 // In order to be widened, the pointer should be consecutive, first of all. 3573 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 3574 return false; 3575 3576 // If the instruction is a store located in a predicated block, it will be 3577 // scalarized. 3578 if (isScalarWithPredication(I, VF)) 3579 return false; 3580 3581 // If the instruction's allocated size doesn't equal it's type size, it 3582 // requires padding and will be scalarized. 3583 auto &DL = I->getDataLayout(); 3584 if (hasIrregularType(ScalarTy, DL)) 3585 return false; 3586 3587 return true; 3588 } 3589 3590 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 3591 // We should not collect Uniforms more than once per VF. Right now, 3592 // this function is called from collectUniformsAndScalars(), which 3593 // already does this check. Collecting Uniforms for VF=1 does not make any 3594 // sense. 3595 3596 assert(VF.isVector() && !Uniforms.contains(VF) && 3597 "This function should not be visited twice for the same VF"); 3598 3599 // Visit the list of Uniforms. If we find no uniform value, we won't 3600 // analyze again. Uniforms.count(VF) will return 1. 3601 Uniforms[VF].clear(); 3602 3603 // Now we know that the loop is vectorizable! 3604 // Collect instructions inside the loop that will remain uniform after 3605 // vectorization. 3606 3607 // Global values, params and instructions outside of current loop are out of 3608 // scope. 3609 auto IsOutOfScope = [&](Value *V) -> bool { 3610 Instruction *I = dyn_cast<Instruction>(V); 3611 return (!I || !TheLoop->contains(I)); 3612 }; 3613 3614 // Worklist containing uniform instructions demanding lane 0. 3615 SetVector<Instruction *> Worklist; 3616 3617 // Add uniform instructions demanding lane 0 to the worklist. Instructions 3618 // that require predication must not be considered uniform after 3619 // vectorization, because that would create an erroneous replicating region 3620 // where only a single instance out of VF should be formed. 
3621 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void { 3622 if (IsOutOfScope(I)) { 3623 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 3624 << *I << "\n"); 3625 return; 3626 } 3627 if (isPredicatedInst(I)) { 3628 LLVM_DEBUG( 3629 dbgs() << "LV: Found not uniform due to requiring predication: " << *I 3630 << "\n"); 3631 return; 3632 } 3633 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 3634 Worklist.insert(I); 3635 }; 3636 3637 // Start with the conditional branches exiting the loop. If the branch 3638 // condition is an instruction contained in the loop that is only used by the 3639 // branch, it is uniform. Note conditions from uncountable early exits are not 3640 // uniform. 3641 SmallVector<BasicBlock *> Exiting; 3642 TheLoop->getExitingBlocks(Exiting); 3643 for (BasicBlock *E : Exiting) { 3644 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E) 3645 continue; 3646 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0)); 3647 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 3648 AddToWorklistIfAllowed(Cmp); 3649 } 3650 3651 auto PrevVF = VF.divideCoefficientBy(2); 3652 // Return true if all lanes perform the same memory operation, and we can 3653 // thus choose to execute only one. 3654 auto IsUniformMemOpUse = [&](Instruction *I) { 3655 // If the value was already known to not be uniform for the previous 3656 // (smaller VF), it cannot be uniform for the larger VF. 3657 if (PrevVF.isVector()) { 3658 auto Iter = Uniforms.find(PrevVF); 3659 if (Iter != Uniforms.end() && !Iter->second.contains(I)) 3660 return false; 3661 } 3662 if (!Legal->isUniformMemOp(*I, VF)) 3663 return false; 3664 if (isa<LoadInst>(I)) 3665 // Loading the same address always produces the same result - at least 3666 // assuming aliasing and ordering which have already been checked. 3667 return true; 3668 // Storing the same value on every iteration. 3669 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 3670 }; 3671 3672 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) { 3673 InstWidening WideningDecision = getWideningDecision(I, VF); 3674 assert(WideningDecision != CM_Unknown && 3675 "Widening decision should be ready at this moment"); 3676 3677 if (IsUniformMemOpUse(I)) 3678 return true; 3679 3680 return (WideningDecision == CM_Widen || 3681 WideningDecision == CM_Widen_Reverse || 3682 WideningDecision == CM_Interleave); 3683 }; 3684 3685 // Returns true if Ptr is the pointer operand of a memory access instruction 3686 // I, I is known to not require scalarization, and the pointer is not also 3687 // stored. 3688 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 3689 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) 3690 return false; 3691 return getLoadStorePointerOperand(I) == Ptr && 3692 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr)); 3693 }; 3694 3695 // Holds a list of values which are known to have at least one uniform use. 3696 // Note that there may be other uses which aren't uniform. A "uniform use" 3697 // here is something which only demands lane 0 of the unrolled iterations; 3698 // it does not imply that all lanes produce the same value (e.g. this is not 3699 // the usual meaning of uniform) 3700 SetVector<Value *> HasUniformUse; 3701 3702 // Scan the loop for instructions which are either a) known to have only 3703 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 
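// For example (illustrative): a load from a loop-invariant address, such as
//   %v = load i32, ptr %q
// is a uniform memory op: every lane would read the same location, so only
// lane 0 of it is demanded and the load is added to the worklist below, while
// its address is noted as having a uniform use.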
3704 for (auto *BB : TheLoop->blocks()) 3705 for (auto &I : *BB) { 3706 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 3707 switch (II->getIntrinsicID()) { 3708 case Intrinsic::sideeffect: 3709 case Intrinsic::experimental_noalias_scope_decl: 3710 case Intrinsic::assume: 3711 case Intrinsic::lifetime_start: 3712 case Intrinsic::lifetime_end: 3713 if (TheLoop->hasLoopInvariantOperands(&I)) 3714 AddToWorklistIfAllowed(&I); 3715 break; 3716 default: 3717 break; 3718 } 3719 } 3720 3721 // ExtractValue instructions must be uniform, because the operands are 3722 // known to be loop-invariant. 3723 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 3724 assert(IsOutOfScope(EVI->getAggregateOperand()) && 3725 "Expected aggregate value to be loop invariant"); 3726 AddToWorklistIfAllowed(EVI); 3727 continue; 3728 } 3729 3730 // If there's no pointer operand, there's nothing to do. 3731 auto *Ptr = getLoadStorePointerOperand(&I); 3732 if (!Ptr) 3733 continue; 3734 3735 if (IsUniformMemOpUse(&I)) 3736 AddToWorklistIfAllowed(&I); 3737 3738 if (IsVectorizedMemAccessUse(&I, Ptr)) 3739 HasUniformUse.insert(Ptr); 3740 } 3741 3742 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 3743 // demanding) users. Since loops are assumed to be in LCSSA form, this 3744 // disallows uses outside the loop as well. 3745 for (auto *V : HasUniformUse) { 3746 if (IsOutOfScope(V)) 3747 continue; 3748 auto *I = cast<Instruction>(V); 3749 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool { 3750 auto *UI = cast<Instruction>(U); 3751 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V); 3752 }); 3753 if (UsersAreMemAccesses) 3754 AddToWorklistIfAllowed(I); 3755 } 3756 3757 // Expand Worklist in topological order: whenever a new instruction 3758 // is added , its users should be already inside Worklist. It ensures 3759 // a uniform instruction will only be used by uniform instructions. 3760 unsigned Idx = 0; 3761 while (Idx != Worklist.size()) { 3762 Instruction *I = Worklist[Idx++]; 3763 3764 for (auto *OV : I->operand_values()) { 3765 // isOutOfScope operands cannot be uniform instructions. 3766 if (IsOutOfScope(OV)) 3767 continue; 3768 // First order recurrence Phi's should typically be considered 3769 // non-uniform. 3770 auto *OP = dyn_cast<PHINode>(OV); 3771 if (OP && Legal->isFixedOrderRecurrence(OP)) 3772 continue; 3773 // If all the users of the operand are uniform, then add the 3774 // operand into the uniform worklist. 3775 auto *OI = cast<Instruction>(OV); 3776 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 3777 auto *J = cast<Instruction>(U); 3778 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI); 3779 })) 3780 AddToWorklistIfAllowed(OI); 3781 } 3782 } 3783 3784 // For an instruction to be added into Worklist above, all its users inside 3785 // the loop should also be in Worklist. However, this condition cannot be 3786 // true for phi nodes that form a cyclic dependence. We must process phi 3787 // nodes separately. An induction variable will remain uniform if all users 3788 // of the induction variable and induction variable update remain uniform. 3789 // The code below handles both pointer and non-pointer induction variables. 
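// Illustrative case: an induction 'i' with update 'i.next = i + 1' forms a
// cycle (i.next feeds the phi), so the user-driven expansion above cannot
// prove either value uniform on its own; the code below instead checks that
// every other in-loop user of i and i.next is already uniform (or a
// vectorized memory address) and, if so, adds the pair together.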
3790 BasicBlock *Latch = TheLoop->getLoopLatch(); 3791 for (const auto &Induction : Legal->getInductionVars()) { 3792 auto *Ind = Induction.first; 3793 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3794 3795 // Determine if all users of the induction variable are uniform after 3796 // vectorization. 3797 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool { 3798 auto *I = cast<Instruction>(U); 3799 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3800 IsVectorizedMemAccessUse(I, Ind); 3801 }); 3802 if (!UniformInd) 3803 continue; 3804 3805 // Determine if all users of the induction variable update instruction are 3806 // uniform after vectorization. 3807 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { 3808 auto *I = cast<Instruction>(U); 3809 return I == Ind || Worklist.count(I) || 3810 IsVectorizedMemAccessUse(I, IndUpdate); 3811 }); 3812 if (!UniformIndUpdate) 3813 continue; 3814 3815 // The induction variable and its update instruction will remain uniform. 3816 AddToWorklistIfAllowed(Ind); 3817 AddToWorklistIfAllowed(IndUpdate); 3818 } 3819 3820 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 3821 } 3822 3823 bool LoopVectorizationCostModel::runtimeChecksRequired() { 3824 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 3825 3826 if (Legal->getRuntimePointerChecking()->Need) { 3827 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 3828 "runtime pointer checks needed. Enable vectorization of this " 3829 "loop with '#pragma clang loop vectorize(enable)' when " 3830 "compiling with -Os/-Oz", 3831 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3832 return true; 3833 } 3834 3835 if (!PSE.getPredicate().isAlwaysTrue()) { 3836 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 3837 "runtime SCEV checks needed. Enable vectorization of this " 3838 "loop with '#pragma clang loop vectorize(enable)' when " 3839 "compiling with -Os/-Oz", 3840 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3841 return true; 3842 } 3843 3844 // FIXME: Avoid specializing for stride==1 instead of bailing out. 3845 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 3846 reportVectorizationFailure("Runtime stride check for small trip count", 3847 "runtime stride == 1 checks needed. Enable vectorization of " 3848 "this loop without such check by compiling with -Os/-Oz", 3849 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3850 return true; 3851 } 3852 3853 return false; 3854 } 3855 3856 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { 3857 if (IsScalableVectorizationAllowed) 3858 return *IsScalableVectorizationAllowed; 3859 3860 IsScalableVectorizationAllowed = false; 3861 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 3862 return false; 3863 3864 if (Hints->isScalableVectorizationDisabled()) { 3865 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 3866 "ScalableVectorizationDisabled", ORE, TheLoop); 3867 return false; 3868 } 3869 3870 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 3871 3872 auto MaxScalableVF = ElementCount::getScalable( 3873 std::numeric_limits<ElementCount::ScalarTy>::max()); 3874 3875 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 
3876 // FIXME: While for scalable vectors this is currently sufficient, this should 3877 // be replaced by a more detailed mechanism that filters out specific VFs, 3878 // instead of invalidating vectorization for a whole set of VFs based on the 3879 // MaxVF. 3880 3881 // Disable scalable vectorization if the loop contains unsupported reductions. 3882 if (!canVectorizeReductions(MaxScalableVF)) { 3883 reportVectorizationInfo( 3884 "Scalable vectorization not supported for the reduction " 3885 "operations found in this loop.", 3886 "ScalableVFUnfeasible", ORE, TheLoop); 3887 return false; 3888 } 3889 3890 // Disable scalable vectorization if the loop contains any instructions 3891 // with element types not supported for scalable vectors. 3892 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 3893 return !Ty->isVoidTy() && 3894 !this->TTI.isElementTypeLegalForScalableVector(Ty); 3895 })) { 3896 reportVectorizationInfo("Scalable vectorization is not supported " 3897 "for all element types found in this loop.", 3898 "ScalableVFUnfeasible", ORE, TheLoop); 3899 return false; 3900 } 3901 3902 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) { 3903 reportVectorizationInfo("The target does not provide maximum vscale value " 3904 "for safe distance analysis.", 3905 "ScalableVFUnfeasible", ORE, TheLoop); 3906 return false; 3907 } 3908 3909 IsScalableVectorizationAllowed = true; 3910 return true; 3911 } 3912 3913 ElementCount 3914 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 3915 if (!isScalableVectorizationAllowed()) 3916 return ElementCount::getScalable(0); 3917 3918 auto MaxScalableVF = ElementCount::getScalable( 3919 std::numeric_limits<ElementCount::ScalarTy>::max()); 3920 if (Legal->isSafeForAnyVectorWidth()) 3921 return MaxScalableVF; 3922 3923 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 3924 // Limit MaxScalableVF by the maximum safe dependence distance. 3925 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); 3926 3927 if (!MaxScalableVF) 3928 reportVectorizationInfo( 3929 "Max legal vector width too small, scalable vectorization " 3930 "unfeasible.", 3931 "ScalableVFUnfeasible", ORE, TheLoop); 3932 3933 return MaxScalableVF; 3934 } 3935 3936 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 3937 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { 3938 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 3939 unsigned SmallestType, WidestType; 3940 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 3941 3942 // Get the maximum safe dependence distance in bits computed by LAA. 3943 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 3944 // the memory accesses that is most restrictive (involved in the smallest 3945 // dependence distance). 3946 unsigned MaxSafeElements = 3947 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 3948 3949 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 3950 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 3951 if (!Legal->isSafeForAnyVectorWidth()) 3952 this->MaxSafeElements = MaxSafeElements; 3953 3954 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 3955 << ".\n"); 3956 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 3957 << ".\n"); 3958 3959 // First analyze the UserVF, fall back if the UserVF should be ignored. 
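// For instance (illustrative): '#pragma clang loop vectorize_width(8)' yields
// UserVF = 8; if the dependence-safe fixed VF is only 4, the fixed hint is
// clamped to 4 below, whereas an unsafe scalable hint is dropped so the
// compiler can pick a suitable factor itself.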
3960 if (UserVF) { 3961 auto MaxSafeUserVF = 3962 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 3963 3964 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 3965 // If `VF=vscale x N` is safe, then so is `VF=N` 3966 if (UserVF.isScalable()) 3967 return FixedScalableVFPair( 3968 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 3969 3970 return UserVF; 3971 } 3972 3973 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 3974 3975 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 3976 // is better to ignore the hint and let the compiler choose a suitable VF. 3977 if (!UserVF.isScalable()) { 3978 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3979 << " is unsafe, clamping to max safe VF=" 3980 << MaxSafeFixedVF << ".\n"); 3981 ORE->emit([&]() { 3982 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3983 TheLoop->getStartLoc(), 3984 TheLoop->getHeader()) 3985 << "User-specified vectorization factor " 3986 << ore::NV("UserVectorizationFactor", UserVF) 3987 << " is unsafe, clamping to maximum safe vectorization factor " 3988 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 3989 }); 3990 return MaxSafeFixedVF; 3991 } 3992 3993 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 3994 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3995 << " is ignored because scalable vectors are not " 3996 "available.\n"); 3997 ORE->emit([&]() { 3998 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3999 TheLoop->getStartLoc(), 4000 TheLoop->getHeader()) 4001 << "User-specified vectorization factor " 4002 << ore::NV("UserVectorizationFactor", UserVF) 4003 << " is ignored because the target does not support scalable " 4004 "vectors. The compiler will pick a more suitable value."; 4005 }); 4006 } else { 4007 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4008 << " is unsafe. Ignoring scalable UserVF.\n"); 4009 ORE->emit([&]() { 4010 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4011 TheLoop->getStartLoc(), 4012 TheLoop->getHeader()) 4013 << "User-specified vectorization factor " 4014 << ore::NV("UserVectorizationFactor", UserVF) 4015 << " is unsafe. Ignoring the hint to let the compiler pick a " 4016 "more suitable value."; 4017 }); 4018 } 4019 } 4020 4021 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4022 << " / " << WidestType << " bits.\n"); 4023 4024 FixedScalableVFPair Result(ElementCount::getFixed(1), 4025 ElementCount::getScalable(0)); 4026 if (auto MaxVF = 4027 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 4028 MaxSafeFixedVF, FoldTailByMasking)) 4029 Result.FixedVF = MaxVF; 4030 4031 if (auto MaxVF = 4032 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 4033 MaxSafeScalableVF, FoldTailByMasking)) 4034 if (MaxVF.isScalable()) { 4035 Result.ScalableVF = MaxVF; 4036 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 4037 << "\n"); 4038 } 4039 4040 return Result; 4041 } 4042 4043 FixedScalableVFPair 4044 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4045 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4046 // TODO: It may be useful to do since it's still likely to be dynamically 4047 // uniform if the target can skip. 4048 reportVectorizationFailure( 4049 "Not inserting runtime ptr check for divergent target", 4050 "runtime pointer checks needed. 
Not enabled for divergent target", 4051 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4052 return FixedScalableVFPair::getNone(); 4053 } 4054 4055 ScalarEvolution *SE = PSE.getSE(); 4056 unsigned TC = SE->getSmallConstantTripCount(TheLoop); 4057 unsigned MaxTC = PSE.getSmallConstantMaxTripCount(); 4058 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4059 if (TC != MaxTC) 4060 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n'); 4061 if (TC == 1) { 4062 reportVectorizationFailure("Single iteration (non) loop", 4063 "loop trip count is one, irrelevant for vectorization", 4064 "SingleIterationLoop", ORE, TheLoop); 4065 return FixedScalableVFPair::getNone(); 4066 } 4067 4068 // If BTC matches the widest induction type and is -1 then the trip count 4069 // computation will wrap to 0 and the vector trip count will be 0. Do not try 4070 // to vectorize. 4071 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop); 4072 if (!isa<SCEVCouldNotCompute>(BTC) && 4073 BTC->getType()->getScalarSizeInBits() >= 4074 Legal->getWidestInductionType()->getScalarSizeInBits() && 4075 SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC, 4076 SE->getMinusOne(BTC->getType()))) { 4077 reportVectorizationFailure( 4078 "Trip count computation wrapped", 4079 "backedge-taken count is -1, loop trip count wrapped to 0", 4080 "TripCountWrapped", ORE, TheLoop); 4081 return FixedScalableVFPair::getNone(); 4082 } 4083 4084 switch (ScalarEpilogueStatus) { 4085 case CM_ScalarEpilogueAllowed: 4086 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4087 case CM_ScalarEpilogueNotAllowedUsePredicate: 4088 [[fallthrough]]; 4089 case CM_ScalarEpilogueNotNeededUsePredicate: 4090 LLVM_DEBUG( 4091 dbgs() << "LV: vector predicate hint/switch found.\n" 4092 << "LV: Not allowing scalar epilogue, creating predicated " 4093 << "vector loop.\n"); 4094 break; 4095 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4096 // fallthrough as a special case of OptForSize 4097 case CM_ScalarEpilogueNotAllowedOptSize: 4098 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4099 LLVM_DEBUG( 4100 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4101 else 4102 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4103 << "count.\n"); 4104 4105 // Bail if runtime checks are required, which are not good when optimising 4106 // for size. 4107 if (runtimeChecksRequired()) 4108 return FixedScalableVFPair::getNone(); 4109 4110 break; 4111 } 4112 4113 // The only loops we can vectorize without a scalar epilogue, are loops with 4114 // a bottom-test and a single exiting block. We'd have to handle the fact 4115 // that not every instruction executes on the last iteration. This will 4116 // require a lane mask which varies through the vector loop body. (TODO) 4117 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 4118 // If there was a tail-folding hint/switch, but we can't fold the tail by 4119 // masking, fallback to a vectorization with a scalar epilogue. 4120 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4121 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4122 "scalar epilogue instead.\n"); 4123 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4124 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4125 } 4126 return FixedScalableVFPair::getNone(); 4127 } 4128 4129 // Now try the tail folding 4130 4131 // Invalidate interleave groups that require an epilogue if we can't mask 4132 // the interleave-group. 
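// For example (illustrative): a factor-3 load group with members only for
// A[3*i] and A[3*i + 1] leaves a gap, so widening it could speculatively read
// past the last element unless the final iterations stay scalar; when masked
// interleaved accesses are unavailable, such groups are invalidated below
// before the tail-folded maximum VF is computed.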
4133 if (!useMaskedInterleavedAccesses(TTI)) { 4134 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 4135 "No decisions should have been taken at this point"); 4136 // Note: There is no need to invalidate any cost modeling decisions here, as 4137 // none were taken so far. 4138 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4139 } 4140 4141 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true); 4142 4143 // Avoid tail folding if the trip count is known to be a multiple of any VF 4144 // we choose. 4145 std::optional<unsigned> MaxPowerOf2RuntimeVF = 4146 MaxFactors.FixedVF.getFixedValue(); 4147 if (MaxFactors.ScalableVF) { 4148 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 4149 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { 4150 MaxPowerOf2RuntimeVF = std::max<unsigned>( 4151 *MaxPowerOf2RuntimeVF, 4152 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); 4153 } else 4154 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. 4155 } 4156 4157 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { 4158 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && 4159 "MaxFixedVF must be a power of 2"); 4160 unsigned MaxVFtimesIC = 4161 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; 4162 ScalarEvolution *SE = PSE.getSE(); 4163 // Currently only loops with countable exits are vectorized, but calling 4164 // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with 4165 // uncountable exits whilst also ensuring the symbolic maximum and known 4166 // back-edge taken count remain identical for loops with countable exits. 4167 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount(); 4168 assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() && 4169 "Invalid loop count"); 4170 const SCEV *ExitCount = SE->getAddExpr( 4171 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 4172 const SCEV *Rem = SE->getURemExpr( 4173 SE->applyLoopGuards(ExitCount, TheLoop), 4174 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 4175 if (Rem->isZero()) { 4176 // Accept MaxFixedVF if we do not have a tail. 4177 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4178 return MaxFactors; 4179 } 4180 } 4181 4182 // If we don't know the precise trip count, or if the trip count that we 4183 // found modulo the vectorization factor is not zero, try to fold the tail 4184 // by masking. 4185 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4186 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC); 4187 if (foldTailByMasking()) { 4188 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { 4189 LLVM_DEBUG( 4190 dbgs() 4191 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " 4192 "try to generate VP Intrinsics with scalable vector " 4193 "factors only.\n"); 4194 // Tail folded loop using VP intrinsics restricts the VF to be scalable 4195 // for now. 4196 // TODO: extend it for fixed vectors, if required. 4197 assert(MaxFactors.ScalableVF.isScalable() && 4198 "Expected scalable vector factor."); 4199 4200 MaxFactors.FixedVF = ElementCount::getFixed(1); 4201 } 4202 return MaxFactors; 4203 } 4204 4205 // If there was a tail-folding hint/switch, but we can't fold the tail by 4206 // masking, fallback to a vectorization with a scalar epilogue. 
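  // For example (assuming the usual mapping of the clang pragma), a loop
  // carrying '#pragma clang loop vectorize_predicate(enable)' whose tail turns
  // out not to be foldable by masking is still vectorized, just with an
  // unpredicated body plus a scalar remainder loop.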
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return FixedScalableVFPair::getNone();
}

ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
    ElementCount MaxSafeVF, bool FoldTailByMasking) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
  if (MaxVectorElementCount.isScalable() &&
      TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
    auto Min = Attr.getVScaleRangeMin();
    WidestRegisterMinEC *= Min;
  }

  // When a scalar epilogue is required, at least one iteration of the scalar
  // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
  // max VF that results in a dead vector loop.
  if (MaxTripCount > 0 && requiresScalarEpilogue(true))
    MaxTripCount -= 1;

  if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
      (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
    // If upper bound loop trip count (TC) is known at compile time there is no
    // point in choosing VF greater than TC (as done in the loop below).
Select 4284 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is 4285 // scalable, we only fall back on a fixed VF when the TC is less than or 4286 // equal to the known number of lanes. 4287 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); 4288 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 4289 "exceeding the constant trip count: " 4290 << ClampedUpperTripCount << "\n"); 4291 return ElementCount::get( 4292 ClampedUpperTripCount, 4293 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); 4294 } 4295 4296 TargetTransformInfo::RegisterKind RegKind = 4297 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4298 : TargetTransformInfo::RGK_FixedWidthVector; 4299 ElementCount MaxVF = MaxVectorElementCount; 4300 if (MaximizeBandwidth || 4301 (MaximizeBandwidth.getNumOccurrences() == 0 && 4302 (TTI.shouldMaximizeVectorBandwidth(RegKind) || 4303 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { 4304 auto MaxVectorElementCountMaxBW = ElementCount::get( 4305 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), 4306 ComputeScalableMaxVF); 4307 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 4308 4309 // Collect all viable vectorization factors larger than the default MaxVF 4310 // (i.e. MaxVectorElementCount). 4311 SmallVector<ElementCount, 8> VFs; 4312 for (ElementCount VS = MaxVectorElementCount * 2; 4313 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 4314 VFs.push_back(VS); 4315 4316 // For each VF calculate its register usage. 4317 auto RUs = calculateRegisterUsage(VFs); 4318 4319 // Select the largest VF which doesn't require more registers than existing 4320 // ones. 4321 for (int I = RUs.size() - 1; I >= 0; --I) { 4322 const auto &MLU = RUs[I].MaxLocalUsers; 4323 if (all_of(MLU, [&](decltype(MLU.front()) &LU) { 4324 return LU.second <= TTI.getNumberOfRegisters(LU.first); 4325 })) { 4326 MaxVF = VFs[I]; 4327 break; 4328 } 4329 } 4330 if (ElementCount MinVF = 4331 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 4332 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 4333 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 4334 << ") with target's minimum: " << MinVF << '\n'); 4335 MaxVF = MinVF; 4336 } 4337 } 4338 4339 // Invalidate any widening decisions we might have made, in case the loop 4340 // requires prediction (decided later), but we have already made some 4341 // load/store widening decisions. 4342 invalidateCostModelingDecisions(); 4343 } 4344 return MaxVF; 4345 } 4346 4347 /// Convenience function that returns the value of vscale_range iff 4348 /// vscale_range.min == vscale_range.max or otherwise returns the value 4349 /// returned by the corresponding TTI method. 4350 static std::optional<unsigned> 4351 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 4352 const Function *Fn = L->getHeader()->getParent(); 4353 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 4354 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 4355 auto Min = Attr.getVScaleRangeMin(); 4356 auto Max = Attr.getVScaleRangeMax(); 4357 if (Max && Min == Max) 4358 return Max; 4359 } 4360 4361 return TTI.getVScaleForTuning(); 4362 } 4363 4364 /// This function attempts to return a value that represents the vectorization 4365 /// factor at runtime. For fixed-width VFs we know this precisely at compile 4366 /// time, but for scalable VFs we calculate it based on an estimate of the 4367 /// vscale value. 
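/// For example (illustrative values), a fixed VF of 8 is returned as 8, while
/// VF = vscale x 4 with an estimated vscale of 2 is returned as 8 as well.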
4368 static unsigned getEstimatedRuntimeVF(const Loop *L, 4369 const TargetTransformInfo &TTI, 4370 ElementCount VF) { 4371 unsigned EstimatedVF = VF.getKnownMinValue(); 4372 if (VF.isScalable()) 4373 if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI)) 4374 EstimatedVF *= *VScale; 4375 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); 4376 return EstimatedVF; 4377 } 4378 4379 bool LoopVectorizationPlanner::isMoreProfitable( 4380 const VectorizationFactor &A, const VectorizationFactor &B, 4381 const unsigned MaxTripCount) const { 4382 InstructionCost CostA = A.Cost; 4383 InstructionCost CostB = B.Cost; 4384 4385 // Improve estimate for the vector width if it is scalable. 4386 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 4387 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 4388 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 4389 if (A.Width.isScalable()) 4390 EstimatedWidthA *= *VScale; 4391 if (B.Width.isScalable()) 4392 EstimatedWidthB *= *VScale; 4393 } 4394 4395 // Assume vscale may be larger than 1 (or the value being tuned for), 4396 // so that scalable vectorization is slightly favorable over fixed-width 4397 // vectorization. 4398 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && 4399 A.Width.isScalable() && !B.Width.isScalable(); 4400 4401 auto CmpFn = [PreferScalable](const InstructionCost &LHS, 4402 const InstructionCost &RHS) { 4403 return PreferScalable ? LHS <= RHS : LHS < RHS; 4404 }; 4405 4406 // To avoid the need for FP division: 4407 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB) 4408 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA) 4409 if (!MaxTripCount) 4410 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); 4411 4412 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 4413 InstructionCost VectorCost, 4414 InstructionCost ScalarCost) { 4415 // If the trip count is a known (possibly small) constant, the trip count 4416 // will be rounded up to an integer number of iterations under 4417 // FoldTailByMasking. The total cost in that case will be 4418 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 4419 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 4420 // some extra overheads, but for the purpose of comparing the costs of 4421 // different VFs we can use this to compare the total loop-body cost 4422 // expected after vectorization. 
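    // For example (illustrative values): with MaxTripCount = 10, VF = 4,
    // VectorCost = 8 and ScalarCost = 3, folding the tail estimates
    // 8 * ceil(10 / 4) = 24, while not folding estimates
    // 8 * (10 / 4) + 3 * (10 % 4) = 16 + 6 = 22.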
4423 if (CM.foldTailByMasking()) 4424 return VectorCost * divideCeil(MaxTripCount, VF); 4425 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF); 4426 }; 4427 4428 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); 4429 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost); 4430 return CmpFn(RTCostA, RTCostB); 4431 } 4432 4433 bool LoopVectorizationPlanner::isMoreProfitable( 4434 const VectorizationFactor &A, const VectorizationFactor &B) const { 4435 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); 4436 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount); 4437 } 4438 4439 void LoopVectorizationPlanner::emitInvalidCostRemarks( 4440 OptimizationRemarkEmitter *ORE) { 4441 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>; 4442 SmallVector<RecipeVFPair> InvalidCosts; 4443 for (const auto &Plan : VPlans) { 4444 for (ElementCount VF : Plan->vectorFactors()) { 4445 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), 4446 CM); 4447 precomputeCosts(*Plan, VF, CostCtx); 4448 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); 4449 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4450 for (auto &R : *VPBB) { 4451 if (!R.cost(VF, CostCtx).isValid()) 4452 InvalidCosts.emplace_back(&R, VF); 4453 } 4454 } 4455 } 4456 } 4457 if (InvalidCosts.empty()) 4458 return; 4459 4460 // Emit a report of VFs with invalid costs in the loop. 4461 4462 // Group the remarks per recipe, keeping the recipe order from InvalidCosts. 4463 DenseMap<VPRecipeBase *, unsigned> Numbering; 4464 unsigned I = 0; 4465 for (auto &Pair : InvalidCosts) 4466 if (!Numbering.count(Pair.first)) 4467 Numbering[Pair.first] = I++; 4468 4469 // Sort the list, first on recipe(number) then on VF. 4470 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) { 4471 if (Numbering[A.first] != Numbering[B.first]) 4472 return Numbering[A.first] < Numbering[B.first]; 4473 const auto &LHS = A.second; 4474 const auto &RHS = B.second; 4475 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 4476 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 4477 }); 4478 4479 // For a list of ordered recipe-VF pairs: 4480 // [(load, VF1), (load, VF2), (store, VF1)] 4481 // group the recipes together to emit separate remarks for: 4482 // load (VF1, VF2) 4483 // store (VF1) 4484 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts); 4485 auto Subset = ArrayRef<RecipeVFPair>(); 4486 do { 4487 if (Subset.empty()) 4488 Subset = Tail.take_front(1); 4489 4490 VPRecipeBase *R = Subset.front().first; 4491 4492 unsigned Opcode = 4493 TypeSwitch<const VPRecipeBase *, unsigned>(R) 4494 .Case<VPHeaderPHIRecipe>( 4495 [](const auto *R) { return Instruction::PHI; }) 4496 .Case<VPWidenSelectRecipe>( 4497 [](const auto *R) { return Instruction::Select; }) 4498 .Case<VPWidenStoreRecipe>( 4499 [](const auto *R) { return Instruction::Store; }) 4500 .Case<VPWidenLoadRecipe>( 4501 [](const auto *R) { return Instruction::Load; }) 4502 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>( 4503 [](const auto *R) { return Instruction::Call; }) 4504 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe, 4505 VPWidenCastRecipe>( 4506 [](const auto *R) { return R->getOpcode(); }) 4507 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) { 4508 return R->getStoredValues().empty() ? 
Instruction::Load 4509 : Instruction::Store; 4510 }); 4511 4512 // If the next recipe is different, or if there are no other pairs, 4513 // emit a remark for the collated subset. e.g. 4514 // [(load, VF1), (load, VF2))] 4515 // to emit: 4516 // remark: invalid costs for 'load' at VF=(VF1, VF2) 4517 if (Subset == Tail || Tail[Subset.size()].first != R) { 4518 std::string OutString; 4519 raw_string_ostream OS(OutString); 4520 assert(!Subset.empty() && "Unexpected empty range"); 4521 OS << "Recipe with invalid costs prevented vectorization at VF=("; 4522 for (const auto &Pair : Subset) 4523 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; 4524 OS << "):"; 4525 if (Opcode == Instruction::Call) { 4526 StringRef Name = ""; 4527 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) { 4528 Name = Int->getIntrinsicName(); 4529 } else { 4530 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R); 4531 Function *CalledFn = 4532 WidenCall ? WidenCall->getCalledScalarFunction() 4533 : cast<Function>(R->getOperand(R->getNumOperands() - 1) 4534 ->getLiveInIRValue()); 4535 Name = CalledFn->getName(); 4536 } 4537 OS << " call to " << Name; 4538 } else 4539 OS << " " << Instruction::getOpcodeName(Opcode); 4540 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr, 4541 R->getDebugLoc()); 4542 Tail = Tail.drop_front(Subset.size()); 4543 Subset = {}; 4544 } else 4545 // Grow the subset by one element 4546 Subset = Tail.take_front(Subset.size() + 1); 4547 } while (!Tail.empty()); 4548 } 4549 4550 /// Check if any recipe of \p Plan will generate a vector value, which will be 4551 /// assigned a vector register. 4552 static bool willGenerateVectors(VPlan &Plan, ElementCount VF, 4553 const TargetTransformInfo &TTI) { 4554 assert(VF.isVector() && "Checking a scalar VF?"); 4555 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); 4556 DenseSet<VPRecipeBase *> EphemeralRecipes; 4557 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes); 4558 // Set of already visited types. 4559 DenseSet<Type *> Visited; 4560 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( 4561 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { 4562 for (VPRecipeBase &R : *VPBB) { 4563 if (EphemeralRecipes.contains(&R)) 4564 continue; 4565 // Continue early if the recipe is considered to not produce a vector 4566 // result. Note that this includes VPInstruction where some opcodes may 4567 // produce a vector, to preserve existing behavior as VPInstructions model 4568 // aspects not directly mapped to existing IR instructions. 
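      // For example, VPScalarIVStepsSC and VPReplicateSC only produce
      // per-lane scalar values and are skipped below, while widened loads,
      // stores and arithmetic fall through to the type-based check.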
4569 switch (R.getVPDefID()) { 4570 case VPDef::VPDerivedIVSC: 4571 case VPDef::VPScalarIVStepsSC: 4572 case VPDef::VPScalarCastSC: 4573 case VPDef::VPReplicateSC: 4574 case VPDef::VPInstructionSC: 4575 case VPDef::VPCanonicalIVPHISC: 4576 case VPDef::VPVectorPointerSC: 4577 case VPDef::VPReverseVectorPointerSC: 4578 case VPDef::VPExpandSCEVSC: 4579 case VPDef::VPEVLBasedIVPHISC: 4580 case VPDef::VPPredInstPHISC: 4581 case VPDef::VPBranchOnMaskSC: 4582 continue; 4583 case VPDef::VPReductionSC: 4584 case VPDef::VPActiveLaneMaskPHISC: 4585 case VPDef::VPWidenCallSC: 4586 case VPDef::VPWidenCanonicalIVSC: 4587 case VPDef::VPWidenCastSC: 4588 case VPDef::VPWidenGEPSC: 4589 case VPDef::VPWidenIntrinsicSC: 4590 case VPDef::VPWidenSC: 4591 case VPDef::VPWidenSelectSC: 4592 case VPDef::VPBlendSC: 4593 case VPDef::VPFirstOrderRecurrencePHISC: 4594 case VPDef::VPWidenPHISC: 4595 case VPDef::VPWidenIntOrFpInductionSC: 4596 case VPDef::VPWidenPointerInductionSC: 4597 case VPDef::VPReductionPHISC: 4598 case VPDef::VPInterleaveSC: 4599 case VPDef::VPWidenLoadEVLSC: 4600 case VPDef::VPWidenLoadSC: 4601 case VPDef::VPWidenStoreEVLSC: 4602 case VPDef::VPWidenStoreSC: 4603 break; 4604 default: 4605 llvm_unreachable("unhandled recipe"); 4606 } 4607 4608 auto WillWiden = [&TTI, VF](Type *ScalarTy) { 4609 Type *VectorTy = toVectorTy(ScalarTy, VF); 4610 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); 4611 if (!NumLegalParts) 4612 return false; 4613 if (VF.isScalable()) { 4614 // <vscale x 1 x iN> is assumed to be profitable over iN because 4615 // scalable registers are a distinct register class from scalar 4616 // ones. If we ever find a target which wants to lower scalable 4617 // vectors back to scalars, we'll need to update this code to 4618 // explicitly ask TTI about the register class uses for each part. 4619 return NumLegalParts <= VF.getKnownMinValue(); 4620 } 4621 // Two or more parts that share a register - are vectorized. 4622 return NumLegalParts < VF.getKnownMinValue(); 4623 }; 4624 4625 // If no def nor is a store, e.g., branches, continue - no value to check. 4626 if (R.getNumDefinedValues() == 0 && 4627 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>( 4628 &R)) 4629 continue; 4630 // For multi-def recipes, currently only interleaved loads, suffice to 4631 // check first def only. 4632 // For stores check their stored value; for interleaved stores suffice 4633 // the check first stored value only. In all cases this is the second 4634 // operand. 4635 VPValue *ToCheck = 4636 R.getNumDefinedValues() >= 1 ? 
R.getVPValue(0) : R.getOperand(1); 4637 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck); 4638 if (!Visited.insert({ScalarTy}).second) 4639 continue; 4640 if (WillWiden(ScalarTy)) 4641 return true; 4642 } 4643 } 4644 4645 return false; 4646 } 4647 4648 #ifndef NDEBUG 4649 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { 4650 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); 4651 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 4652 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 4653 assert(any_of(VPlans, 4654 [](std::unique_ptr<VPlan> &P) { 4655 return P->hasVF(ElementCount::getFixed(1)); 4656 }) && 4657 "Expected Scalar VF to be a candidate"); 4658 4659 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 4660 ExpectedCost); 4661 VectorizationFactor ChosenFactor = ScalarCost; 4662 4663 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 4664 if (ForceVectorization && 4665 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) { 4666 // Ignore scalar width, because the user explicitly wants vectorization. 4667 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 4668 // evaluation. 4669 ChosenFactor.Cost = InstructionCost::getMax(); 4670 } 4671 4672 for (auto &P : VPlans) { 4673 for (ElementCount VF : P->vectorFactors()) { 4674 // The cost for scalar VF=1 is already calculated, so ignore it. 4675 if (VF.isScalar()) 4676 continue; 4677 4678 InstructionCost C = CM.expectedCost(VF); 4679 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); 4680 4681 unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width); 4682 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF 4683 << " costs: " << (Candidate.Cost / Width)); 4684 if (VF.isScalable()) 4685 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 4686 << getVScaleForTuning(OrigLoop, TTI).value_or(1) 4687 << ")"); 4688 LLVM_DEBUG(dbgs() << ".\n"); 4689 4690 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { 4691 LLVM_DEBUG( 4692 dbgs() 4693 << "LV: Not considering vector loop of width " << VF 4694 << " because it will not generate any vector instructions.\n"); 4695 continue; 4696 } 4697 4698 if (isMoreProfitable(Candidate, ChosenFactor)) 4699 ChosenFactor = Candidate; 4700 } 4701 } 4702 4703 if (!EnableCondStoresVectorization && CM.hasPredStores()) { 4704 reportVectorizationFailure( 4705 "There are conditional stores.", 4706 "store that is conditionally executed prevents vectorization", 4707 "ConditionalStore", ORE, OrigLoop); 4708 ChosenFactor = ScalarCost; 4709 } 4710 4711 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 4712 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() 4713 << "LV: Vectorization seems to be not beneficial, " 4714 << "but was forced by a user.\n"); 4715 return ChosenFactor; 4716 } 4717 #endif 4718 4719 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( 4720 ElementCount VF) const { 4721 // Cross iteration phis such as reductions need special handling and are 4722 // currently unsupported. 4723 if (any_of(OrigLoop->getHeader()->phis(), 4724 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) 4725 return false; 4726 4727 // Phis with uses outside of the loop require special handling and are 4728 // currently unsupported. 4729 for (const auto &Entry : Legal->getInductionVars()) { 4730 // Look for uses of the value of the induction at the last iteration. 
    Value *PostInc =
        Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
    for (User *U : PostInc->users())
      if (!OrigLoop->contains(cast<Instruction>(U)))
        return false;
    // Look for uses of penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!OrigLoop->contains(cast<Instruction>(U)))
        return false;
  }

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  // TODO: Add support for loops with an early exit.
  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF, const unsigned IC) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.

  // Allow the target to opt out entirely.
  if (!TTI.preferEpilogueVectorization())
    return false;

  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF) <= 1)
    return false;

  // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
  // VFs when deciding profitability.
  // See related "TODO: extend to support scalable VFs." in
  // selectEpilogueVectorizationFactor.
  unsigned Multiplier = VF.isFixed() ? IC : 1;
  unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
                                ? EpilogueVectorizationMinVF
                                : TTI.getEpilogueVectorizationMinVF();
  return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
}

VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, unsigned IC) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
    return Result;
  }

  if (!CM.isScalarEpilogueAllowed()) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
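  // (Unsupported cases currently include cross-iteration phis such as
  // fixed-order recurrences, induction values with users outside the loop,
  // and loops whose exiting block is not the latch; see
  // isCandidateForEpilogueVectorization.)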
4795 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 4796 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 4797 "is not a supported candidate.\n"); 4798 return Result; 4799 } 4800 4801 if (EpilogueVectorizationForceVF > 1) { 4802 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 4803 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 4804 if (hasPlanWithVF(ForcedEC)) 4805 return {ForcedEC, 0, 0}; 4806 4807 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 4808 "viable.\n"); 4809 return Result; 4810 } 4811 4812 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 4813 OrigLoop->getHeader()->getParent()->hasMinSize()) { 4814 LLVM_DEBUG( 4815 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 4816 return Result; 4817 } 4818 4819 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) { 4820 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 4821 "this loop\n"); 4822 return Result; 4823 } 4824 4825 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 4826 // the main loop handles 8 lanes per iteration. We could still benefit from 4827 // vectorizing the epilogue loop with VF=4. 4828 ElementCount EstimatedRuntimeVF = 4829 ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF)); 4830 4831 ScalarEvolution &SE = *PSE.getSE(); 4832 Type *TCType = Legal->getWidestInductionType(); 4833 const SCEV *RemainingIterations = nullptr; 4834 unsigned MaxTripCount = 0; 4835 for (auto &NextVF : ProfitableVFs) { 4836 // Skip candidate VFs without a corresponding VPlan. 4837 if (!hasPlanWithVF(NextVF.Width)) 4838 continue; 4839 4840 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable 4841 // vectors) or > the VF of the main loop (fixed vectors). 4842 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 4843 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 4844 (NextVF.Width.isScalable() && 4845 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) || 4846 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && 4847 ElementCount::isKnownGT(NextVF.Width, MainLoopVF))) 4848 continue; 4849 4850 // If NextVF is greater than the number of remaining iterations, the 4851 // epilogue loop would be dead. Skip such factors. 4852 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 4853 // TODO: extend to support scalable VFs. 
4854 if (!RemainingIterations) { 4855 const SCEV *TC = vputils::getSCEVExprForVPValue( 4856 getPlanFor(NextVF.Width).getTripCount(), SE); 4857 assert(!isa<SCEVCouldNotCompute>(TC) && 4858 "Trip count SCEV must be computable"); 4859 RemainingIterations = SE.getURemExpr( 4860 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 4861 MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1; 4862 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, 4863 SE.getConstant(TCType, MaxTripCount))) { 4864 MaxTripCount = 4865 SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); 4866 } 4867 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " 4868 << MaxTripCount << "\n"); 4869 } 4870 if (SE.isKnownPredicate( 4871 CmpInst::ICMP_UGT, 4872 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 4873 RemainingIterations)) 4874 continue; 4875 } 4876 4877 if (Result.Width.isScalar() || 4878 isMoreProfitable(NextVF, Result, MaxTripCount)) 4879 Result = NextVF; 4880 } 4881 4882 if (Result != VectorizationFactor::Disabled()) 4883 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 4884 << Result.Width << "\n"); 4885 return Result; 4886 } 4887 4888 std::pair<unsigned, unsigned> 4889 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 4890 unsigned MinWidth = -1U; 4891 unsigned MaxWidth = 8; 4892 const DataLayout &DL = TheFunction->getDataLayout(); 4893 // For in-loop reductions, no element types are added to ElementTypesInLoop 4894 // if there are no loads/stores in the loop. In this case, check through the 4895 // reduction variables to determine the maximum width. 4896 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 4897 // Reset MaxWidth so that we can find the smallest type used by recurrences 4898 // in the loop. 4899 MaxWidth = -1U; 4900 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 4901 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 4902 // When finding the min width used by the recurrence we need to account 4903 // for casts on the input operands of the recurrence. 4904 MaxWidth = std::min<unsigned>( 4905 MaxWidth, std::min<unsigned>( 4906 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 4907 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 4908 } 4909 } else { 4910 for (Type *T : ElementTypesInLoop) { 4911 MinWidth = std::min<unsigned>( 4912 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 4913 MaxWidth = std::max<unsigned>( 4914 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 4915 } 4916 } 4917 return {MinWidth, MaxWidth}; 4918 } 4919 4920 void LoopVectorizationCostModel::collectElementTypesForWidening() { 4921 ElementTypesInLoop.clear(); 4922 // For each block. 4923 for (BasicBlock *BB : TheLoop->blocks()) { 4924 // For each instruction in the loop. 4925 for (Instruction &I : BB->instructionsWithoutDebug()) { 4926 Type *T = I.getType(); 4927 4928 // Skip ignored values. 4929 if (ValuesToIgnore.count(&I)) 4930 continue; 4931 4932 // Only examine Loads, Stores and PHINodes. 4933 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 4934 continue; 4935 4936 // Examine PHI nodes that are reduction variables. Update the type to 4937 // account for the recurrence type. 
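      // For example, in-loop and ordered reductions are skipped here since
      // their reduction phi is not widened, while other reductions contribute
      // RdxDesc.getRecurrenceType(), which may be narrower than the phi type
      // when the recurrence is known to fit in fewer bits.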
4938 if (auto *PN = dyn_cast<PHINode>(&I)) { 4939 if (!Legal->isReductionVariable(PN)) 4940 continue; 4941 const RecurrenceDescriptor &RdxDesc = 4942 Legal->getReductionVars().find(PN)->second; 4943 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 4944 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 4945 RdxDesc.getRecurrenceType(), 4946 TargetTransformInfo::ReductionFlags())) 4947 continue; 4948 T = RdxDesc.getRecurrenceType(); 4949 } 4950 4951 // Examine the stored values. 4952 if (auto *ST = dyn_cast<StoreInst>(&I)) 4953 T = ST->getValueOperand()->getType(); 4954 4955 assert(T->isSized() && 4956 "Expected the load/store/recurrence type to be sized"); 4957 4958 ElementTypesInLoop.insert(T); 4959 } 4960 } 4961 } 4962 4963 unsigned 4964 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 4965 InstructionCost LoopCost) { 4966 // -- The interleave heuristics -- 4967 // We interleave the loop in order to expose ILP and reduce the loop overhead. 4968 // There are many micro-architectural considerations that we can't predict 4969 // at this level. For example, frontend pressure (on decode or fetch) due to 4970 // code size, or the number and capabilities of the execution ports. 4971 // 4972 // We use the following heuristics to select the interleave count: 4973 // 1. If the code has reductions, then we interleave to break the cross 4974 // iteration dependency. 4975 // 2. If the loop is really small, then we interleave to reduce the loop 4976 // overhead. 4977 // 3. We don't interleave if we think that we will spill registers to memory 4978 // due to the increased register pressure. 4979 4980 if (!isScalarEpilogueAllowed()) 4981 return 1; 4982 4983 // Do not interleave if EVL is preferred and no User IC is specified. 4984 if (foldTailWithEVL()) { 4985 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " 4986 "Unroll factor forced to be 1.\n"); 4987 return 1; 4988 } 4989 4990 // We used the distance for the interleave count. 4991 if (!Legal->isSafeForAnyVectorWidth()) 4992 return 1; 4993 4994 // We don't attempt to perform interleaving for loops with uncountable early 4995 // exits because the VPInstruction::AnyOf code cannot currently handle 4996 // multiple parts. 4997 if (Legal->hasUncountableEarlyExit()) 4998 return 1; 4999 5000 auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); 5001 const bool HasReductions = !Legal->getReductionVars().empty(); 5002 5003 // If we did not calculate the cost for VF (because the user selected the VF) 5004 // then we calculate the cost of VF here. 5005 if (LoopCost == 0) { 5006 LoopCost = expectedCost(VF); 5007 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 5008 5009 // Loop body is free and there is no need for interleaving. 5010 if (LoopCost == 0) 5011 return 1; 5012 } 5013 5014 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5015 // We divide by these constants so assume that we have at least one 5016 // instruction that uses at least one register. 5017 for (auto &Pair : R.MaxLocalUsers) { 5018 Pair.second = std::max(Pair.second, 1U); 5019 } 5020 5021 // We calculate the interleave count using the following formula. 5022 // Subtract the number of loop invariants from the number of available 5023 // registers. These registers are used by all of the interleaved instances. 5024 // Next, divide the remaining registers by the number of registers that is 5025 // required by the loop, in order to estimate how many parallel instances 5026 // fit without causing spills. 
All of this is rounded down if necessary to be 5027 // a power of two. We want power of two interleave count to simplify any 5028 // addressing operations or alignment considerations. 5029 // We also want power of two interleave counts to ensure that the induction 5030 // variable of the vector loop wraps to zero, when tail is folded by masking; 5031 // this currently happens when OptForSize, in which case IC is set to 1 above. 5032 unsigned IC = UINT_MAX; 5033 5034 for (const auto &Pair : R.MaxLocalUsers) { 5035 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first); 5036 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5037 << " registers of " 5038 << TTI.getRegisterClassName(Pair.first) 5039 << " register class\n"); 5040 if (VF.isScalar()) { 5041 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5042 TargetNumRegisters = ForceTargetNumScalarRegs; 5043 } else { 5044 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5045 TargetNumRegisters = ForceTargetNumVectorRegs; 5046 } 5047 unsigned MaxLocalUsers = Pair.second; 5048 unsigned LoopInvariantRegs = 0; 5049 if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end()) 5050 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first]; 5051 5052 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 5053 MaxLocalUsers); 5054 // Don't count the induction variable as interleaved. 5055 if (EnableIndVarRegisterHeur) { 5056 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5057 std::max(1U, (MaxLocalUsers - 1))); 5058 } 5059 5060 IC = std::min(IC, TmpIC); 5061 } 5062 5063 // Clamp the interleave ranges to reasonable counts. 5064 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5065 5066 // Check if the user has overridden the max. 5067 if (VF.isScalar()) { 5068 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5069 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5070 } else { 5071 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5072 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5073 } 5074 5075 unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF); 5076 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5077 if (KnownTC > 0) { 5078 // At least one iteration must be scalar when this constraint holds. So the 5079 // maximum available iterations for interleaving is one less. 5080 unsigned AvailableTC = 5081 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC; 5082 5083 // If trip count is known we select between two prospective ICs, where 5084 // 1) the aggressive IC is capped by the trip count divided by VF 5085 // 2) the conservative IC is capped by the trip count divided by (VF * 2) 5086 // The final IC is selected in a way that the epilogue loop trip count is 5087 // minimized while maximizing the IC itself, so that we either run the 5088 // vector loop at least once if it generates a small epilogue loop, or else 5089 // we run the vector loop at least twice. 
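    // For example (illustrative values), with AvailableTC = 64, EstimatedVF =
    // 8 and a target maximum of 8: the aggressive bound is
    // bit_floor(min(64 / 8, 8)) = 8 and the conservative bound is
    // bit_floor(min(64 / 16, 8)) = 4; both leave no scalar tail, so 8 is kept.
    // With AvailableTC = 24 the bounds are 2 and 1 with tails of 8 and 0
    // iterations respectively, so the conservative value 1 is used.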
5090 5091 unsigned InterleaveCountUB = bit_floor( 5092 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount))); 5093 unsigned InterleaveCountLB = bit_floor(std::max( 5094 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); 5095 MaxInterleaveCount = InterleaveCountLB; 5096 5097 if (InterleaveCountUB != InterleaveCountLB) { 5098 unsigned TailTripCountUB = 5099 (AvailableTC % (EstimatedVF * InterleaveCountUB)); 5100 unsigned TailTripCountLB = 5101 (AvailableTC % (EstimatedVF * InterleaveCountLB)); 5102 // If both produce same scalar tail, maximize the IC to do the same work 5103 // in fewer vector loop iterations 5104 if (TailTripCountUB == TailTripCountLB) 5105 MaxInterleaveCount = InterleaveCountUB; 5106 } 5107 } else if (BestKnownTC && *BestKnownTC > 0) { 5108 // At least one iteration must be scalar when this constraint holds. So the 5109 // maximum available iterations for interleaving is one less. 5110 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) 5111 ? (*BestKnownTC) - 1 5112 : *BestKnownTC; 5113 5114 // If trip count is an estimated compile time constant, limit the 5115 // IC to be capped by the trip count divided by VF * 2, such that the vector 5116 // loop runs at least twice to make interleaving seem profitable when there 5117 // is an epilogue loop present. Since exact Trip count is not known we 5118 // choose to be conservative in our IC estimate. 5119 MaxInterleaveCount = bit_floor(std::max( 5120 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); 5121 } 5122 5123 assert(MaxInterleaveCount > 0 && 5124 "Maximum interleave count must be greater than 0"); 5125 5126 // Clamp the calculated IC to be between the 1 and the max interleave count 5127 // that the target and trip count allows. 5128 if (IC > MaxInterleaveCount) 5129 IC = MaxInterleaveCount; 5130 else 5131 // Make sure IC is greater than 0. 5132 IC = std::max(1u, IC); 5133 5134 assert(IC > 0 && "Interleave count must be greater than 0."); 5135 5136 // Interleave if we vectorized this loop and there is a reduction that could 5137 // benefit from interleaving. 5138 if (VF.isVector() && HasReductions) { 5139 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5140 return IC; 5141 } 5142 5143 // For any scalar loop that either requires runtime checks or predication we 5144 // are better off leaving this to the unroller. Note that if we've already 5145 // vectorized the loop we will have done the runtime check and so interleaving 5146 // won't require further checks. 5147 bool ScalarInterleavingRequiresPredication = 5148 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5149 return Legal->blockNeedsPredication(BB); 5150 })); 5151 bool ScalarInterleavingRequiresRuntimePointerCheck = 5152 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5153 5154 // We want to interleave small loops in order to reduce the loop overhead and 5155 // potentially expose ILP opportunities. 
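  // For example, a body costing roughly one quarter of SmallLoopCost allows
  // SmallIC = min(IC, bit_floor(SmallLoopCost / LoopCost)) = min(IC, 4) in the
  // small-loop path below.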
5156 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5157 << "LV: IC is " << IC << '\n' 5158 << "LV: VF is " << VF << '\n'); 5159 const bool AggressivelyInterleaveReductions = 5160 TTI.enableAggressiveInterleaving(HasReductions); 5161 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5162 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5163 // We assume that the cost overhead is 1 and we use the cost model 5164 // to estimate the cost of the loop and interleave until the cost of the 5165 // loop overhead is about 5% of the cost of the loop. 5166 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5167 SmallLoopCost / *LoopCost.getValue())); 5168 5169 // Interleave until store/load ports (estimated by max interleave count) are 5170 // saturated. 5171 unsigned NumStores = Legal->getNumStores(); 5172 unsigned NumLoads = Legal->getNumLoads(); 5173 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5174 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5175 5176 // There is little point in interleaving for reductions containing selects 5177 // and compares when VF=1 since it may just create more overhead than it's 5178 // worth for loops with small trip counts. This is because we still have to 5179 // do the final reduction after the loop. 5180 bool HasSelectCmpReductions = 5181 HasReductions && 5182 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5183 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5184 RecurKind RK = RdxDesc.getRecurrenceKind(); 5185 return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || 5186 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK); 5187 }); 5188 if (HasSelectCmpReductions) { 5189 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5190 return 1; 5191 } 5192 5193 // If we have a scalar reduction (vector reductions are already dealt with 5194 // by this point), we can increase the critical path length if the loop 5195 // we're interleaving is inside another loop. For tree-wise reductions 5196 // set the limit to 2, and for ordered reductions it's best to disable 5197 // interleaving entirely. 5198 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5199 bool HasOrderedReductions = 5200 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5201 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5202 return RdxDesc.isOrdered(); 5203 }); 5204 if (HasOrderedReductions) { 5205 LLVM_DEBUG( 5206 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5207 return 1; 5208 } 5209 5210 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5211 SmallIC = std::min(SmallIC, F); 5212 StoresIC = std::min(StoresIC, F); 5213 LoadsIC = std::min(LoadsIC, F); 5214 } 5215 5216 if (EnableLoadStoreRuntimeInterleave && 5217 std::max(StoresIC, LoadsIC) > SmallIC) { 5218 LLVM_DEBUG( 5219 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5220 return std::max(StoresIC, LoadsIC); 5221 } 5222 5223 // If there are scalar reductions and TTI has enabled aggressive 5224 // interleaving for reductions, we will interleave to expose ILP. 5225 if (VF.isScalar() && AggressivelyInterleaveReductions) { 5226 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5227 // Interleave no less than SmallIC but not as aggressive as the normal IC 5228 // to satisfy the rare situation when resources are too limited. 
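      // For example, IC = 8 and SmallIC = 2 yield max(8 / 2, 2) = 4 here.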
5229 return std::max(IC / 2, SmallIC); 5230 } 5231 5232 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5233 return SmallIC; 5234 } 5235 5236 // Interleave if this is a large loop (small loops are already dealt with by 5237 // this point) that could benefit from interleaving. 5238 if (AggressivelyInterleaveReductions) { 5239 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5240 return IC; 5241 } 5242 5243 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5244 return 1; 5245 } 5246 5247 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5248 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5249 // This function calculates the register usage by measuring the highest number 5250 // of values that are alive at a single location. Obviously, this is a very 5251 // rough estimation. We scan the loop in a topological order in order and 5252 // assign a number to each instruction. We use RPO to ensure that defs are 5253 // met before their users. We assume that each instruction that has in-loop 5254 // users starts an interval. We record every time that an in-loop value is 5255 // used, so we have a list of the first and last occurrences of each 5256 // instruction. Next, we transpose this data structure into a multi map that 5257 // holds the list of intervals that *end* at a specific location. This multi 5258 // map allows us to perform a linear search. We scan the instructions linearly 5259 // and record each time that a new interval starts, by placing it in a set. 5260 // If we find this value in the multi-map then we remove it from the set. 5261 // The max register usage is the maximum size of the set. 5262 // We also search for instructions that are defined outside the loop, but are 5263 // used inside the loop. We need this number separately from the max-interval 5264 // usage number because when we unroll, loop-invariant values do not take 5265 // more register. 5266 LoopBlocksDFS DFS(TheLoop); 5267 DFS.perform(LI); 5268 5269 RegisterUsage RU; 5270 5271 // Each 'key' in the map opens a new interval. The values 5272 // of the map are the index of the 'last seen' usage of the 5273 // instruction that is the key. 5274 using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>; 5275 5276 // Maps instruction to its index. 5277 SmallVector<Instruction *, 64> IdxToInstr; 5278 // Marks the end of each interval. 5279 IntervalMap EndPoint; 5280 // Saves the list of instruction indices that are used in the loop. 5281 SmallPtrSet<Instruction *, 8> Ends; 5282 // Saves the list of values that are used in the loop but are defined outside 5283 // the loop (not including non-instruction values such as arguments and 5284 // constants). 5285 SmallSetVector<Instruction *, 8> LoopInvariants; 5286 5287 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5288 for (Instruction &I : BB->instructionsWithoutDebug()) { 5289 IdxToInstr.push_back(&I); 5290 5291 // Save the end location of each USE. 5292 for (Value *U : I.operands()) { 5293 auto *Instr = dyn_cast<Instruction>(U); 5294 5295 // Ignore non-instruction values such as arguments, constants, etc. 5296 // FIXME: Might need some motivation why these values are ignored. If 5297 // for example an argument is used inside the loop it will increase the 5298 // register pressure (so shouldn't we add it to LoopInvariants). 5299 if (!Instr) 5300 continue; 5301 5302 // If this instruction is outside the loop then record it and continue. 
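        // (For example, an address or loop bound computed before the loop and
        // used inside it lands in LoopInvariants below rather than opening an
        // interval.)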
5303 if (!TheLoop->contains(Instr)) { 5304 LoopInvariants.insert(Instr); 5305 continue; 5306 } 5307 5308 // Overwrite previous end points. 5309 EndPoint[Instr] = IdxToInstr.size(); 5310 Ends.insert(Instr); 5311 } 5312 } 5313 } 5314 5315 // Saves the list of intervals that end with the index in 'key'. 5316 using InstrList = SmallVector<Instruction *, 2>; 5317 SmallDenseMap<unsigned, InstrList, 16> TransposeEnds; 5318 5319 // Transpose the EndPoints to a list of values that end at each index. 5320 for (auto &Interval : EndPoint) 5321 TransposeEnds[Interval.second].push_back(Interval.first); 5322 5323 SmallPtrSet<Instruction *, 8> OpenIntervals; 5324 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5325 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5326 5327 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5328 5329 const auto &TTICapture = TTI; 5330 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5331 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) || 5332 (VF.isScalable() && 5333 !TTICapture.isElementTypeLegalForScalableVector(Ty))) 5334 return 0; 5335 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5336 }; 5337 5338 for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) { 5339 Instruction *I = IdxToInstr[Idx]; 5340 5341 // Remove all of the instructions that end at this location. 5342 InstrList &List = TransposeEnds[Idx]; 5343 for (Instruction *ToRemove : List) 5344 OpenIntervals.erase(ToRemove); 5345 5346 // Ignore instructions that are never used within the loop. 5347 if (!Ends.count(I)) 5348 continue; 5349 5350 // Skip ignored values. 5351 if (ValuesToIgnore.count(I)) 5352 continue; 5353 5354 collectInLoopReductions(); 5355 5356 // For each VF find the maximum usage of registers. 5357 for (unsigned J = 0, E = VFs.size(); J < E; ++J) { 5358 // Count the number of registers used, per register class, given all open 5359 // intervals. 5360 // Note that elements in this SmallMapVector will be default constructed 5361 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 5362 // there is no previous entry for ClassID. 5363 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5364 5365 if (VFs[J].isScalar()) { 5366 for (auto *Inst : OpenIntervals) { 5367 unsigned ClassID = 5368 TTI.getRegisterClassForType(false, Inst->getType()); 5369 // FIXME: The target might use more than one register for the type 5370 // even in the scalar case. 5371 RegUsage[ClassID] += 1; 5372 } 5373 } else { 5374 collectUniformsAndScalars(VFs[J]); 5375 for (auto *Inst : OpenIntervals) { 5376 // Skip ignored values for VF > 1. 5377 if (VecValuesToIgnore.count(Inst)) 5378 continue; 5379 if (isScalarAfterVectorization(Inst, VFs[J])) { 5380 unsigned ClassID = 5381 TTI.getRegisterClassForType(false, Inst->getType()); 5382 // FIXME: The target might use more than one register for the type 5383 // even in the scalar case. 5384 RegUsage[ClassID] += 1; 5385 } else { 5386 unsigned ClassID = 5387 TTI.getRegisterClassForType(true, Inst->getType()); 5388 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]); 5389 } 5390 } 5391 } 5392 5393 for (const auto &Pair : RegUsage) { 5394 auto &Entry = MaxUsages[J][Pair.first]; 5395 Entry = std::max(Entry, Pair.second); 5396 } 5397 } 5398 5399 LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " 5400 << OpenIntervals.size() << '\n'); 5401 5402 // Add the current instruction to the list of open intervals. 
    OpenIntervals.insert(I);
  }

  for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
    // Note that elements in this SmallMapVector will be default constructed
    // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
    // there is no previous entry for ClassID.
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto *Inst : LoopInvariants) {
      // FIXME: The target might use more than one register for the type
      // even in the scalar case.
      bool IsScalar = all_of(Inst->users(), [&](User *U) {
        auto *I = cast<Instruction>(U);
        return TheLoop != LI->getLoopFor(I->getParent()) ||
               isScalarAfterVectorization(I, VFs[Idx]);
      });

      ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
      unsigned ClassID =
          TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
      Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
             << " item\n";
      for (const auto &pair : MaxUsages[Idx]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[Idx];
    RUs[Idx] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
                                                           ElementCount VF) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) && NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
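  // (The operator[] access below creates that entry immediately, so even a
  // loop where nothing ends up profitable to scalarize is remembered as
  // analyzed for this VF.)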
5481 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5482 5483 PredicatedBBsAfterVectorization[VF].clear(); 5484 5485 // Find all the instructions that are scalar with predication in the loop and 5486 // determine if it would be better to not if-convert the blocks they are in. 5487 // If so, we also record the instructions to scalarize. 5488 for (BasicBlock *BB : TheLoop->blocks()) { 5489 if (!blockNeedsPredicationForAnyReason(BB)) 5490 continue; 5491 for (Instruction &I : *BB) 5492 if (isScalarWithPredication(&I, VF)) { 5493 ScalarCostsTy ScalarCosts; 5494 // Do not apply discount logic for: 5495 // 1. Scalars after vectorization, as there will only be a single copy 5496 // of the instruction. 5497 // 2. Scalable VF, as that would lead to invalid scalarization costs. 5498 // 3. Emulated masked memrefs, if a hacked cost is needed. 5499 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() && 5500 !useEmulatedMaskMemRefHack(&I, VF) && 5501 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) { 5502 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5503 // Check if we decided to scalarize a call. If so, update the widening 5504 // decision of the call to CM_Scalarize with the computed scalar cost. 5505 for (const auto &[I, _] : ScalarCosts) { 5506 auto *CI = dyn_cast<CallInst>(I); 5507 if (!CI || !CallWideningDecisions.contains({CI, VF})) 5508 continue; 5509 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize; 5510 CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI]; 5511 } 5512 } 5513 // Remember that BB will remain after vectorization. 5514 PredicatedBBsAfterVectorization[VF].insert(BB); 5515 for (auto *Pred : predecessors(BB)) { 5516 if (Pred->getSingleSuccessor() == BB) 5517 PredicatedBBsAfterVectorization[VF].insert(Pred); 5518 } 5519 } 5520 } 5521 } 5522 5523 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 5524 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 5525 assert(!isUniformAfterVectorization(PredInst, VF) && 5526 "Instruction marked uniform-after-vectorization will be predicated"); 5527 5528 // Initialize the discount to zero, meaning that the scalar version and the 5529 // vector version cost the same. 5530 InstructionCost Discount = 0; 5531 5532 // Holds instructions to analyze. The instructions we visit are mapped in 5533 // ScalarCosts. Those instructions are the ones that would be scalarized if 5534 // we find that the scalar version costs less. 5535 SmallVector<Instruction *, 8> Worklist; 5536 5537 // Returns true if the given instruction can be scalarized. 5538 auto CanBeScalarized = [&](Instruction *I) -> bool { 5539 // We only attempt to scalarize instructions forming a single-use chain 5540 // from the original predicated block that would otherwise be vectorized. 5541 // Although not strictly necessary, we give up on instructions we know will 5542 // already be scalar to avoid traversing chains that are unlikely to be 5543 // beneficial. 5544 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5545 isScalarAfterVectorization(I, VF)) 5546 return false; 5547 5548 // If the instruction is scalar with predication, it will be analyzed 5549 // separately. We ignore it within the context of PredInst. 5550 if (isScalarWithPredication(I, VF)) 5551 return false; 5552 5553 // If any of the instruction's operands are uniform after vectorization, 5554 // the instruction cannot be scalarized. This prevents, for example, a 5555 // masked load from being scalarized. 
5556 // 5557 // We assume we will only emit a value for lane zero of an instruction 5558 // marked uniform after vectorization, rather than VF identical values. 5559 // Thus, if we scalarize an instruction that uses a uniform, we would 5560 // create uses of values corresponding to the lanes we aren't emitting code 5561 // for. This behavior can be changed by allowing getScalarValue to clone 5562 // the lane zero values for uniforms rather than asserting. 5563 for (Use &U : I->operands()) 5564 if (auto *J = dyn_cast<Instruction>(U.get())) 5565 if (isUniformAfterVectorization(J, VF)) 5566 return false; 5567 5568 // Otherwise, we can scalarize the instruction. 5569 return true; 5570 }; 5571 5572 // Compute the expected cost discount from scalarizing the entire expression 5573 // feeding the predicated instruction. We currently only consider expressions 5574 // that are single-use instruction chains. 5575 Worklist.push_back(PredInst); 5576 while (!Worklist.empty()) { 5577 Instruction *I = Worklist.pop_back_val(); 5578 5579 // If we've already analyzed the instruction, there's nothing to do. 5580 if (ScalarCosts.contains(I)) 5581 continue; 5582 5583 // Compute the cost of the vector instruction. Note that this cost already 5584 // includes the scalarization overhead of the predicated instruction. 5585 InstructionCost VectorCost = getInstructionCost(I, VF); 5586 5587 // Compute the cost of the scalarized instruction. This cost is the cost of 5588 // the instruction as if it wasn't if-converted and instead remained in the 5589 // predicated block. We will scale this cost by block probability after 5590 // computing the scalarization overhead. 5591 InstructionCost ScalarCost = 5592 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1)); 5593 5594 // Compute the scalarization overhead of needed insertelement instructions 5595 // and phi nodes. 5596 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5597 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 5598 ScalarCost += TTI.getScalarizationOverhead( 5599 cast<VectorType>(toVectorTy(I->getType(), VF)), 5600 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 5601 /*Extract*/ false, CostKind); 5602 ScalarCost += 5603 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 5604 } 5605 5606 // Compute the scalarization overhead of needed extractelement 5607 // instructions. For each of the instruction's operands, if the operand can 5608 // be scalarized, add it to the worklist; otherwise, account for the 5609 // overhead. 5610 for (Use &U : I->operands()) 5611 if (auto *J = dyn_cast<Instruction>(U.get())) { 5612 assert(VectorType::isValidElementType(J->getType()) && 5613 "Instruction has non-scalar type"); 5614 if (CanBeScalarized(J)) 5615 Worklist.push_back(J); 5616 else if (needsExtract(J, VF)) { 5617 ScalarCost += TTI.getScalarizationOverhead( 5618 cast<VectorType>(toVectorTy(J->getType(), VF)), 5619 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 5620 /*Extract*/ true, CostKind); 5621 } 5622 } 5623 5624 // Scale the total scalar cost by block probability. 5625 ScalarCost /= getReciprocalPredBlockProb(); 5626 5627 // Compute the discount. A non-negative discount means the vector version 5628 // of the instruction costs more, and scalarizing would be beneficial. 
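    // E.g. (illustrative numbers): if VectorCost is 10 and the
    // probability-scaled ScalarCost is 6, this instruction contributes a
    // discount of 4 in favor of scalarization; a negative contribution means
    // the vector form is cheaper for this link of the chain.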
5629 Discount += VectorCost - ScalarCost; 5630 ScalarCosts[I] = ScalarCost; 5631 } 5632 5633 return Discount; 5634 } 5635 5636 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { 5637 InstructionCost Cost; 5638 5639 // If the vector loop gets executed exactly once with the given VF, ignore the 5640 // costs of comparison and induction instructions, as they'll get simplified 5641 // away. 5642 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF; 5643 auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5644 if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking()) 5645 addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(), 5646 ValuesToIgnoreForVF); 5647 5648 // For each block. 5649 for (BasicBlock *BB : TheLoop->blocks()) { 5650 InstructionCost BlockCost; 5651 5652 // For each instruction in the old loop. 5653 for (Instruction &I : BB->instructionsWithoutDebug()) { 5654 // Skip ignored values. 5655 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) || 5656 (VF.isVector() && VecValuesToIgnore.count(&I))) 5657 continue; 5658 5659 InstructionCost C = getInstructionCost(&I, VF); 5660 5661 // Check if we should override the cost. 5662 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) 5663 C = InstructionCost(ForceTargetInstructionCost); 5664 5665 BlockCost += C; 5666 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " 5667 << VF << " For instruction: " << I << '\n'); 5668 } 5669 5670 // If we are vectorizing a predicated block, it will have been 5671 // if-converted. This means that the block's instructions (aside from 5672 // stores and instructions that may divide by zero) will now be 5673 // unconditionally executed. For the scalar case, we may not always execute 5674 // the predicated block, if it is an if-else block. Thus, scale the block's 5675 // cost by the probability of executing it. blockNeedsPredication from 5676 // Legal is used so as to not include all blocks in tail folded loops. 5677 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 5678 BlockCost /= getReciprocalPredBlockProb(); 5679 5680 Cost += BlockCost; 5681 } 5682 5683 return Cost; 5684 } 5685 5686 /// Gets Address Access SCEV after verifying that the access pattern 5687 /// is loop invariant except the induction variable dependence. 5688 /// 5689 /// This SCEV can be sent to the Target in order to estimate the address 5690 /// calculation cost. 5691 static const SCEV *getAddressAccessSCEV( 5692 Value *Ptr, 5693 LoopVectorizationLegality *Legal, 5694 PredicatedScalarEvolution &PSE, 5695 const Loop *TheLoop) { 5696 5697 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5698 if (!Gep) 5699 return nullptr; 5700 5701 // We are looking for a gep with all loop invariant indices except for one 5702 // which should be an induction variable. 5703 auto *SE = PSE.getSE(); 5704 unsigned NumOperands = Gep->getNumOperands(); 5705 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) { 5706 Value *Opd = Gep->getOperand(Idx); 5707 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5708 !Legal->isInductionVariable(Opd)) 5709 return nullptr; 5710 } 5711 5712 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 
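  // Illustrative example (hypothetical IR, not from a test): a pointer such as
  //   %gep = getelementptr [64 x [64 x i32]], ptr %base, i64 %inv0, i64 %iv, i64 %inv1
  // with loop-invariant %inv0/%inv1 and induction variable %iv passes the
  // check above, so its SCEV is handed to the target for address-cost
  // estimation.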
5713 return PSE.getSCEV(Ptr); 5714 } 5715 5716 InstructionCost 5717 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5718 ElementCount VF) { 5719 assert(VF.isVector() && 5720 "Scalarization cost of instruction implies vectorization."); 5721 if (VF.isScalable()) 5722 return InstructionCost::getInvalid(); 5723 5724 Type *ValTy = getLoadStoreType(I); 5725 auto *SE = PSE.getSE(); 5726 5727 unsigned AS = getLoadStoreAddressSpace(I); 5728 Value *Ptr = getLoadStorePointerOperand(I); 5729 Type *PtrTy = toVectorTy(Ptr->getType(), VF); 5730 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 5731 // that it is being called from this specific place. 5732 5733 // Figure out whether the access is strided and get the stride value 5734 // if it's known in compile time 5735 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5736 5737 // Get the cost of the scalar memory instruction and address computation. 5738 InstructionCost Cost = 5739 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5740 5741 // Don't pass *I here, since it is scalar but will actually be part of a 5742 // vectorized loop where the user of it is a vectorized instruction. 5743 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5744 const Align Alignment = getLoadStoreAlignment(I); 5745 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 5746 ValTy->getScalarType(), 5747 Alignment, AS, CostKind); 5748 5749 // Get the overhead of the extractelement and insertelement instructions 5750 // we might create due to scalarization. 5751 Cost += getScalarizationOverhead(I, VF, CostKind); 5752 5753 // If we have a predicated load/store, it will need extra i1 extracts and 5754 // conditional branches, but may not be executed for each vector lane. Scale 5755 // the cost by the probability of executing the predicated block. 5756 if (isPredicatedInst(I)) { 5757 Cost /= getReciprocalPredBlockProb(); 5758 5759 // Add the cost of an i1 extract and a branch 5760 auto *VecI1Ty = 5761 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 5762 Cost += TTI.getScalarizationOverhead( 5763 VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 5764 /*Insert=*/false, /*Extract=*/true, CostKind); 5765 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 5766 5767 if (useEmulatedMaskMemRefHack(I, VF)) 5768 // Artificially setting to a high enough value to practically disable 5769 // vectorization with such operations. 
5770 Cost = 3000000; 5771 } 5772 5773 return Cost; 5774 } 5775 5776 InstructionCost 5777 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5778 ElementCount VF) { 5779 Type *ValTy = getLoadStoreType(I); 5780 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5781 Value *Ptr = getLoadStorePointerOperand(I); 5782 unsigned AS = getLoadStoreAddressSpace(I); 5783 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 5784 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5785 5786 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5787 "Stride should be 1 or -1 for consecutive memory access"); 5788 const Align Alignment = getLoadStoreAlignment(I); 5789 InstructionCost Cost = 0; 5790 if (Legal->isMaskRequired(I)) { 5791 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5792 CostKind); 5793 } else { 5794 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 5795 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5796 CostKind, OpInfo, I); 5797 } 5798 5799 bool Reverse = ConsecutiveStride < 0; 5800 if (Reverse) 5801 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {}, 5802 CostKind, 0); 5803 return Cost; 5804 } 5805 5806 InstructionCost 5807 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5808 ElementCount VF) { 5809 assert(Legal->isUniformMemOp(*I, VF)); 5810 5811 Type *ValTy = getLoadStoreType(I); 5812 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5813 const Align Alignment = getLoadStoreAlignment(I); 5814 unsigned AS = getLoadStoreAddressSpace(I); 5815 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5816 if (isa<LoadInst>(I)) { 5817 return TTI.getAddressComputationCost(ValTy) + 5818 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 5819 CostKind) + 5820 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5821 } 5822 StoreInst *SI = cast<StoreInst>(I); 5823 5824 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 5825 return TTI.getAddressComputationCost(ValTy) + 5826 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 5827 CostKind) + 5828 (IsLoopInvariantStoreValue 5829 ? 
0
5830 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5831 CostKind, VF.getKnownMinValue() - 1));
5832 }
5833
5834 InstructionCost
5835 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5836 ElementCount VF) {
5837 Type *ValTy = getLoadStoreType(I);
5838 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5839 const Align Alignment = getLoadStoreAlignment(I);
5840 const Value *Ptr = getLoadStorePointerOperand(I);
5841
5842 return TTI.getAddressComputationCost(VectorTy) +
5843 TTI.getGatherScatterOpCost(
5844 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5845 TargetTransformInfo::TCK_RecipThroughput, I);
5846 }
5847
5848 InstructionCost
5849 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5850 ElementCount VF) {
5851 const auto *Group = getInterleavedAccessGroup(I);
5852 assert(Group && "Fail to get an interleaved access group.");
5853
5854 Instruction *InsertPos = Group->getInsertPos();
5855 Type *ValTy = getLoadStoreType(InsertPos);
5856 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5857 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5858 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5859
5860 unsigned InterleaveFactor = Group->getFactor();
5861 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5862
5863 // Holds the indices of existing members in the interleaved group.
5864 SmallVector<unsigned, 4> Indices;
5865 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5866 if (Group->getMember(IF))
5867 Indices.push_back(IF);
5868
5869 // Calculate the cost of the whole interleaved group.
5870 bool UseMaskForGaps =
5871 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5872 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5873 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5874 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5875 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5876 UseMaskForGaps);
5877
5878 if (Group->isReverse()) {
5879 // TODO: Add support for reversed masked interleaved access.
5880 assert(!Legal->isMaskRequired(I) &&
5881 "Reverse masked interleaved access not supported.");
5882 Cost += Group->getNumMembers() *
5883 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {},
5884 CostKind, 0);
5885 }
5886 return Cost;
5887 }
5888
5889 std::optional<InstructionCost>
5890 LoopVectorizationCostModel::getReductionPatternCost(
5891 Instruction *I, ElementCount VF, Type *Ty,
5892 TTI::TargetCostKind CostKind) const {
5893 using namespace llvm::PatternMatch;
5894 // Early exit for no inloop reductions
5895 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5896 return std::nullopt;
5897 auto *VectorTy = cast<VectorType>(Ty);
5898
5899 // We are looking for one of the following patterns, finding the minimal acceptable cost:
5900 // reduce(mul(ext(A), ext(B))) or
5901 // reduce(mul(A, B)) or
5902 // reduce(ext(A)) or
5903 // reduce(A).
5904 // The basic idea is that we walk down the tree to do that, finding the root
5905 // reduction instruction in InLoopReductionImmediateChains. From there we find
5906 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5907 // of the components. If the reduction cost is lower, we return it for the
5908 // reduction instruction and 0 for the other instructions in the pattern. If
5909 // it is not, we return an invalid cost specifying that the original cost method
5910 // should be used.
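  // Illustrative example (hypothetical IR): in a chain such as
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %phi, %mul
  // the walk below lands on %sum, and the whole pattern may be priced as one
  // multiply-accumulate reduction; if that is cheaper, %sum gets the combined
  // cost and the other members of the pattern report a cost of 0.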
5911 Instruction *RetI = I; 5912 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 5913 if (!RetI->hasOneUser()) 5914 return std::nullopt; 5915 RetI = RetI->user_back(); 5916 } 5917 5918 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 5919 RetI->user_back()->getOpcode() == Instruction::Add) { 5920 RetI = RetI->user_back(); 5921 } 5922 5923 // Test if the found instruction is a reduction, and if not return an invalid 5924 // cost specifying the parent to use the original cost modelling. 5925 if (!InLoopReductionImmediateChains.count(RetI)) 5926 return std::nullopt; 5927 5928 // Find the reduction this chain is a part of and calculate the basic cost of 5929 // the reduction on its own. 5930 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); 5931 Instruction *ReductionPhi = LastChain; 5932 while (!isa<PHINode>(ReductionPhi)) 5933 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); 5934 5935 const RecurrenceDescriptor &RdxDesc = 5936 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 5937 5938 InstructionCost BaseCost; 5939 RecurKind RK = RdxDesc.getRecurrenceKind(); 5940 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 5941 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK); 5942 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy, 5943 RdxDesc.getFastMathFlags(), CostKind); 5944 } else { 5945 BaseCost = TTI.getArithmeticReductionCost( 5946 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 5947 } 5948 5949 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 5950 // normal fmul instruction to the cost of the fadd reduction. 5951 if (RK == RecurKind::FMulAdd) 5952 BaseCost += 5953 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 5954 5955 // If we're using ordered reductions then we can just return the base cost 5956 // here, since getArithmeticReductionCost calculates the full ordered 5957 // reduction cost when FP reassociation is not allowed. 5958 if (useOrderedReductions(RdxDesc)) 5959 return BaseCost; 5960 5961 // Get the operand that was not the reduction chain and match it to one of the 5962 // patterns, returning the better cost if it is found. 5963 Instruction *RedOp = RetI->getOperand(1) == LastChain 5964 ? dyn_cast<Instruction>(RetI->getOperand(0)) 5965 : dyn_cast<Instruction>(RetI->getOperand(1)); 5966 5967 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 5968 5969 Instruction *Op0, *Op1; 5970 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 5971 match(RedOp, 5972 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 5973 match(Op0, m_ZExtOrSExt(m_Value())) && 5974 Op0->getOpcode() == Op1->getOpcode() && 5975 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 5976 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 5977 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 5978 5979 // Matched reduce.add(ext(mul(ext(A), ext(B))) 5980 // Note that the extend opcodes need to all match, or if A==B they will have 5981 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 5982 // which is equally fine. 
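    // The comparison a few lines below is, in effect (sketch of the intent):
    //   RedCost < 2 * ExtCost + MulCost + Ext2Cost + BaseCost
    // i.e. the fused mul-acc reduction must beat pricing the two inner
    // extends, the multiply, the outer extend and the plain reduction
    // separately.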
5983 bool IsUnsigned = isa<ZExtInst>(Op0); 5984 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 5985 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 5986 5987 InstructionCost ExtCost = 5988 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 5989 TTI::CastContextHint::None, CostKind, Op0); 5990 InstructionCost MulCost = 5991 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 5992 InstructionCost Ext2Cost = 5993 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 5994 TTI::CastContextHint::None, CostKind, RedOp); 5995 5996 InstructionCost RedCost = TTI.getMulAccReductionCost( 5997 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 5998 5999 if (RedCost.isValid() && 6000 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6001 return I == RetI ? RedCost : 0; 6002 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6003 !TheLoop->isLoopInvariant(RedOp)) { 6004 // Matched reduce(ext(A)) 6005 bool IsUnsigned = isa<ZExtInst>(RedOp); 6006 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6007 InstructionCost RedCost = TTI.getExtendedReductionCost( 6008 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6009 RdxDesc.getFastMathFlags(), CostKind); 6010 6011 InstructionCost ExtCost = 6012 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6013 TTI::CastContextHint::None, CostKind, RedOp); 6014 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6015 return I == RetI ? RedCost : 0; 6016 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6017 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6018 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6019 Op0->getOpcode() == Op1->getOpcode() && 6020 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6021 bool IsUnsigned = isa<ZExtInst>(Op0); 6022 Type *Op0Ty = Op0->getOperand(0)->getType(); 6023 Type *Op1Ty = Op1->getOperand(0)->getType(); 6024 Type *LargestOpTy = 6025 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6026 : Op0Ty; 6027 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6028 6029 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 6030 // different sizes. We take the largest type as the ext to reduce, and add 6031 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6032 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6033 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6034 TTI::CastContextHint::None, CostKind, Op0); 6035 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6036 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6037 TTI::CastContextHint::None, CostKind, Op1); 6038 InstructionCost MulCost = 6039 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6040 6041 InstructionCost RedCost = TTI.getMulAccReductionCost( 6042 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6043 InstructionCost ExtraExtCost = 0; 6044 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6045 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6046 ExtraExtCost = TTI.getCastInstrCost( 6047 ExtraExtOp->getOpcode(), ExtType, 6048 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6049 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6050 } 6051 6052 if (RedCost.isValid() && 6053 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6054 return I == RetI ? 
RedCost : 0; 6055 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6056 // Matched reduce.add(mul()) 6057 InstructionCost MulCost = 6058 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6059 6060 InstructionCost RedCost = TTI.getMulAccReductionCost( 6061 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 6062 6063 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6064 return I == RetI ? RedCost : 0; 6065 } 6066 } 6067 6068 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 6069 } 6070 6071 InstructionCost 6072 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6073 ElementCount VF) { 6074 // Calculate scalar cost only. Vectorization cost should be ready at this 6075 // moment. 6076 if (VF.isScalar()) { 6077 Type *ValTy = getLoadStoreType(I); 6078 const Align Alignment = getLoadStoreAlignment(I); 6079 unsigned AS = getLoadStoreAddressSpace(I); 6080 6081 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6082 return TTI.getAddressComputationCost(ValTy) + 6083 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6084 TTI::TCK_RecipThroughput, OpInfo, I); 6085 } 6086 return getWideningCost(I, VF); 6087 } 6088 6089 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 6090 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 6091 6092 // There is no mechanism yet to create a scalable scalarization loop, 6093 // so this is currently Invalid. 6094 if (VF.isScalable()) 6095 return InstructionCost::getInvalid(); 6096 6097 if (VF.isScalar()) 6098 return 0; 6099 6100 InstructionCost Cost = 0; 6101 Type *RetTy = toVectorTy(I->getType(), VF); 6102 if (!RetTy->isVoidTy() && 6103 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6104 Cost += TTI.getScalarizationOverhead( 6105 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), 6106 /*Insert*/ true, 6107 /*Extract*/ false, CostKind); 6108 6109 // Some targets keep addresses scalar. 6110 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6111 return Cost; 6112 6113 // Some targets support efficient element stores. 6114 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6115 return Cost; 6116 6117 // Collect operands to consider. 6118 CallInst *CI = dyn_cast<CallInst>(I); 6119 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6120 6121 // Skip operands that do not require extraction/scalarization and do not incur 6122 // any overhead. 6123 SmallVector<Type *> Tys; 6124 for (auto *V : filterExtractingOperands(Ops, VF)) 6125 Tys.push_back(maybeVectorizeType(V->getType(), VF)); 6126 return Cost + TTI.getOperandsScalarizationOverhead( 6127 filterExtractingOperands(Ops, VF), Tys, CostKind); 6128 } 6129 6130 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6131 if (VF.isScalar()) 6132 return; 6133 NumPredStores = 0; 6134 for (BasicBlock *BB : TheLoop->blocks()) { 6135 // For each instruction in the old loop. 6136 for (Instruction &I : *BB) { 6137 Value *Ptr = getLoadStorePointerOperand(&I); 6138 if (!Ptr) 6139 continue; 6140 6141 // TODO: We should generate better code and update the cost model for 6142 // predicated uniform stores. Today they are treated as any other 6143 // predicated store (see added test cases in 6144 // invariant-store-vectorization.ll). 
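      // E.g. (illustrative C): in
      //   for (i = 0; i < n; ++i) *p = a[i];
      // the store address is loop invariant, so Legal->isUniformMemOp() holds
      // below and the access is costed as scalarize-vs-gather/scatter rather
      // than through the generic widening path.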
6145 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6146 NumPredStores++;
6147
6148 if (Legal->isUniformMemOp(I, VF)) {
6149 auto IsLegalToScalarize = [&]() {
6150 if (!VF.isScalable())
6151 // Scalarization of fixed length vectors "just works".
6152 return true;
6153
6154 // We have dedicated lowering for unpredicated uniform loads and
6155 // stores. Note that even with tail folding we know that at least
6156 // one lane is active (i.e. generalized predication is not possible
6157 // here), and the logic below depends on this fact.
6158 if (!foldTailByMasking())
6159 return true;
6160
6161 // For scalable vectors, a uniform memop load is always
6162 // uniform-by-parts and we know how to scalarize that.
6163 if (isa<LoadInst>(I))
6164 return true;
6165
6166 // A uniform store isn't necessarily uniform-by-parts
6167 // and we can't assume scalarization.
6168 auto &SI = cast<StoreInst>(I);
6169 return TheLoop->isLoopInvariant(SI.getValueOperand());
6170 };
6171
6172 const InstructionCost GatherScatterCost =
6173 isLegalGatherOrScatter(&I, VF) ?
6174 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6175
6176 // Load: Scalar load + broadcast
6177 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6178 // FIXME: This cost is a significant under-estimate for tail folded
6179 // memory ops.
6180 const InstructionCost ScalarizationCost =
6181 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6182 : InstructionCost::getInvalid();
6183
6184 // Choose the better solution for the current VF. Note that Invalid
6185 // costs compare as maximally large. If both are invalid, we get
6186 // scalable invalid which signals a failure and a vectorization abort.
6187 if (GatherScatterCost < ScalarizationCost)
6188 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6189 else
6190 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6191 continue;
6192 }
6193
6194 // We assume that widening is the best solution when possible.
6195 if (memoryInstructionCanBeWidened(&I, VF)) {
6196 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6197 int ConsecutiveStride = Legal->isConsecutivePtr(
6198 getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6199 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6200 "Expected consecutive stride.");
6201 InstWidening Decision =
6202 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6203 setWideningDecision(&I, VF, Decision, Cost);
6204 continue;
6205 }
6206
6207 // Choose between Interleaving, Gather/Scatter or Scalarization.
6208 InstructionCost InterleaveCost = InstructionCost::getInvalid();
6209 unsigned NumAccesses = 1;
6210 if (isAccessInterleaved(&I)) {
6211 const auto *Group = getInterleavedAccessGroup(&I);
6212 assert(Group && "Fail to get an interleaved access group.");
6213
6214 // Make one decision for the whole group.
6215 if (getWideningDecision(&I, VF) != CM_Unknown)
6216 continue;
6217
6218 NumAccesses = Group->getNumMembers();
6219 if (interleavedAccessCanBeWidened(&I, VF))
6220 InterleaveCost = getInterleaveGroupCost(&I, VF);
6221 }
6222
6223 InstructionCost GatherScatterCost =
6224 isLegalGatherOrScatter(&I, VF)
6225 ? getGatherScatterCost(&I, VF) * NumAccesses
6226 : InstructionCost::getInvalid();
6227
6228 InstructionCost ScalarizationCost =
6229 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6230
6231 // Choose the better solution for the current VF,
6232 // write down this decision and use it during vectorization.
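      // E.g. (illustrative numbers): with InterleaveCost = 8,
      // GatherScatterCost = 12 and ScalarizationCost = 20 the group is
      // interleaved; with 14/12/20 it becomes a gather/scatter; if neither an
      // interleave nor a gather/scatter cost is valid, the access is
      // scalarized.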
6233 InstructionCost Cost;
6234 InstWidening Decision;
6235 if (InterleaveCost <= GatherScatterCost &&
6236 InterleaveCost < ScalarizationCost) {
6237 Decision = CM_Interleave;
6238 Cost = InterleaveCost;
6239 } else if (GatherScatterCost < ScalarizationCost) {
6240 Decision = CM_GatherScatter;
6241 Cost = GatherScatterCost;
6242 } else {
6243 Decision = CM_Scalarize;
6244 Cost = ScalarizationCost;
6245 }
6246 // If the instruction belongs to an interleave group, the whole group
6247 // receives the same decision. The whole group receives the cost, but
6248 // the cost will actually be assigned to one instruction.
6249 if (const auto *Group = getInterleavedAccessGroup(&I))
6250 setWideningDecision(Group, VF, Decision, Cost);
6251 else
6252 setWideningDecision(&I, VF, Decision, Cost);
6253 }
6254 }
6255
6256 // Make sure that any load of an address and any other address computation
6257 // remains scalar unless there is gather/scatter support. This avoids
6258 // inevitable extracts into address registers, and also has the benefit of
6259 // activating LSR more, since that pass can't optimize vectorized
6260 // addresses.
6261 if (TTI.prefersVectorizedAddressing())
6262 return;
6263
6264 // Start with all scalar pointer uses.
6265 SmallPtrSet<Instruction *, 8> AddrDefs;
6266 for (BasicBlock *BB : TheLoop->blocks())
6267 for (Instruction &I : *BB) {
6268 Instruction *PtrDef =
6269 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6270 if (PtrDef && TheLoop->contains(PtrDef) &&
6271 getWideningDecision(&I, VF) != CM_GatherScatter)
6272 AddrDefs.insert(PtrDef);
6273 }
6274
6275 // Add all instructions used to generate the addresses.
6276 SmallVector<Instruction *, 4> Worklist;
6277 append_range(Worklist, AddrDefs);
6278 while (!Worklist.empty()) {
6279 Instruction *I = Worklist.pop_back_val();
6280 for (auto &Op : I->operands())
6281 if (auto *InstOp = dyn_cast<Instruction>(Op))
6282 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6283 AddrDefs.insert(InstOp).second)
6284 Worklist.push_back(InstOp);
6285 }
6286
6287 for (auto *I : AddrDefs) {
6288 if (isa<LoadInst>(I)) {
6289 // Setting the desired widening decision should ideally be handled
6290 // by cost functions, but since this involves the task of finding out
6291 // if the loaded register is involved in an address computation, it is
6292 // instead changed here when we know this is the case.
6293 InstWidening Decision = getWideningDecision(I, VF);
6294 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6295 // Scalarize a widened load of address.
6296 setWideningDecision(
6297 I, VF, CM_Scalarize,
6298 (VF.getKnownMinValue() *
6299 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6300 else if (const auto *Group = getInterleavedAccessGroup(I)) {
6301 // Scalarize an interleave group of address loads.
6302 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6303 if (Instruction *Member = Group->getMember(I))
6304 setWideningDecision(
6305 Member, VF, CM_Scalarize,
6306 (VF.getKnownMinValue() *
6307 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6308 }
6309 }
6310 } else
6311 // Make sure I gets scalarized and given a cost estimate without
6312 // scalarization overhead.
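      // (Illustrative note: forced scalars are later costed in
      // getInstructionCost() as VF * scalar-cost, with no extract/insert
      // overhead added.)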
6313 ForcedScalars[VF].insert(I); 6314 } 6315 } 6316 6317 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { 6318 assert(!VF.isScalar() && 6319 "Trying to set a vectorization decision for a scalar VF"); 6320 6321 auto ForcedScalar = ForcedScalars.find(VF); 6322 for (BasicBlock *BB : TheLoop->blocks()) { 6323 // For each instruction in the old loop. 6324 for (Instruction &I : *BB) { 6325 CallInst *CI = dyn_cast<CallInst>(&I); 6326 6327 if (!CI) 6328 continue; 6329 6330 InstructionCost ScalarCost = InstructionCost::getInvalid(); 6331 InstructionCost VectorCost = InstructionCost::getInvalid(); 6332 InstructionCost IntrinsicCost = InstructionCost::getInvalid(); 6333 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6334 Function *ScalarFunc = CI->getCalledFunction(); 6335 Type *ScalarRetTy = CI->getType(); 6336 SmallVector<Type *, 4> Tys, ScalarTys; 6337 for (auto &ArgOp : CI->args()) 6338 ScalarTys.push_back(ArgOp->getType()); 6339 6340 // Estimate cost of scalarized vector call. The source operands are 6341 // assumed to be vectors, so we need to extract individual elements from 6342 // there, execute VF scalar calls, and then gather the result into the 6343 // vector return value. 6344 InstructionCost ScalarCallCost = 6345 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); 6346 6347 // Compute costs of unpacking argument values for the scalar calls and 6348 // packing the return values to a vector. 6349 InstructionCost ScalarizationCost = 6350 getScalarizationOverhead(CI, VF, CostKind); 6351 6352 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 6353 // Honor ForcedScalars and UniformAfterVectorization decisions. 6354 // TODO: For calls, it might still be more profitable to widen. Use 6355 // VPlan-based cost model to compare different options. 6356 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() && 6357 ForcedScalar->second.contains(CI)) || 6358 isUniformAfterVectorization(CI, VF))) { 6359 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr, 6360 Intrinsic::not_intrinsic, std::nullopt, 6361 ScalarCost); 6362 continue; 6363 } 6364 6365 bool MaskRequired = Legal->isMaskRequired(CI); 6366 // Compute corresponding vector type for return value and arguments. 6367 Type *RetTy = toVectorTy(ScalarRetTy, VF); 6368 for (Type *ScalarTy : ScalarTys) 6369 Tys.push_back(toVectorTy(ScalarTy, VF)); 6370 6371 // An in-loop reduction using an fmuladd intrinsic is a special case; 6372 // we don't want the normal cost for that intrinsic. 6373 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 6374 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) { 6375 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, 6376 getVectorIntrinsicIDForCall(CI, TLI), 6377 std::nullopt, *RedCost); 6378 continue; 6379 } 6380 6381 // Find the cost of vectorizing the call, if we can find a suitable 6382 // vector variant of the function. 6383 bool UsesMask = false; 6384 VFInfo FuncInfo; 6385 Function *VecFunc = nullptr; 6386 // Search through any available variants for one we can use at this VF. 6387 for (VFInfo &Info : VFDatabase::getMappings(*CI)) { 6388 // Must match requested VF. 
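        // Illustrative example (hypothetical mapping): a variant advertised as
        //   _ZGVnN4vl8_foo(foo_vec)
        // describes a 4-lane, unmasked version of foo whose first argument is
        // a vector and whose second is linear with stride 8; the checks below
        // reject such a variant unless VF, masking and every parameter kind
        // line up with this call site.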
6389 if (Info.Shape.VF != VF) 6390 continue; 6391 6392 // Must take a mask argument if one is required 6393 if (MaskRequired && !Info.isMasked()) 6394 continue; 6395 6396 // Check that all parameter kinds are supported 6397 bool ParamsOk = true; 6398 for (VFParameter Param : Info.Shape.Parameters) { 6399 switch (Param.ParamKind) { 6400 case VFParamKind::Vector: 6401 break; 6402 case VFParamKind::OMP_Uniform: { 6403 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6404 // Make sure the scalar parameter in the loop is invariant. 6405 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), 6406 TheLoop)) 6407 ParamsOk = false; 6408 break; 6409 } 6410 case VFParamKind::OMP_Linear: { 6411 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6412 // Find the stride for the scalar parameter in this loop and see if 6413 // it matches the stride for the variant. 6414 // TODO: do we need to figure out the cost of an extract to get the 6415 // first lane? Or do we hope that it will be folded away? 6416 ScalarEvolution *SE = PSE.getSE(); 6417 const auto *SAR = 6418 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); 6419 6420 if (!SAR || SAR->getLoop() != TheLoop) { 6421 ParamsOk = false; 6422 break; 6423 } 6424 6425 const SCEVConstant *Step = 6426 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); 6427 6428 if (!Step || 6429 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) 6430 ParamsOk = false; 6431 6432 break; 6433 } 6434 case VFParamKind::GlobalPredicate: 6435 UsesMask = true; 6436 break; 6437 default: 6438 ParamsOk = false; 6439 break; 6440 } 6441 } 6442 6443 if (!ParamsOk) 6444 continue; 6445 6446 // Found a suitable candidate, stop here. 6447 VecFunc = CI->getModule()->getFunction(Info.VectorName); 6448 FuncInfo = Info; 6449 break; 6450 } 6451 6452 // Add in the cost of synthesizing a mask if one wasn't required. 6453 InstructionCost MaskCost = 0; 6454 if (VecFunc && UsesMask && !MaskRequired) 6455 MaskCost = TTI.getShuffleCost( 6456 TargetTransformInfo::SK_Broadcast, 6457 VectorType::get(IntegerType::getInt1Ty( 6458 VecFunc->getFunctionType()->getContext()), 6459 VF)); 6460 6461 if (TLI && VecFunc && !CI->isNoBuiltin()) 6462 VectorCost = 6463 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 6464 6465 // Find the cost of an intrinsic; some targets may have instructions that 6466 // perform the operation without needing an actual call. 6467 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); 6468 if (IID != Intrinsic::not_intrinsic) 6469 IntrinsicCost = getVectorIntrinsicCost(CI, VF); 6470 6471 InstructionCost Cost = ScalarCost; 6472 InstWidening Decision = CM_Scalarize; 6473 6474 if (VectorCost <= Cost) { 6475 Cost = VectorCost; 6476 Decision = CM_VectorCall; 6477 } 6478 6479 if (IntrinsicCost <= Cost) { 6480 Cost = IntrinsicCost; 6481 Decision = CM_IntrinsicCall; 6482 } 6483 6484 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, 6485 FuncInfo.getParamIndexForOptionalMask(), Cost); 6486 } 6487 } 6488 } 6489 6490 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) { 6491 if (!Legal->isInvariant(Op)) 6492 return false; 6493 // Consider Op invariant, if it or its operands aren't predicated 6494 // instruction in the loop. In that case, it is not trivially hoistable. 
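  // E.g. (illustrative): an address computed only from values defined outside
  // the loop is treated as invariant here, whereas a value produced by a
  // predicated (masked) instruction inside the loop is not, since hoisting it
  // would not be trivially legal.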
6495 auto *OpI = dyn_cast<Instruction>(Op); 6496 return !OpI || !TheLoop->contains(OpI) || 6497 (!isPredicatedInst(OpI) && 6498 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) && 6499 all_of(OpI->operands(), 6500 [this](Value *Op) { return shouldConsiderInvariant(Op); })); 6501 } 6502 6503 InstructionCost 6504 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6505 ElementCount VF) { 6506 // If we know that this instruction will remain uniform, check the cost of 6507 // the scalar version. 6508 if (isUniformAfterVectorization(I, VF)) 6509 VF = ElementCount::getFixed(1); 6510 6511 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6512 return InstsToScalarize[VF][I]; 6513 6514 // Forced scalars do not have any scalarization overhead. 6515 auto ForcedScalar = ForcedScalars.find(VF); 6516 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6517 auto InstSet = ForcedScalar->second; 6518 if (InstSet.count(I)) 6519 return getInstructionCost(I, ElementCount::getFixed(1)) * 6520 VF.getKnownMinValue(); 6521 } 6522 6523 Type *RetTy = I->getType(); 6524 if (canTruncateToMinimalBitwidth(I, VF)) 6525 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6526 auto *SE = PSE.getSE(); 6527 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6528 6529 auto HasSingleCopyAfterVectorization = [this](Instruction *I, 6530 ElementCount VF) -> bool { 6531 if (VF.isScalar()) 6532 return true; 6533 6534 auto Scalarized = InstsToScalarize.find(VF); 6535 assert(Scalarized != InstsToScalarize.end() && 6536 "VF not yet analyzed for scalarization profitability"); 6537 return !Scalarized->second.count(I) && 6538 llvm::all_of(I->users(), [&](User *U) { 6539 auto *UI = cast<Instruction>(U); 6540 return !Scalarized->second.count(UI); 6541 }); 6542 }; 6543 (void)HasSingleCopyAfterVectorization; 6544 6545 Type *VectorTy; 6546 if (isScalarAfterVectorization(I, VF)) { 6547 // With the exception of GEPs and PHIs, after scalarization there should 6548 // only be one copy of the instruction generated in the loop. This is 6549 // because the VF is either 1, or any instructions that need scalarizing 6550 // have already been dealt with by the time we get here. As a result, 6551 // it means we don't have to multiply the instruction cost by VF. 6552 assert(I->getOpcode() == Instruction::GetElementPtr || 6553 I->getOpcode() == Instruction::PHI || 6554 (I->getOpcode() == Instruction::BitCast && 6555 I->getType()->isPointerTy()) || 6556 HasSingleCopyAfterVectorization(I, VF)); 6557 VectorTy = RetTy; 6558 } else 6559 VectorTy = toVectorTy(RetTy, VF); 6560 6561 if (VF.isVector() && VectorTy->isVectorTy() && 6562 !TTI.getNumberOfParts(VectorTy)) 6563 return InstructionCost::getInvalid(); 6564 6565 // TODO: We need to estimate the cost of intrinsic calls. 6566 switch (I->getOpcode()) { 6567 case Instruction::GetElementPtr: 6568 // We mark this instruction as zero-cost because the cost of GEPs in 6569 // vectorized code depends on whether the corresponding memory instruction 6570 // is scalarized or not. Therefore, we handle GEPs with the memory 6571 // instruction cost. 6572 return 0; 6573 case Instruction::Br: { 6574 // In cases of scalarized and predicated instructions, there will be VF 6575 // predicated blocks in the vectorized loop. Each branch around these 6576 // blocks requires also an extract of its vector compare i1 element. 
6577 // Note that the conditional branch from the loop latch will be replaced by 6578 // a single branch controlling the loop, so there is no extra overhead from 6579 // scalarization. 6580 bool ScalarPredicatedBB = false; 6581 BranchInst *BI = cast<BranchInst>(I); 6582 if (VF.isVector() && BI->isConditional() && 6583 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 6584 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) && 6585 BI->getParent() != TheLoop->getLoopLatch()) 6586 ScalarPredicatedBB = true; 6587 6588 if (ScalarPredicatedBB) { 6589 // Not possible to scalarize scalable vector with predicated instructions. 6590 if (VF.isScalable()) 6591 return InstructionCost::getInvalid(); 6592 // Return cost for branches around scalarized and predicated blocks. 6593 auto *VecI1Ty = 6594 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6595 return ( 6596 TTI.getScalarizationOverhead( 6597 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()), 6598 /*Insert*/ false, /*Extract*/ true, CostKind) + 6599 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6600 } 6601 6602 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6603 // The back-edge branch will remain, as will all scalar branches. 6604 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6605 6606 // This branch will be eliminated by if-conversion. 6607 return 0; 6608 // Note: We currently assume zero cost for an unconditional branch inside 6609 // a predicated block since it will become a fall-through, although we 6610 // may decide in the future to call TTI for all branches. 6611 } 6612 case Instruction::Switch: { 6613 if (VF.isScalar()) 6614 return TTI.getCFInstrCost(Instruction::Switch, CostKind); 6615 auto *Switch = cast<SwitchInst>(I); 6616 return Switch->getNumCases() * 6617 TTI.getCmpSelInstrCost( 6618 Instruction::ICmp, 6619 toVectorTy(Switch->getCondition()->getType(), VF), 6620 toVectorTy(Type::getInt1Ty(I->getContext()), VF), 6621 CmpInst::ICMP_EQ, CostKind); 6622 } 6623 case Instruction::PHI: { 6624 auto *Phi = cast<PHINode>(I); 6625 6626 // First-order recurrences are replaced by vector shuffles inside the loop. 6627 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 6628 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the 6629 // penultimate value of the recurrence. 6630 // TODO: Consider vscale_range info. 6631 if (VF.isScalable() && VF.getKnownMinValue() == 1) 6632 return InstructionCost::getInvalid(); 6633 SmallVector<int> Mask(VF.getKnownMinValue()); 6634 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 6635 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 6636 cast<VectorType>(VectorTy), Mask, CostKind, 6637 VF.getKnownMinValue() - 1); 6638 } 6639 6640 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6641 // converted into select instructions. We require N - 1 selects per phi 6642 // node, where N is the number of incoming values. 6643 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) { 6644 Type *ResultTy = Phi->getType(); 6645 6646 // All instructions in an Any-of reduction chain are narrowed to bool. 6647 // Check if that is the case for this phi node. 
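      // E.g. (illustrative): an if-converted phi with three incoming values
      // becomes two vector selects, one per extra incoming value, which is
      // where the (NumIncomingValues - 1) * select-cost formula below comes
      // from; for an any-of reduction the selected type is first narrowed to
      // i1.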
6648 auto *HeaderUser = cast_if_present<PHINode>( 6649 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * { 6650 auto *Phi = dyn_cast<PHINode>(U); 6651 if (Phi && Phi->getParent() == TheLoop->getHeader()) 6652 return Phi; 6653 return nullptr; 6654 })); 6655 if (HeaderUser) { 6656 auto &ReductionVars = Legal->getReductionVars(); 6657 auto Iter = ReductionVars.find(HeaderUser); 6658 if (Iter != ReductionVars.end() && 6659 RecurrenceDescriptor::isAnyOfRecurrenceKind( 6660 Iter->second.getRecurrenceKind())) 6661 ResultTy = Type::getInt1Ty(Phi->getContext()); 6662 } 6663 return (Phi->getNumIncomingValues() - 1) * 6664 TTI.getCmpSelInstrCost( 6665 Instruction::Select, toVectorTy(ResultTy, VF), 6666 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6667 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6668 } 6669 6670 // When tail folding with EVL, if the phi is part of an out of loop 6671 // reduction then it will be transformed into a wide vp_merge. 6672 if (VF.isVector() && foldTailWithEVL() && 6673 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) { 6674 IntrinsicCostAttributes ICA( 6675 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF), 6676 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)}); 6677 return TTI.getIntrinsicInstrCost(ICA, CostKind); 6678 } 6679 6680 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6681 } 6682 case Instruction::UDiv: 6683 case Instruction::SDiv: 6684 case Instruction::URem: 6685 case Instruction::SRem: 6686 if (VF.isVector() && isPredicatedInst(I)) { 6687 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 6688 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 6689 ScalarCost : SafeDivisorCost; 6690 } 6691 // We've proven all lanes safe to speculate, fall through. 6692 [[fallthrough]]; 6693 case Instruction::Add: 6694 case Instruction::Sub: { 6695 auto Info = Legal->getHistogramInfo(I); 6696 if (Info && VF.isVector()) { 6697 const HistogramInfo *HGram = Info.value(); 6698 // Assume that a non-constant update value (or a constant != 1) requires 6699 // a multiply, and add that into the cost. 6700 InstructionCost MulCost = TTI::TCC_Free; 6701 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1)); 6702 if (!RHS || RHS->getZExtValue() != 1) 6703 MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); 6704 6705 // Find the cost of the histogram operation itself. 6706 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF); 6707 Type *ScalarTy = I->getType(); 6708 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF); 6709 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, 6710 Type::getVoidTy(I->getContext()), 6711 {PtrTy, ScalarTy, MaskTy}); 6712 6713 // Add the costs together with the add/sub operation. 6714 return TTI.getIntrinsicInstrCost( 6715 ICA, TargetTransformInfo::TCK_RecipThroughput) + 6716 MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); 6717 } 6718 [[fallthrough]]; 6719 } 6720 case Instruction::FAdd: 6721 case Instruction::FSub: 6722 case Instruction::Mul: 6723 case Instruction::FMul: 6724 case Instruction::FDiv: 6725 case Instruction::FRem: 6726 case Instruction::Shl: 6727 case Instruction::LShr: 6728 case Instruction::AShr: 6729 case Instruction::And: 6730 case Instruction::Or: 6731 case Instruction::Xor: { 6732 // If we're speculating on the stride being 1, the multiplication may 6733 // fold away. We can generalize this for all operations using the notion 6734 // of neutral elements. 
(TODO) 6735 if (I->getOpcode() == Instruction::Mul && 6736 (PSE.getSCEV(I->getOperand(0))->isOne() || 6737 PSE.getSCEV(I->getOperand(1))->isOne())) 6738 return 0; 6739 6740 // Detect reduction patterns 6741 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 6742 return *RedCost; 6743 6744 // Certain instructions can be cheaper to vectorize if they have a constant 6745 // second vector operand. One example of this are shifts on x86. 6746 Value *Op2 = I->getOperand(1); 6747 if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) && 6748 isa<SCEVConstant>(PSE.getSCEV(Op2))) { 6749 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue(); 6750 } 6751 auto Op2Info = TTI.getOperandInfo(Op2); 6752 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 6753 shouldConsiderInvariant(Op2)) 6754 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 6755 6756 SmallVector<const Value *, 4> Operands(I->operand_values()); 6757 return TTI.getArithmeticInstrCost( 6758 I->getOpcode(), VectorTy, CostKind, 6759 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6760 Op2Info, Operands, I, TLI); 6761 } 6762 case Instruction::FNeg: { 6763 return TTI.getArithmeticInstrCost( 6764 I->getOpcode(), VectorTy, CostKind, 6765 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6766 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6767 I->getOperand(0), I); 6768 } 6769 case Instruction::Select: { 6770 SelectInst *SI = cast<SelectInst>(I); 6771 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6772 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6773 6774 const Value *Op0, *Op1; 6775 using namespace llvm::PatternMatch; 6776 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 6777 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 6778 // select x, y, false --> x & y 6779 // select x, true, y --> x | y 6780 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 6781 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 6782 assert(Op0->getType()->getScalarSizeInBits() == 1 && 6783 Op1->getType()->getScalarSizeInBits() == 1); 6784 6785 SmallVector<const Value *, 2> Operands{Op0, Op1}; 6786 return TTI.getArithmeticInstrCost( 6787 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 6788 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 6789 } 6790 6791 Type *CondTy = SI->getCondition()->getType(); 6792 if (!ScalarCond) 6793 CondTy = VectorType::get(CondTy, VF); 6794 6795 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 6796 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 6797 Pred = Cmp->getPredicate(); 6798 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 6799 CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 6800 {TTI::OK_AnyValue, TTI::OP_None}, I); 6801 } 6802 case Instruction::ICmp: 6803 case Instruction::FCmp: { 6804 Type *ValTy = I->getOperand(0)->getType(); 6805 6806 if (canTruncateToMinimalBitwidth(I, VF)) { 6807 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6808 (void)Op0AsInstruction; 6809 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) || 6810 MinBWs[I] == MinBWs[Op0AsInstruction]) && 6811 "if both the operand and the compare are marked for " 6812 "truncation, they must have the same bitwidth"); 6813 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]); 6814 } 6815 6816 VectorTy = toVectorTy(ValTy, VF); 6817 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 6818 cast<CmpInst>(I)->getPredicate(), CostKind, 6819 {TTI::OK_AnyValue, TTI::OP_None}, 6820 {TTI::OK_AnyValue, TTI::OP_None}, I); 6821 } 6822 case Instruction::Store: 6823 case Instruction::Load: { 6824 ElementCount Width = VF; 6825 if (Width.isVector()) { 6826 InstWidening Decision = getWideningDecision(I, Width); 6827 assert(Decision != CM_Unknown && 6828 "CM decision should be taken at this point"); 6829 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 6830 return InstructionCost::getInvalid(); 6831 if (Decision == CM_Scalarize) 6832 Width = ElementCount::getFixed(1); 6833 } 6834 VectorTy = toVectorTy(getLoadStoreType(I), Width); 6835 return getMemoryInstructionCost(I, VF); 6836 } 6837 case Instruction::BitCast: 6838 if (I->getType()->isPointerTy()) 6839 return 0; 6840 [[fallthrough]]; 6841 case Instruction::ZExt: 6842 case Instruction::SExt: 6843 case Instruction::FPToUI: 6844 case Instruction::FPToSI: 6845 case Instruction::FPExt: 6846 case Instruction::PtrToInt: 6847 case Instruction::IntToPtr: 6848 case Instruction::SIToFP: 6849 case Instruction::UIToFP: 6850 case Instruction::Trunc: 6851 case Instruction::FPTrunc: { 6852 // Computes the CastContextHint from a Load/Store instruction. 6853 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6854 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6855 "Expected a load or a store!"); 6856 6857 if (VF.isScalar() || !TheLoop->contains(I)) 6858 return TTI::CastContextHint::Normal; 6859 6860 switch (getWideningDecision(I, VF)) { 6861 case LoopVectorizationCostModel::CM_GatherScatter: 6862 return TTI::CastContextHint::GatherScatter; 6863 case LoopVectorizationCostModel::CM_Interleave: 6864 return TTI::CastContextHint::Interleave; 6865 case LoopVectorizationCostModel::CM_Scalarize: 6866 case LoopVectorizationCostModel::CM_Widen: 6867 return Legal->isMaskRequired(I) ? 
TTI::CastContextHint::Masked 6868 : TTI::CastContextHint::Normal; 6869 case LoopVectorizationCostModel::CM_Widen_Reverse: 6870 return TTI::CastContextHint::Reversed; 6871 case LoopVectorizationCostModel::CM_Unknown: 6872 llvm_unreachable("Instr did not go through cost modelling?"); 6873 case LoopVectorizationCostModel::CM_VectorCall: 6874 case LoopVectorizationCostModel::CM_IntrinsicCall: 6875 llvm_unreachable_internal("Instr has invalid widening decision"); 6876 } 6877 6878 llvm_unreachable("Unhandled case!"); 6879 }; 6880 6881 unsigned Opcode = I->getOpcode(); 6882 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6883 // For Trunc, the context is the only user, which must be a StoreInst. 6884 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6885 if (I->hasOneUse()) 6886 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6887 CCH = ComputeCCH(Store); 6888 } 6889 // For Z/Sext, the context is the operand, which must be a LoadInst. 6890 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6891 Opcode == Instruction::FPExt) { 6892 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6893 CCH = ComputeCCH(Load); 6894 } 6895 6896 // We optimize the truncation of induction variables having constant 6897 // integer steps. The cost of these truncations is the same as the scalar 6898 // operation. 6899 if (isOptimizableIVTruncate(I, VF)) { 6900 auto *Trunc = cast<TruncInst>(I); 6901 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6902 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6903 } 6904 6905 // Detect reduction patterns 6906 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 6907 return *RedCost; 6908 6909 Type *SrcScalarTy = I->getOperand(0)->getType(); 6910 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6911 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6912 SrcScalarTy = 6913 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]); 6914 Type *SrcVecTy = 6915 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6916 6917 if (canTruncateToMinimalBitwidth(I, VF)) { 6918 // If the result type is <= the source type, there will be no extend 6919 // after truncating the users to the minimal required bitwidth. 6920 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() && 6921 (I->getOpcode() == Instruction::ZExt || 6922 I->getOpcode() == Instruction::SExt)) 6923 return 0; 6924 } 6925 6926 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6927 } 6928 case Instruction::Call: 6929 return getVectorCallCost(cast<CallInst>(I), VF); 6930 case Instruction::ExtractValue: 6931 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 6932 case Instruction::Alloca: 6933 // We cannot easily widen alloca to a scalable alloca, as 6934 // the result would need to be a vector of pointers. 6935 if (VF.isScalable()) 6936 return InstructionCost::getInvalid(); 6937 [[fallthrough]]; 6938 default: 6939 // This opcode is unknown. Assume that it is the same as 'mul'. 6940 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6941 } // end of switch. 6942 } 6943 6944 void LoopVectorizationCostModel::collectValuesToIgnore() { 6945 // Ignore ephemeral values. 
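  // (Illustrative note: ephemeral values are values that only feed
  // assumptions, e.g. the condition of an @llvm.assume call; they vanish
  // together with the assume, so there is no point in costing them.)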
6946 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6947 6948 SmallVector<Value *, 4> DeadInterleavePointerOps; 6949 SmallVector<Value *, 4> DeadOps; 6950 6951 // If a scalar epilogue is required, users outside the loop won't use 6952 // live-outs from the vector loop but from the scalar epilogue. Ignore them if 6953 // that is the case. 6954 bool RequiresScalarEpilogue = requiresScalarEpilogue(true); 6955 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) { 6956 return RequiresScalarEpilogue && 6957 !TheLoop->contains(cast<Instruction>(U)->getParent()); 6958 }; 6959 6960 LoopBlocksDFS DFS(TheLoop); 6961 DFS.perform(LI); 6962 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps; 6963 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO()))) 6964 for (Instruction &I : reverse(*BB)) { 6965 // Find all stores to invariant variables. Since they are going to sink 6966 // outside the loop, we do not need to calculate their cost. 6967 StoreInst *SI; 6968 if ((SI = dyn_cast<StoreInst>(&I)) && 6969 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { 6970 ValuesToIgnore.insert(&I); 6971 DeadInvariantStoreOps[SI->getPointerOperand()].push_back( 6972 SI->getValueOperand()); 6973 } 6974 6975 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I)) 6976 continue; 6977 6978 // Add instructions that would be trivially dead and are only used by 6979 // values already ignored to DeadOps to seed the worklist. 6980 if (wouldInstructionBeTriviallyDead(&I, TLI) && 6981 all_of(I.users(), [this, IsLiveOutDead](User *U) { 6982 return VecValuesToIgnore.contains(U) || 6983 ValuesToIgnore.contains(U) || IsLiveOutDead(U); 6984 })) 6985 DeadOps.push_back(&I); 6986 6987 // For interleave groups, we only create a pointer for the start of the 6988 // interleave group. Queue up addresses of group members except the insert 6989 // position for further processing. 6990 if (isAccessInterleaved(&I)) { 6991 auto *Group = getInterleavedAccessGroup(&I); 6992 if (Group->getInsertPos() == &I) 6993 continue; 6994 Value *PointerOp = getLoadStorePointerOperand(&I); 6995 DeadInterleavePointerOps.push_back(PointerOp); 6996 } 6997 6998 // Queue branches for analysis. They are dead if their successors only 6999 // contain dead instructions. 7000 if (auto *Br = dyn_cast<BranchInst>(&I)) { 7001 if (Br->isConditional()) 7002 DeadOps.push_back(&I); 7003 } 7004 } 7005 7006 // Mark ops feeding interleave group members as free if they are only used 7007 // by other dead computations. 7008 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { 7009 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]); 7010 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) { 7011 Instruction *UI = cast<Instruction>(U); 7012 return !VecValuesToIgnore.contains(U) && 7013 (!isAccessInterleaved(UI) || 7014 getInterleavedAccessGroup(UI)->getInsertPos() == UI); 7015 })) 7016 continue; 7017 VecValuesToIgnore.insert(Op); 7018 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); 7019 } 7020 7021 for (const auto &[_, Ops] : DeadInvariantStoreOps) { 7022 for (Value *Op : ArrayRef(Ops).drop_back()) 7023 DeadOps.push_back(Op); 7024 } 7025 // Mark ops that would be trivially dead and are only used by ignored 7026 // instructions as free. 7027 BasicBlock *Header = TheLoop->getHeader(); 7028 7029 // Returns true if the block contains only dead instructions.
Such blocks will 7030 // be removed by VPlan-to-VPlan transforms and won't be considered by the 7031 // VPlan-based cost model, so skip them in the legacy cost-model as well. 7032 auto IsEmptyBlock = [this](BasicBlock *BB) { 7033 return all_of(*BB, [this](Instruction &I) { 7034 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) || 7035 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional()); 7036 }); 7037 }; 7038 for (unsigned I = 0; I != DeadOps.size(); ++I) { 7039 auto *Op = dyn_cast<Instruction>(DeadOps[I]); 7040 7041 // Check if the branch should be considered dead. 7042 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) { 7043 BasicBlock *ThenBB = Br->getSuccessor(0); 7044 BasicBlock *ElseBB = Br->getSuccessor(1); 7045 // Don't consider branches leaving the loop for simplification. 7046 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB)) 7047 continue; 7048 bool ThenEmpty = IsEmptyBlock(ThenBB); 7049 bool ElseEmpty = IsEmptyBlock(ElseBB); 7050 if ((ThenEmpty && ElseEmpty) || 7051 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB && 7052 ElseBB->phis().empty()) || 7053 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB && 7054 ThenBB->phis().empty())) { 7055 VecValuesToIgnore.insert(Br); 7056 DeadOps.push_back(Br->getCondition()); 7057 } 7058 continue; 7059 } 7060 7061 // Skip any op that shouldn't be considered dead. 7062 if (!Op || !TheLoop->contains(Op) || 7063 (isa<PHINode>(Op) && Op->getParent() == Header) || 7064 !wouldInstructionBeTriviallyDead(Op, TLI) || 7065 any_of(Op->users(), [this, IsLiveOutDead](User *U) { 7066 return !VecValuesToIgnore.contains(U) && 7067 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U); 7068 })) 7069 continue; 7070 7071 if (!TheLoop->contains(Op->getParent())) 7072 continue; 7073 7074 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore, 7075 // which applies to both the scalar and vector versions. Otherwise it is only 7076 // dead in vector versions, so only add it to VecValuesToIgnore. 7077 if (all_of(Op->users(), 7078 [this](User *U) { return ValuesToIgnore.contains(U); })) 7079 ValuesToIgnore.insert(Op); 7080 7081 VecValuesToIgnore.insert(Op); 7082 DeadOps.append(Op->op_begin(), Op->op_end()); 7083 } 7084 7085 // Ignore type-promoting instructions we identified during reduction 7086 // detection. 7087 for (const auto &Reduction : Legal->getReductionVars()) { 7088 const RecurrenceDescriptor &RedDes = Reduction.second; 7089 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7090 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7091 } 7092 // Ignore type-casting instructions we identified during induction 7093 // detection. 7094 for (const auto &Induction : Legal->getInductionVars()) { 7095 const InductionDescriptor &IndDes = Induction.second; 7096 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7097 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7098 } 7099 } 7100 7101 void LoopVectorizationCostModel::collectInLoopReductions() { 7102 for (const auto &Reduction : Legal->getReductionVars()) { 7103 PHINode *Phi = Reduction.first; 7104 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7105 7106 // We don't collect reductions that are type promoted (yet). 7107 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7108 continue; 7109 7110 // If the target would prefer this reduction to happen "in-loop", then we 7111 // want to record it as such.
7112 unsigned Opcode = RdxDesc.getOpcode(); 7113 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7114 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7115 TargetTransformInfo::ReductionFlags())) 7116 continue; 7117 7118 // Check that we can correctly put the reductions into the loop, by 7119 // finding the chain of operations that leads from the phi to the loop 7120 // exit value. 7121 SmallVector<Instruction *, 4> ReductionOperations = 7122 RdxDesc.getReductionOpChain(Phi, TheLoop); 7123 bool InLoop = !ReductionOperations.empty(); 7124 7125 if (InLoop) { 7126 InLoopReductions.insert(Phi); 7127 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7128 Instruction *LastChain = Phi; 7129 for (auto *I : ReductionOperations) { 7130 InLoopReductionImmediateChains[I] = LastChain; 7131 LastChain = I; 7132 } 7133 } 7134 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7135 << " reduction for phi: " << *Phi << "\n"); 7136 } 7137 } 7138 7139 // This function will select a scalable VF if the target supports scalable 7140 // vectors and a fixed one otherwise. 7141 // TODO: we could return a pair of values that specify the max VF and 7142 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7143 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7144 // doesn't have a cost model that can choose which plan to execute if 7145 // more than one is generated. 7146 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, 7147 LoopVectorizationCostModel &CM) { 7148 unsigned WidestType; 7149 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7150 7151 TargetTransformInfo::RegisterKind RegKind = 7152 TTI.enableScalableVectorization() 7153 ? TargetTransformInfo::RGK_ScalableVector 7154 : TargetTransformInfo::RGK_FixedWidthVector; 7155 7156 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); 7157 unsigned N = RegSize.getKnownMinValue() / WidestType; 7158 return ElementCount::get(N, RegSize.isScalable()); 7159 } 7160 7161 VectorizationFactor 7162 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7163 ElementCount VF = UserVF; 7164 // Outer loop handling: They may require CFG and instruction level 7165 // transformations before even evaluating whether vectorization is profitable. 7166 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7167 // the vectorization pipeline. 7168 if (!OrigLoop->isInnermost()) { 7169 // If the user doesn't provide a vectorization factor, determine a 7170 // reasonable one. 7171 if (UserVF.isZero()) { 7172 VF = determineVPlanVF(TTI, CM); 7173 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7174 7175 // Make sure we have a VF > 1 for stress testing. 7176 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7177 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7178 << "overriding computed VF.\n"); 7179 VF = ElementCount::getFixed(4); 7180 } 7181 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && 7182 !ForceTargetSupportsScalableVectors) { 7183 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. 
Scalable VF requested, but " 7184 << "not supported by the target.\n"); 7185 reportVectorizationFailure( 7186 "Scalable vectorization requested but not supported by the target", 7187 "the scalable user-specified vectorization width for outer-loop " 7188 "vectorization cannot be used because the target does not support " 7189 "scalable vectors.", 7190 "ScalableVFUnfeasible", ORE, OrigLoop); 7191 return VectorizationFactor::Disabled(); 7192 } 7193 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7194 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7195 "VF needs to be a power of two"); 7196 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7197 << "VF " << VF << " to build VPlans.\n"); 7198 buildVPlans(VF, VF); 7199 7200 // For VPlan build stress testing, we bail out after VPlan construction. 7201 if (VPlanBuildStressTest) 7202 return VectorizationFactor::Disabled(); 7203 7204 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7205 } 7206 7207 LLVM_DEBUG( 7208 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7209 "VPlan-native path.\n"); 7210 return VectorizationFactor::Disabled(); 7211 } 7212 7213 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7214 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7215 CM.collectValuesToIgnore(); 7216 CM.collectElementTypesForWidening(); 7217 7218 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7219 if (!MaxFactors) // Cases that should not be vectorized nor interleaved. 7220 return; 7221 7222 // Invalidate interleave groups if all blocks of the loop will be predicated. 7223 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7224 !useMaskedInterleavedAccesses(TTI)) { 7225 LLVM_DEBUG( 7226 dbgs() 7227 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7228 "which requires masked-interleaved support.\n"); 7229 if (CM.InterleaveInfo.invalidateGroups()) 7230 // Invalidating interleave groups also requires invalidating all decisions 7231 // based on them, which includes widening decisions and uniform and scalar 7232 // values. 7233 CM.invalidateCostModelingDecisions(); 7234 } 7235 7236 if (CM.foldTailByMasking()) 7237 Legal->prepareToFoldTailByMasking(); 7238 7239 ElementCount MaxUserVF = 7240 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7241 if (UserVF) { 7242 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) { 7243 reportVectorizationInfo( 7244 "UserVF ignored because it may be larger than the maximal safe VF", 7245 "InvalidUserVF", ORE, OrigLoop); 7246 } else { 7247 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7248 "VF needs to be a power of two"); 7249 // Collect the instructions (and their associated costs) that will be more 7250 // profitable to scalarize. 7251 CM.collectInLoopReductions(); 7252 if (CM.selectUserVectorizationFactor(UserVF)) { 7253 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7254 buildVPlansWithVPRecipes(UserVF, UserVF); 7255 LLVM_DEBUG(printPlans(dbgs())); 7256 return; 7257 } 7258 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7259 "InvalidCost", ORE, OrigLoop); 7260 } 7261 } 7262 7263 // Collect the Vectorization Factor Candidates.
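// Candidates are collected as powers of two from 1 up to the maximum feasible
// fixed and scalable factors. For example (illustrative values), with
// MaxFactors.FixedVF == 16 and MaxFactors.ScalableVF == vscale x 4, the list
// becomes {1, 2, 4, 8, 16, vscale x 1, vscale x 2, vscale x 4}.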
7264 SmallVector<ElementCount> VFCandidates; 7265 for (auto VF = ElementCount::getFixed(1); 7266 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7267 VFCandidates.push_back(VF); 7268 for (auto VF = ElementCount::getScalable(1); 7269 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7270 VFCandidates.push_back(VF); 7271 7272 CM.collectInLoopReductions(); 7273 for (const auto &VF : VFCandidates) { 7274 // Collect Uniform and Scalar instructions after vectorization with VF. 7275 CM.collectUniformsAndScalars(VF); 7276 7277 // Collect the instructions (and their associated costs) that will be more 7278 // profitable to scalarize. 7279 if (VF.isVector()) 7280 CM.collectInstsToScalarize(VF); 7281 } 7282 7283 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7284 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7285 7286 LLVM_DEBUG(printPlans(dbgs())); 7287 } 7288 7289 InstructionCost VPCostContext::getLegacyCost(Instruction *UI, 7290 ElementCount VF) const { 7291 if (ForceTargetInstructionCost.getNumOccurrences()) 7292 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences()); 7293 return CM.getInstructionCost(UI, VF); 7294 } 7295 7296 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { 7297 return CM.ValuesToIgnore.contains(UI) || 7298 (IsVector && CM.VecValuesToIgnore.contains(UI)) || 7299 SkipCostComputation.contains(UI); 7300 } 7301 7302 InstructionCost 7303 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, 7304 VPCostContext &CostCtx) const { 7305 InstructionCost Cost; 7306 // Cost modeling for inductions is inaccurate in the legacy cost model 7307 // compared to the recipes that are generated. To match here initially during 7308 // VPlan cost model bring up directly use the induction costs from the legacy 7309 // cost model. Note that we do this as pre-processing; the VPlan may not have 7310 // any recipes associated with the original induction increment instruction 7311 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute 7312 // the cost of induction phis and increments (both that are represented by 7313 // recipes and those that are not), to avoid distinguishing between them here, 7314 // and skip all recipes that represent induction phis and increments (the 7315 // former case) later on, if they exist, to avoid counting them twice. 7316 // Similarly we pre-compute the cost of any optimized truncates. 7317 // TODO: Switch to more accurate costing based on VPlan. 7318 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) { 7319 Instruction *IVInc = cast<Instruction>( 7320 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 7321 SmallVector<Instruction *> IVInsts = {IVInc}; 7322 for (unsigned I = 0; I != IVInsts.size(); I++) { 7323 for (Value *Op : IVInsts[I]->operands()) { 7324 auto *OpI = dyn_cast<Instruction>(Op); 7325 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse()) 7326 continue; 7327 IVInsts.push_back(OpI); 7328 } 7329 } 7330 IVInsts.push_back(IV); 7331 for (User *U : IV->users()) { 7332 auto *CI = cast<Instruction>(U); 7333 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF)) 7334 continue; 7335 IVInsts.push_back(CI); 7336 } 7337 7338 // If the vector loop gets executed exactly once with the given VF, ignore 7339 // the costs of comparison and induction instructions, as they'll get 7340 // simplified away. 
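// For example (illustrative), with a constant trip count of 8, VF == 8 and no
// tail folding, the vector body executes exactly once, so its induction update
// and exit compare are expected to fold away and should not be costed.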
7341 // TODO: Remove this code after stepping away from the legacy cost model and 7342 // adding code to simplify VPlans before calculating their costs. 7343 auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop); 7344 if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking()) 7345 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(), 7346 CostCtx.SkipCostComputation); 7347 7348 for (Instruction *IVInst : IVInsts) { 7349 if (CostCtx.skipCostComputation(IVInst, VF.isVector())) 7350 continue; 7351 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF); 7352 LLVM_DEBUG({ 7353 dbgs() << "Cost of " << InductionCost << " for VF " << VF 7354 << ": induction instruction " << *IVInst << "\n"; 7355 }); 7356 Cost += InductionCost; 7357 CostCtx.SkipCostComputation.insert(IVInst); 7358 } 7359 } 7360 7361 /// Compute the cost of all exiting conditions of the loop using the legacy 7362 /// cost model. This is to match the legacy behavior, which adds the cost of 7363 /// all exit conditions. Note that this over-estimates the cost, as there will 7364 /// be a single condition to control the vector loop. 7365 SmallVector<BasicBlock *> Exiting; 7366 CM.TheLoop->getExitingBlocks(Exiting); 7367 SetVector<Instruction *> ExitInstrs; 7368 // Collect all exit conditions. 7369 for (BasicBlock *EB : Exiting) { 7370 auto *Term = dyn_cast<BranchInst>(EB->getTerminator()); 7371 if (!Term) 7372 continue; 7373 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) { 7374 ExitInstrs.insert(CondI); 7375 } 7376 } 7377 // Compute the cost of all instructions only feeding the exit conditions. 7378 for (unsigned I = 0; I != ExitInstrs.size(); ++I) { 7379 Instruction *CondI = ExitInstrs[I]; 7380 if (!OrigLoop->contains(CondI) || 7381 !CostCtx.SkipCostComputation.insert(CondI).second) 7382 continue; 7383 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF); 7384 LLVM_DEBUG({ 7385 dbgs() << "Cost of " << CondICost << " for VF " << VF 7386 << ": exit condition instruction " << *CondI << "\n"; 7387 }); 7388 Cost += CondICost; 7389 for (Value *Op : CondI->operands()) { 7390 auto *OpI = dyn_cast<Instruction>(Op); 7391 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) { 7392 return OrigLoop->contains(cast<Instruction>(U)->getParent()) && 7393 !ExitInstrs.contains(cast<Instruction>(U)); 7394 })) 7395 continue; 7396 ExitInstrs.insert(OpI); 7397 } 7398 } 7399 7400 // The legacy cost model has special logic to compute the cost of in-loop 7401 // reductions, which may be smaller than the sum of all instructions involved 7402 // in the reduction. 7403 // TODO: Switch to costing based on VPlan once the logic has been ported. 7404 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) { 7405 if (ForceTargetInstructionCost.getNumOccurrences()) 7406 continue; 7407 7408 if (!CM.isInLoopReduction(RedPhi)) 7409 continue; 7410 7411 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop); 7412 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(), 7413 ChainOps.end()); 7414 auto IsZExtOrSExt = [](const unsigned Opcode) -> bool { 7415 return Opcode == Instruction::ZExt || Opcode == Instruction::SExt; 7416 }; 7417 // Also include the operands of instructions in the chain, as the cost-model 7418 // may mark extends as free. 7419 // 7420 // For ARM, some of the instructions can be folded into the reduction 7421 // instruction. So we need to mark all folded instructions free.
7422 // For example: We can fold reduce(mul(ext(A), ext(B))) into one 7423 // instruction. 7424 for (auto *ChainOp : ChainOps) { 7425 for (Value *Op : ChainOp->operands()) { 7426 if (auto *I = dyn_cast<Instruction>(Op)) { 7427 ChainOpsAndOperands.insert(I); 7428 if (I->getOpcode() == Instruction::Mul) { 7429 auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0)); 7430 auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1)); 7431 if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 && 7432 Ext0->getOpcode() == Ext1->getOpcode()) { 7433 ChainOpsAndOperands.insert(Ext0); 7434 ChainOpsAndOperands.insert(Ext1); 7435 } 7436 } 7437 } 7438 } 7439 } 7440 7441 // Pre-compute the cost for I, if it has a reduction pattern cost. 7442 for (Instruction *I : ChainOpsAndOperands) { 7443 auto ReductionCost = CM.getReductionPatternCost( 7444 I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); 7445 if (!ReductionCost) 7446 continue; 7447 7448 assert(!CostCtx.SkipCostComputation.contains(I) && 7449 "reduction op visited multiple times"); 7450 CostCtx.SkipCostComputation.insert(I); 7451 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF 7452 << ":\n in-loop reduction " << *I << "\n"); 7453 Cost += *ReductionCost; 7454 } 7455 } 7456 7457 // Pre-compute the costs for branches except for the backedge, as the number 7458 // of replicate regions in a VPlan may not directly match the number of 7459 // branches, which would lead to different decisions. 7460 // TODO: Compute cost of branches for each replicate region in the VPlan, 7461 // which is more accurate than the legacy cost model. 7462 for (BasicBlock *BB : OrigLoop->blocks()) { 7463 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector())) 7464 continue; 7465 CostCtx.SkipCostComputation.insert(BB->getTerminator()); 7466 if (BB == OrigLoop->getLoopLatch()) 7467 continue; 7468 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF); 7469 Cost += BranchCost; 7470 } 7471 7472 // Pre-compute costs for instructions that are forced-scalar or profitable to 7473 // scalarize. Their costs will be computed separately in the legacy cost 7474 // model. 7475 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) { 7476 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector())) 7477 continue; 7478 CostCtx.SkipCostComputation.insert(ForcedScalar); 7479 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF); 7480 LLVM_DEBUG({ 7481 dbgs() << "Cost of " << ForcedCost << " for VF " << VF 7482 << ": forced scalar " << *ForcedScalar << "\n"; 7483 }); 7484 Cost += ForcedCost; 7485 } 7486 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) { 7487 if (CostCtx.skipCostComputation(Scalarized, VF.isVector())) 7488 continue; 7489 CostCtx.SkipCostComputation.insert(Scalarized); 7490 LLVM_DEBUG({ 7491 dbgs() << "Cost of " << ScalarCost << " for VF " << VF 7492 << ": profitable to scalarize " << *Scalarized << "\n"; 7493 }); 7494 Cost += ScalarCost; 7495 } 7496 7497 return Cost; 7498 } 7499 7500 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, 7501 ElementCount VF) const { 7502 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM); 7503 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); 7504 7505 // Now compute and add the VPlan-based cost. 
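// The total cost for this VF is thus the legacy pre-computed portion above plus
// the per-recipe VPlan cost below; instructions already recorded in
// SkipCostComputation are skipped during the VPlan traversal so they are not
// counted twice.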
7506 Cost += Plan.cost(VF, CostCtx); 7507 #ifndef NDEBUG 7508 unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF); 7509 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost 7510 << " (Estimated cost per lane: "); 7511 if (Cost.isValid()) { 7512 double CostPerLane = double(*Cost.getValue()) / EstimatedWidth; 7513 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane)); 7514 } else /* No point dividing an invalid cost - it will still be invalid */ 7515 LLVM_DEBUG(dbgs() << "Invalid"); 7516 LLVM_DEBUG(dbgs() << ")\n"); 7517 #endif 7518 return Cost; 7519 } 7520 7521 #ifndef NDEBUG 7522 /// Return true if the original loop \p TheLoop contains any instructions that do 7523 /// not have corresponding recipes in \p Plan and are not marked to be ignored 7524 /// in \p CostCtx. This means the VPlan contains simplifications that the legacy 7525 /// cost-model did not account for. 7526 static bool planContainsAdditionalSimplifications(VPlan &Plan, 7527 VPCostContext &CostCtx, 7528 Loop *TheLoop) { 7529 // First collect all instructions for the recipes in Plan. 7530 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * { 7531 if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) 7532 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 7533 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) 7534 return &WidenMem->getIngredient(); 7535 return nullptr; 7536 }; 7537 7538 DenseSet<Instruction *> SeenInstrs; 7539 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()); 7540 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 7541 for (VPRecipeBase &R : *VPBB) { 7542 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) { 7543 auto *IG = IR->getInterleaveGroup(); 7544 unsigned NumMembers = IG->getNumMembers(); 7545 for (unsigned I = 0; I != NumMembers; ++I) { 7546 if (Instruction *M = IG->getMember(I)) 7547 SeenInstrs.insert(M); 7548 } 7549 continue; 7550 } 7551 // The VPlan-based cost model is more accurate for partial reductions and 7552 // comparing against the legacy cost isn't desirable. 7553 if (isa<VPPartialReductionRecipe>(&R)) 7554 return true; 7555 if (Instruction *UI = GetInstructionForCost(&R)) 7556 SeenInstrs.insert(UI); 7557 } 7558 } 7559 7560 // Return true if the loop contains any instructions that are not also part of 7561 // the VPlan or are skipped for VPlan-based cost computations. This indicates 7562 // that the VPlan contains extra simplifications. 7563 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx, 7564 TheLoop](BasicBlock *BB) { 7565 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) { 7566 if (isa<PHINode>(&I) && BB == TheLoop->getHeader()) 7567 return false; 7568 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true); 7569 }); 7570 }); 7571 } 7572 #endif 7573 7574 VectorizationFactor LoopVectorizationPlanner::computeBestVF() { 7575 if (VPlans.empty()) 7576 return VectorizationFactor::Disabled(); 7577 // If there is a single VPlan with a single VF, return it directly. 7578 VPlan &FirstPlan = *VPlans[0]; 7579 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1) 7580 return {*FirstPlan.vectorFactors().begin(), 0, 0}; 7581 7582 ElementCount ScalarVF = ElementCount::getFixed(1); 7583 assert(hasPlanWithVF(ScalarVF) && 7584 "More than a single plan/VF w/o any plan having scalar VF"); 7585 7586 // TODO: Compute scalar cost using VPlan-based cost model.
7587 InstructionCost ScalarCost = CM.expectedCost(ScalarVF); 7588 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n"); 7589 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost); 7590 VectorizationFactor BestFactor = ScalarFactor; 7591 7592 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 7593 if (ForceVectorization) { 7594 // Ignore scalar width, because the user explicitly wants vectorization. 7595 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 7596 // evaluation. 7597 BestFactor.Cost = InstructionCost::getMax(); 7598 } 7599 7600 for (auto &P : VPlans) { 7601 for (ElementCount VF : P->vectorFactors()) { 7602 if (VF.isScalar()) 7603 continue; 7604 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { 7605 LLVM_DEBUG( 7606 dbgs() 7607 << "LV: Not considering vector loop of width " << VF 7608 << " because it will not generate any vector instructions.\n"); 7609 continue; 7610 } 7611 7612 InstructionCost Cost = cost(*P, VF); 7613 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); 7614 if (isMoreProfitable(CurrentFactor, BestFactor)) 7615 BestFactor = CurrentFactor; 7616 7617 // If profitable add it to ProfitableVF list. 7618 if (isMoreProfitable(CurrentFactor, ScalarFactor)) 7619 ProfitableVFs.push_back(CurrentFactor); 7620 } 7621 } 7622 7623 #ifndef NDEBUG 7624 // Select the optimal vectorization factor according to the legacy cost-model. 7625 // This is now only used to verify the decisions by the new VPlan-based 7626 // cost-model and will be retired once the VPlan-based cost-model is 7627 // stabilized. 7628 VectorizationFactor LegacyVF = selectVectorizationFactor(); 7629 VPlan &BestPlan = getPlanFor(BestFactor.Width); 7630 7631 // Pre-compute the cost and use it to check if BestPlan contains any 7632 // simplifications not accounted for in the legacy cost model. If that's the 7633 // case, don't trigger the assertion, as the extra simplifications may cause a 7634 // different VF to be picked by the VPlan-based cost model. 7635 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM); 7636 precomputeCosts(BestPlan, BestFactor.Width, CostCtx); 7637 assert((BestFactor.Width == LegacyVF.Width || 7638 planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), 7639 CostCtx, OrigLoop) || 7640 planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width), 7641 CostCtx, OrigLoop)) && 7642 " VPlan cost model and legacy cost model disagreed"); 7643 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && 7644 "when vectorizing, the scalar cost must be computed."); 7645 #endif 7646 7647 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n"); 7648 return BestFactor; 7649 } 7650 7651 static void addRuntimeUnrollDisableMetaData(Loop *L) { 7652 SmallVector<Metadata *, 4> MDs; 7653 // Reserve first location for self reference to the LoopID metadata node. 7654 MDs.push_back(nullptr); 7655 bool IsUnrollMetadata = false; 7656 MDNode *LoopID = L->getLoopID(); 7657 if (LoopID) { 7658 // First find existing loop unrolling disable metadata. 
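// For illustration, a loop ID that already disables unrolling looks roughly
// like:
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.disable"}
// in which case nothing needs to be added below.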
7659 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { 7660 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I)); 7661 if (MD) { 7662 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7663 IsUnrollMetadata = 7664 S && S->getString().starts_with("llvm.loop.unroll.disable"); 7665 } 7666 MDs.push_back(LoopID->getOperand(I)); 7667 } 7668 } 7669 7670 if (!IsUnrollMetadata) { 7671 // Add runtime unroll disable metadata. 7672 LLVMContext &Context = L->getHeader()->getContext(); 7673 SmallVector<Metadata *, 1> DisableOperands; 7674 DisableOperands.push_back( 7675 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7676 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7677 MDs.push_back(DisableNode); 7678 MDNode *NewLoopID = MDNode::get(Context, MDs); 7679 // Set operand 0 to refer to the loop id itself. 7680 NewLoopID->replaceOperandWith(0, NewLoopID); 7681 L->setLoopID(NewLoopID); 7682 } 7683 } 7684 7685 // If \p R is a ComputeReductionResult when vectorizing the epilog loop, 7686 // fix the reduction's scalar PHI node by adding the incoming value from the 7687 // main vector loop. 7688 static void fixReductionScalarResumeWhenVectorizingEpilog( 7689 VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, 7690 BasicBlock *BypassBlock) { 7691 auto *EpiRedResult = dyn_cast<VPInstruction>(R); 7692 if (!EpiRedResult || 7693 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult) 7694 return; 7695 7696 auto *EpiRedHeaderPhi = 7697 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0)); 7698 const RecurrenceDescriptor &RdxDesc = 7699 EpiRedHeaderPhi->getRecurrenceDescriptor(); 7700 Value *MainResumeValue = 7701 EpiRedHeaderPhi->getStartValue()->getUnderlyingValue(); 7702 if (RecurrenceDescriptor::isAnyOfRecurrenceKind( 7703 RdxDesc.getRecurrenceKind())) { 7704 auto *Cmp = cast<ICmpInst>(MainResumeValue); 7705 assert(Cmp->getPredicate() == CmpInst::ICMP_NE && 7706 "AnyOf expected to start with ICMP_NE"); 7707 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() && 7708 "AnyOf expected to start by comparing main resume value to original " 7709 "start value"); 7710 MainResumeValue = Cmp->getOperand(0); 7711 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( 7712 RdxDesc.getRecurrenceKind())) { 7713 using namespace llvm::PatternMatch; 7714 Value *Cmp, *OrigResumeV; 7715 bool IsExpectedPattern = 7716 match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), 7717 m_Specific(RdxDesc.getSentinelValue()), 7718 m_Value(OrigResumeV))) && 7719 match(Cmp, 7720 m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), 7721 m_Specific(RdxDesc.getRecurrenceStartValue()))); 7722 assert(IsExpectedPattern && "Unexpected reduction resume pattern"); 7723 (void)IsExpectedPattern; 7724 MainResumeValue = OrigResumeV; 7725 } 7726 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue); 7727 7728 // When fixing reductions in the epilogue loop we should already have 7729 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry 7730 // over the incoming values correctly. 
7731 using namespace VPlanPatternMatch; 7732 auto IsResumePhi = [](VPUser *U) { 7733 return match( 7734 U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue())); 7735 }; 7736 assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 && 7737 "ResumePhi must have a single user"); 7738 auto *EpiResumePhiVPI = 7739 cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi)); 7740 auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true)); 7741 EpiResumePhi->setIncomingValueForBlock( 7742 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock)); 7743 } 7744 7745 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( 7746 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7747 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue, 7748 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7749 assert(BestVPlan.hasVF(BestVF) && 7750 "Trying to execute plan with unsupported VF"); 7751 assert(BestVPlan.hasUF(BestUF) && 7752 "Trying to execute plan with unsupported UF"); 7753 assert( 7754 ((VectorizingEpilogue && ExpandedSCEVs) || 7755 (!VectorizingEpilogue && !ExpandedSCEVs)) && 7756 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7757 7758 // TODO: Move to VPlan transform stage once the transition to the VPlan-based 7759 // cost model is complete for better cost estimates. 7760 VPlanTransforms::unrollByUF(BestVPlan, BestUF, 7761 OrigLoop->getHeader()->getContext()); 7762 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7763 VPlanTransforms::convertToConcreteRecipes(BestVPlan); 7764 7765 // Perform the actual loop transformation. 7766 VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV, 7767 &BestVPlan, OrigLoop->getParentLoop(), 7768 Legal->getWidestInductionType()); 7769 7770 #ifdef EXPENSIVE_CHECKS 7771 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 7772 #endif 7773 7774 // 0. Generate SCEV-dependent code in the entry, including TripCount, before 7775 // making any changes to the CFG. 7776 if (!BestVPlan.getEntry()->empty()) 7777 BestVPlan.getEntry()->execute(&State); 7778 7779 if (!ILV.getTripCount()) 7780 ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0))); 7781 else 7782 assert(VectorizingEpilogue && "should only re-use the existing trip " 7783 "count during epilogue vectorization"); 7784 7785 // 1. Set up the skeleton for vectorization, including vector pre-header and 7786 // middle block. The vector loop is created during VPlan execution. 7787 VPBasicBlock *VectorPH = 7788 cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor()); 7789 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton( 7790 ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs); 7791 if (VectorizingEpilogue) 7792 VPlanTransforms::removeDeadRecipes(BestVPlan); 7793 7794 // Only use noalias metadata when using memory checks guaranteeing no overlap 7795 // across all iterations. 7796 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7797 std::unique_ptr<LoopVersioning> LVer = nullptr; 7798 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7799 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7800 7801 // We currently don't use LoopVersioning for the actual loop cloning but we 7802 // still use it to add the noalias metadata. 7803 // TODO: Find a better way to re-use LoopVersioning functionality to add 7804 // metadata. 
7805 LVer = std::make_unique<LoopVersioning>( 7806 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7807 PSE.getSE()); 7808 State.LVer = &*LVer; 7809 State.LVer->prepareNoAliasMetadata(); 7810 } 7811 7812 ILV.printDebugTracesAtStart(); 7813 7814 //===------------------------------------------------===// 7815 // 7816 // Notice: any optimization or new instruction that go 7817 // into the code below should also be implemented in 7818 // the cost-model. 7819 // 7820 //===------------------------------------------------===// 7821 7822 // 2. Copy and widen instructions from the old loop into the new loop. 7823 BestVPlan.prepareToExecute( 7824 ILV.getTripCount(), 7825 ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); 7826 replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); 7827 7828 BestVPlan.execute(&State); 7829 7830 auto *MiddleVPBB = BestVPlan.getMiddleBlock(); 7831 // 2.5 When vectorizing the epilogue, fix reduction and induction resume 7832 // values from the additional bypass block. 7833 if (VectorizingEpilogue) { 7834 assert(!ILV.Legal->hasUncountableEarlyExit() && 7835 "Epilogue vectorisation not yet supported with early exits"); 7836 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); 7837 for (VPRecipeBase &R : *MiddleVPBB) { 7838 fixReductionScalarResumeWhenVectorizingEpilog( 7839 &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock); 7840 } 7841 BasicBlock *PH = OrigLoop->getLoopPreheader(); 7842 for (const auto &[IVPhi, _] : Legal->getInductionVars()) { 7843 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); 7844 Value *V = ILV.getInductionAdditionalBypassValue(IVPhi); 7845 Inc->setIncomingValueForBlock(BypassBlock, V); 7846 } 7847 } 7848 7849 // 2.6. Maintain Loop Hints 7850 // Keep all loop hints from the original loop on the vector loop (we'll 7851 // replace the vectorizer-specific hints below). 7852 if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) { 7853 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7854 7855 std::optional<MDNode *> VectorizedLoopID = 7856 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7857 LLVMLoopVectorizeFollowupVectorized}); 7858 7859 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); 7860 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7861 if (VectorizedLoopID) { 7862 L->setLoopID(*VectorizedLoopID); 7863 } else { 7864 // Keep all loop hints from the original loop on the vector loop (we'll 7865 // replace the vectorizer-specific hints below). 7866 if (MDNode *LID = OrigLoop->getLoopID()) 7867 L->setLoopID(LID); 7868 7869 LoopVectorizeHints Hints(L, true, *ORE); 7870 Hints.setAlreadyVectorized(); 7871 } 7872 TargetTransformInfo::UnrollingPreferences UP; 7873 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7874 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) 7875 addRuntimeUnrollDisableMetaData(L); 7876 } 7877 7878 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7879 // predication, updating analyses. 7880 ILV.fixVectorizedLoop(State); 7881 7882 ILV.printDebugTracesAtEnd(); 7883 7884 // 4. Adjust branch weight of the branch in the middle block. 7885 if (BestVPlan.getVectorLoopRegion()) { 7886 auto *MiddleVPBB = BestVPlan.getMiddleBlock(); 7887 auto *MiddleTerm = 7888 cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); 7889 if (MiddleTerm->isConditional() && 7890 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 7891 // Assume that `Count % VectorTripCount` is equally distributed. 
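// Under that assumption the remainder is zero in roughly 1 out of VF * UF
// cases, so the weights computed below are {1, VF * UF - 1}; e.g. VF = 4 and
// UF = 2 (illustrative values) gives {1, 7}.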
7892 unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); 7893 assert(TripCount > 0 && "trip count should not be zero"); 7894 const uint32_t Weights[] = {1, TripCount - 1}; 7895 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); 7896 } 7897 } 7898 7899 return State.ExpandedSCEVs; 7900 } 7901 7902 //===--------------------------------------------------------------------===// 7903 // EpilogueVectorizerMainLoop 7904 //===--------------------------------------------------------------------===// 7905 7906 /// This function is partially responsible for generating the control flow 7907 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7908 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7909 const SCEV2ValueTy &ExpandedSCEVs) { 7910 createVectorLoopSkeleton(""); 7911 7912 // Generate the code to check the minimum iteration count of the vector 7913 // epilogue (see below). 7914 EPI.EpilogueIterationCountCheck = 7915 emitIterationCountCheck(LoopScalarPreHeader, true); 7916 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7917 7918 // Generate the code to check any assumptions that we've made for SCEV 7919 // expressions. 7920 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7921 7922 // Generate the code that checks at runtime if arrays overlap. We put the 7923 // checks into a separate block to make the more common case of few elements 7924 // faster. 7925 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7926 7927 // Generate the iteration count check for the main loop, *after* the check 7928 // for the epilogue loop, so that the path-length is shorter for the case 7929 // that goes directly through the vector epilogue. The longer-path length for 7930 // the main loop is compensated for, by the gain from vectorizing the larger 7931 // trip count. Note: the branch will get updated later on when we vectorize 7932 // the epilogue. 7933 EPI.MainLoopIterationCountCheck = 7934 emitIterationCountCheck(LoopScalarPreHeader, false); 7935 7936 // Generate the induction variable. 7937 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7938 7939 return LoopVectorPreHeader; 7940 } 7941 7942 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7943 LLVM_DEBUG({ 7944 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7945 << "Main Loop VF:" << EPI.MainLoopVF 7946 << ", Main Loop UF:" << EPI.MainLoopUF 7947 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7948 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7949 }); 7950 } 7951 7952 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7953 DEBUG_WITH_TYPE(VerboseDebug, { 7954 dbgs() << "intermediate fn:\n" 7955 << *OrigLoop->getHeader()->getParent() << "\n"; 7956 }); 7957 } 7958 7959 BasicBlock * 7960 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7961 bool ForEpilogue) { 7962 assert(Bypass && "Expected valid bypass basic block."); 7963 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7964 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7965 Value *Count = getTripCount(); 7966 // Reuse existing vector loop preheader for TC checks. 7967 // Note that new preheader block is generated for vector loop. 7968 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7969 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7970 7971 // Generate code to check if the loop's trip count is less than VF * UF of the 7972 // main vector loop. 
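// If a scalar epilogue is required, at least one iteration must be left for
// it, so the bypass is taken even when the count exactly equals the step
// (ULE); otherwise a strict comparison (ULT) suffices. E.g. (illustrative)
// with VF = 4 and UF = 2 the step is 8, and a trip count of exactly 8 bypasses
// the vector loop only in the scalar-epilogue-required case.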
7973 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7974 : VF.isVector()) 7975 ? ICmpInst::ICMP_ULE 7976 : ICmpInst::ICMP_ULT; 7977 7978 Value *CheckMinIters = Builder.CreateICmp( 7979 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7980 "min.iters.check"); 7981 7982 if (!ForEpilogue) 7983 TCCheckBlock->setName("vector.main.loop.iter.check"); 7984 7985 // Create new preheader for vector loop. 7986 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7987 DT, LI, nullptr, "vector.ph"); 7988 7989 if (ForEpilogue) { 7990 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7991 DT->getNode(Bypass)->getIDom()) && 7992 "TC check is expected to dominate Bypass"); 7993 7994 LoopBypassBlocks.push_back(TCCheckBlock); 7995 7996 // Save the trip count so we don't have to regenerate it in the 7997 // vec.epilog.iter.check. This is safe to do because the trip count 7998 // generated here dominates the vector epilog iter check. 7999 EPI.TripCount = Count; 8000 } 8001 8002 BranchInst &BI = 8003 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 8004 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 8005 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); 8006 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 8007 8008 introduceCheckBlockInVPlan(TCCheckBlock); 8009 return TCCheckBlock; 8010 } 8011 8012 //===--------------------------------------------------------------------===// 8013 // EpilogueVectorizerEpilogueLoop 8014 //===--------------------------------------------------------------------===// 8015 8016 /// This function is partially responsible for generating the control flow 8017 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8018 BasicBlock * 8019 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( 8020 const SCEV2ValueTy &ExpandedSCEVs) { 8021 createVectorLoopSkeleton("vec.epilog."); 8022 8023 // Now, compare the remaining count and if there aren't enough iterations to 8024 // execute the vectorized epilogue skip to the scalar part. 8025 LoopVectorPreHeader->setName("vec.epilog.ph"); 8026 BasicBlock *VecEpilogueIterationCountCheck = 8027 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, 8028 nullptr, "vec.epilog.iter.check", true); 8029 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 8030 VecEpilogueIterationCountCheck); 8031 AdditionalBypassBlock = VecEpilogueIterationCountCheck; 8032 8033 // Adjust the control flow taking the state info from the main loop 8034 // vectorization into account. 
8035 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8036 "expected this to be saved from the previous pass."); 8037 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8038 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8039 8040 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8041 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8042 8043 if (EPI.SCEVSafetyCheck) 8044 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8045 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8046 if (EPI.MemSafetyCheck) 8047 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8048 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8049 8050 DT->changeImmediateDominator(LoopScalarPreHeader, 8051 EPI.EpilogueIterationCountCheck); 8052 // Keep track of bypass blocks, as they feed start values to the induction and 8053 // reduction phis in the scalar loop preheader. 8054 if (EPI.SCEVSafetyCheck) 8055 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8056 if (EPI.MemSafetyCheck) 8057 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8058 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8059 8060 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 8061 // reductions which merge control-flow from the latch block and the middle 8062 // block. Update the incoming values here and move the Phi into the preheader. 8063 SmallVector<PHINode *, 4> PhisInBlock; 8064 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 8065 PhisInBlock.push_back(&Phi); 8066 8067 for (PHINode *Phi : PhisInBlock) { 8068 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 8069 Phi->replaceIncomingBlockWith( 8070 VecEpilogueIterationCountCheck->getSinglePredecessor(), 8071 VecEpilogueIterationCountCheck); 8072 8073 // If the phi doesn't have an incoming value from the 8074 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 8075 // value and also those from other check blocks. This is needed for 8076 // reduction phis only. 8077 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { 8078 return EPI.EpilogueIterationCountCheck == IncB; 8079 })) 8080 continue; 8081 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 8082 if (EPI.SCEVSafetyCheck) 8083 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 8084 if (EPI.MemSafetyCheck) 8085 Phi->removeIncomingValue(EPI.MemSafetyCheck); 8086 } 8087 8088 // Generate bypass values from the additional bypass block. Note that when the 8089 // vectorized epilogue is skipped due to iteration count check, then the 8090 // resume value for the induction variable comes from the trip count of the 8091 // main vector loop, passed as the second argument. 
8092 createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount); 8093 return LoopVectorPreHeader; 8094 } 8095 8096 BasicBlock * 8097 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8098 BasicBlock *Bypass, BasicBlock *Insert) { 8099 8100 assert(EPI.TripCount && 8101 "Expected trip count to have been saved in the first pass."); 8102 assert( 8103 (!isa<Instruction>(EPI.TripCount) || 8104 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8105 "saved trip count does not dominate insertion point."); 8106 Value *TC = EPI.TripCount; 8107 IRBuilder<> Builder(Insert->getTerminator()); 8108 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8109 8110 // Generate code to check if the loop's trip count is less than VF * UF of the 8111 // vector epilogue loop. 8112 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) 8113 ? ICmpInst::ICMP_ULE 8114 : ICmpInst::ICMP_ULT; 8115 8116 Value *CheckMinIters = 8117 Builder.CreateICmp(P, Count, 8118 createStepForVF(Builder, Count->getType(), 8119 EPI.EpilogueVF, EPI.EpilogueUF), 8120 "min.epilog.iters.check"); 8121 8122 BranchInst &BI = 8123 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 8124 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 8125 unsigned MainLoopStep = UF * VF.getKnownMinValue(); 8126 unsigned EpilogueLoopStep = 8127 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); 8128 // We assume the remaining `Count` is equally distributed in 8129 // [0, MainLoopStep) 8130 // So the probability for `Count < EpilogueLoopStep` should be 8131 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep 8132 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); 8133 const uint32_t Weights[] = {EstimatedSkipCount, 8134 MainLoopStep - EstimatedSkipCount}; 8135 setBranchWeights(BI, Weights, /*IsExpected=*/false); 8136 } 8137 ReplaceInstWithInst(Insert->getTerminator(), &BI); 8138 LoopBypassBlocks.push_back(Insert); 8139 8140 // A new entry block has been created for the epilogue VPlan. Hook it in, as 8141 // otherwise we would try to modify the entry to the main vector loop. 8142 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert); 8143 VPBasicBlock *OldEntry = Plan.getEntry(); 8144 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry); 8145 Plan.setEntry(NewEntry); 8146 // OldEntry is now dead and will be cleaned up when the plan gets destroyed. 
8147 8148 introduceCheckBlockInVPlan(Insert); 8149 return Insert; 8150 } 8151 8152 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8153 LLVM_DEBUG({ 8154 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8155 << "Epilogue Loop VF:" << EPI.EpilogueVF 8156 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8157 }); 8158 } 8159 8160 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8161 DEBUG_WITH_TYPE(VerboseDebug, { 8162 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8163 }); 8164 } 8165 8166 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>> 8167 VPRecipeBuilder::mapToVPValues(User::op_range Operands) { 8168 std::function<VPValue *(Value *)> Fn = [this](Value *Op) { 8169 return getVPValueOrAddLiveIn(Op); 8170 }; 8171 return map_range(Operands, Fn); 8172 } 8173 8174 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) { 8175 BasicBlock *Src = SI->getParent(); 8176 assert(!OrigLoop->isLoopExiting(Src) && 8177 all_of(successors(Src), 8178 [this](BasicBlock *Succ) { 8179 return OrigLoop->getHeader() != Succ; 8180 }) && 8181 "unsupported switch either exiting loop or continuing to header"); 8182 // Create masks where the terminator in Src is a switch. We create mask for 8183 // all edges at the same time. This is more efficient, as we can create and 8184 // collect compares for all cases once. 8185 VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition()); 8186 BasicBlock *DefaultDst = SI->getDefaultDest(); 8187 MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares; 8188 for (auto &C : SI->cases()) { 8189 BasicBlock *Dst = C.getCaseSuccessor(); 8190 assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created"); 8191 // Cases whose destination is the same as default are redundant and can be 8192 // ignored - they will get there anyhow. 8193 if (Dst == DefaultDst) 8194 continue; 8195 auto &Compares = Dst2Compares[Dst]; 8196 VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue()); 8197 Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V)); 8198 } 8199 8200 // We need to handle 2 separate cases below for all entries in Dst2Compares, 8201 // which excludes destinations matching the default destination. 8202 VPValue *SrcMask = getBlockInMask(Src); 8203 VPValue *DefaultMask = nullptr; 8204 for (const auto &[Dst, Conds] : Dst2Compares) { 8205 // 1. Dst is not the default destination. Dst is reached if any of the cases 8206 // with destination == Dst are taken. Join the conditions for each case 8207 // whose destination == Dst using an OR. 8208 VPValue *Mask = Conds[0]; 8209 for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front()) 8210 Mask = Builder.createOr(Mask, V); 8211 if (SrcMask) 8212 Mask = Builder.createLogicalAnd(SrcMask, Mask); 8213 EdgeMaskCache[{Src, Dst}] = Mask; 8214 8215 // 2. Create the mask for the default destination, which is reached if none 8216 // of the cases with destination != default destination are taken. Join the 8217 // conditions for each case where the destination is != Dst using an OR and 8218 // negate it. 8219 DefaultMask = DefaultMask ? 
Builder.createOr(DefaultMask, Mask) : Mask; 8220 } 8221 8222 if (DefaultMask) { 8223 DefaultMask = Builder.createNot(DefaultMask); 8224 if (SrcMask) 8225 DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask); 8226 } 8227 EdgeMaskCache[{Src, DefaultDst}] = DefaultMask; 8228 } 8229 8230 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 8231 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8232 8233 // Look for cached value. 8234 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8235 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8236 if (ECEntryIt != EdgeMaskCache.end()) 8237 return ECEntryIt->second; 8238 8239 if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) { 8240 createSwitchEdgeMasks(SI); 8241 assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?"); 8242 return EdgeMaskCache[Edge]; 8243 } 8244 8245 VPValue *SrcMask = getBlockInMask(Src); 8246 8247 // The terminator has to be a branch inst! 8248 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8249 assert(BI && "Unexpected terminator found"); 8250 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8251 return EdgeMaskCache[Edge] = SrcMask; 8252 8253 // If source is an exiting block, we know the exit edge is dynamically dead 8254 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8255 // adding uses of an otherwise potentially dead instruction unless we are 8256 // vectorizing a loop with uncountable exits. In that case, we always 8257 // materialize the mask. 8258 if (OrigLoop->isLoopExiting(Src) && 8259 Src != Legal->getUncountableEarlyExitingBlock()) 8260 return EdgeMaskCache[Edge] = SrcMask; 8261 8262 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); 8263 assert(EdgeMask && "No Edge Mask found for condition"); 8264 8265 if (BI->getSuccessor(0) != Dst) 8266 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8267 8268 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8269 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask 8270 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' 8271 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8272 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc()); 8273 } 8274 8275 return EdgeMaskCache[Edge] = EdgeMask; 8276 } 8277 8278 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { 8279 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8280 8281 // Look for cached value. 8282 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8283 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge); 8284 assert(ECEntryIt != EdgeMaskCache.end() && 8285 "looking up mask for edge which has not been created"); 8286 return ECEntryIt->second; 8287 } 8288 8289 void VPRecipeBuilder::createHeaderMask() { 8290 BasicBlock *Header = OrigLoop->getHeader(); 8291 8292 // When not folding the tail, use nullptr to model all-true mask. 8293 if (!CM.foldTailByMasking()) { 8294 BlockMaskCache[Header] = nullptr; 8295 return; 8296 } 8297 8298 // Introduce the early-exit compare IV <= BTC to form header block mask. 8299 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8300 // constructing the desired canonical IV in the header block as its first 8301 // non-phi instructions. 
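// Example (illustrative): for an i8 induction with a trip count of 256, TC
// wraps to 0 while BTC == 255 is still representable, so "IV <= BTC" remains a
// correct lane-active test where "IV < TC" would not be.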
8302 8303 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); 8304 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8305 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); 8306 HeaderVPBB->insert(IV, NewInsertionPoint); 8307 8308 VPBuilder::InsertPointGuard Guard(Builder); 8309 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8310 VPValue *BlockMask = nullptr; 8311 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 8312 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); 8313 BlockMaskCache[Header] = BlockMask; 8314 } 8315 8316 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { 8317 // Return the cached value. 8318 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); 8319 assert(BCEntryIt != BlockMaskCache.end() && 8320 "Trying to access mask for block without one."); 8321 return BCEntryIt->second; 8322 } 8323 8324 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { 8325 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8326 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); 8327 assert(OrigLoop->getHeader() != BB && 8328 "Loop header must have cached block mask"); 8329 8330 // All-one mask is modelled as no-mask following the convention for masked 8331 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8332 VPValue *BlockMask = nullptr; 8333 // This is the block mask. We OR all unique incoming edges. 8334 for (auto *Predecessor : 8335 SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) { 8336 VPValue *EdgeMask = createEdgeMask(Predecessor, BB); 8337 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. 8338 BlockMaskCache[BB] = EdgeMask; 8339 return; 8340 } 8341 8342 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8343 BlockMask = EdgeMask; 8344 continue; 8345 } 8346 8347 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8348 } 8349 8350 BlockMaskCache[BB] = BlockMask; 8351 } 8352 8353 VPWidenMemoryRecipe * 8354 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, 8355 VFRange &Range) { 8356 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8357 "Must be called with either a load or store"); 8358 8359 auto WillWiden = [&](ElementCount VF) -> bool { 8360 LoopVectorizationCostModel::InstWidening Decision = 8361 CM.getWideningDecision(I, VF); 8362 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8363 "CM decision should be taken at this point."); 8364 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8365 return true; 8366 if (CM.isScalarAfterVectorization(I, VF) || 8367 CM.isProfitableToScalarize(I, VF)) 8368 return false; 8369 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8370 }; 8371 8372 if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range)) 8373 return nullptr; 8374 8375 VPValue *Mask = nullptr; 8376 if (Legal->isMaskRequired(I)) 8377 Mask = getBlockInMask(I->getParent()); 8378 8379 // Determine if the pointer operand of the access is either consecutive or 8380 // reverse consecutive. 8381 LoopVectorizationCostModel::InstWidening Decision = 8382 CM.getWideningDecision(I, Range.Start); 8383 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8384 bool Consecutive = 8385 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8386 8387 VPValue *Ptr = isa<LoadInst>(I) ? 
Operands[0] : Operands[1];
  if (Consecutive) {
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Ptr->getUnderlyingValue()->stripPointerCasts());
    VPSingleDefRecipe *VectorPtr;
    if (Reverse) {
      // When folding the tail, we may compute an address that we don't in the
      // original scalar loop and it may not be inbounds. Drop Inbounds in that
      // case.
      GEPNoWrapFlags Flags =
          (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
              ? GEPNoWrapFlags::none()
              : GEPNoWrapFlags::inBounds();
      VectorPtr = new VPReverseVectorPointerRecipe(
          Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
    } else {
      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                            GEP ? GEP->getNoWrapFlags()
                                                : GEPNoWrapFlags::none(),
                                            I->getDebugLoc());
    }
    Builder.getInsertBlock()->appendRecipe(VectorPtr);
    Ptr = VectorPtr;
  }
  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                 I->getDebugLoc());

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
                                Reverse, I->getDebugLoc());
}

/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe.
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
                            VPValue *Start, const InductionDescriptor &IndDesc,
                            VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
  assert(IndDesc.getStartValue() ==
         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
         "step must be loop invariant");

  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
  if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
                                             IndDesc, TruncI,
                                             TruncI->getDebugLoc());
  }
  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
                                           IndDesc, Phi->getDebugLoc());
}

VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {

  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
    return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
                                       *PSE.getSE(), *OrigLoop);

  // Check if this is a pointer induction. If so, build the recipe for it.
8453 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8454 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8455 *PSE.getSE()); 8456 return new VPWidenPointerInductionRecipe( 8457 Phi, Operands[0], Step, *II, 8458 LoopVectorizationPlanner::getDecisionAndClampRange( 8459 [&](ElementCount VF) { 8460 return CM.isScalarAfterVectorization(Phi, VF); 8461 }, 8462 Range), 8463 Phi->getDebugLoc()); 8464 } 8465 return nullptr; 8466 } 8467 8468 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8469 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { 8470 // Optimize the special case where the source is a constant integer 8471 // induction variable. Notice that we can only optimize the 'trunc' case 8472 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8473 // (c) other casts depend on pointer size. 8474 8475 // Determine whether \p K is a truncation based on an induction variable that 8476 // can be optimized. 8477 auto IsOptimizableIVTruncate = 8478 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8479 return [=](ElementCount VF) -> bool { 8480 return CM.isOptimizableIVTruncate(K, VF); 8481 }; 8482 }; 8483 8484 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8485 IsOptimizableIVTruncate(I), Range)) { 8486 8487 auto *Phi = cast<PHINode>(I->getOperand(0)); 8488 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8489 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue()); 8490 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), 8491 *OrigLoop); 8492 } 8493 return nullptr; 8494 } 8495 8496 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, 8497 ArrayRef<VPValue *> Operands) { 8498 unsigned NumIncoming = Phi->getNumIncomingValues(); 8499 8500 // We know that all PHIs in non-header blocks are converted into selects, so 8501 // we don't have to worry about the insertion order and we can just use the 8502 // builder. At this point we generate the predication tree. There may be 8503 // duplications since this is a simple recursive scan, but future 8504 // optimizations will clean it up. 
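  // For example (illustrative), a phi in block BB3 such as
  //   %p = phi [ %a, %BB1 ], [ %b, %BB2 ]
  // becomes a VPBlendRecipe with operands (%a, mask(BB1->BB3), %b,
  // mask(BB2->BB3)), which is later lowered to a chain of selects.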
  SmallVector<VPValue *, 2> OperandsWithMask;

  for (unsigned In = 0; In < NumIncoming; In++) {
    OperandsWithMask.push_back(Operands[In]);
    VPValue *EdgeMask =
        getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
    if (!EdgeMask) {
      assert(In == 0 && "Both null and non-null edge masks found");
      assert(all_equal(Operands) &&
             "Distinct incoming values with one having a full mask");
      break;
    }
    OperandsWithMask.push_back(EdgeMask);
  }
  return new VPBlendRecipe(Phi, OperandsWithMask);
}

VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) {
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));

  // Is it beneficial to perform an intrinsic call compared to a lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
                                      CI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for the vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      // 1) The block needs to be predicated, either due to a conditional
      //    in the scalar loop or use of an active lane mask with
      //    tail-folding, and we use the appropriate mask for the block.
8592 // 2) No mask is required for the block, but the only available 8593 // vector variant at this VF requires a mask, so we synthesize an 8594 // all-true mask. 8595 VPValue *Mask = nullptr; 8596 if (Legal->isMaskRequired(CI)) 8597 Mask = getBlockInMask(CI->getParent()); 8598 else 8599 Mask = Plan.getOrAddLiveIn( 8600 ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext()))); 8601 8602 Ops.insert(Ops.begin() + *MaskPos, Mask); 8603 } 8604 8605 Ops.push_back(Operands.back()); 8606 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc()); 8607 } 8608 8609 return nullptr; 8610 } 8611 8612 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8613 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8614 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8615 // Instruction should be widened, unless it is scalar after vectorization, 8616 // scalarization is profitable or it is predicated. 8617 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8618 return CM.isScalarAfterVectorization(I, VF) || 8619 CM.isProfitableToScalarize(I, VF) || 8620 CM.isScalarWithPredication(I, VF); 8621 }; 8622 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8623 Range); 8624 } 8625 8626 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8627 ArrayRef<VPValue *> Operands, 8628 VPBasicBlock *VPBB) { 8629 switch (I->getOpcode()) { 8630 default: 8631 return nullptr; 8632 case Instruction::SDiv: 8633 case Instruction::UDiv: 8634 case Instruction::SRem: 8635 case Instruction::URem: { 8636 // If not provably safe, use a select to form a safe divisor before widening the 8637 // div/rem operation itself. Otherwise fall through to general handling below. 8638 if (CM.isPredicatedInst(I)) { 8639 SmallVector<VPValue *> Ops(Operands); 8640 VPValue *Mask = getBlockInMask(I->getParent()); 8641 VPValue *One = 8642 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); 8643 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); 8644 Ops[1] = SafeRHS; 8645 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); 8646 } 8647 [[fallthrough]]; 8648 } 8649 case Instruction::Add: 8650 case Instruction::And: 8651 case Instruction::AShr: 8652 case Instruction::FAdd: 8653 case Instruction::FCmp: 8654 case Instruction::FDiv: 8655 case Instruction::FMul: 8656 case Instruction::FNeg: 8657 case Instruction::FRem: 8658 case Instruction::FSub: 8659 case Instruction::ICmp: 8660 case Instruction::LShr: 8661 case Instruction::Mul: 8662 case Instruction::Or: 8663 case Instruction::Select: 8664 case Instruction::Shl: 8665 case Instruction::Sub: 8666 case Instruction::Xor: 8667 case Instruction::Freeze: 8668 SmallVector<VPValue *> NewOps(Operands); 8669 if (Instruction::isBinaryOp(I->getOpcode())) { 8670 // The legacy cost model uses SCEV to check if some of the operands are 8671 // constants. To match the legacy cost model's behavior, use SCEV to try 8672 // to replace operands with constants. 8673 ScalarEvolution &SE = *PSE.getSE(); 8674 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) { 8675 Value *V = Op->getUnderlyingValue(); 8676 if (isa<Constant>(V) || !SE.isSCEVable(V->getType())) 8677 return Op; 8678 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V)); 8679 if (!C) 8680 return Op; 8681 return Plan.getOrAddLiveIn(C->getValue()); 8682 }; 8683 // For Mul, the legacy cost model checks both operands. 
      if (I->getOpcode() == Instruction::Mul)
        NewOps[0] = GetConstantViaSCEV(NewOps[0]);
      // For other binops, the legacy cost model only checks the second operand.
      NewOps[1] = GetConstantViaSCEV(NewOps[1]);
    }
    return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
  };
}

VPHistogramRecipe *
VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
                                     ArrayRef<VPValue *> Operands) {
  // FIXME: Support other operations.
  unsigned Opcode = HI->Update->getOpcode();
  assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
         "Histogram update operation must be an Add or Sub");

  SmallVector<VPValue *, 3> HGramOps;
  // Bucket address.
  HGramOps.push_back(Operands[1]);
  // Increment value.
  HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));

  // In case of predicated execution (due to tail-folding, or conditional
  // execution, or both), pass the relevant mask.
  if (Legal->isMaskRequired(HI->Store))
    HGramOps.push_back(getBlockInMask(HI->Store->getParent()));

  return new VPHistogramRecipe(Opcode,
                               make_range(HGramOps.begin(), HGramOps.end()),
                               HI->Store->getDebugLoc());
}

void VPRecipeBuilder::fixHeaderPhis() {
  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
  for (VPHeaderPHIRecipe *R : PhisToFix) {
    auto *PN = cast<PHINode>(R->getUnderlyingValue());
    VPRecipeBase *IncR =
        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
    R->addOperand(IncR->getVPSingleValue());
  }
}

VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
                                                      VFRange &Range) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors, if one of the operands is variant then we still
      // want to mark the call as uniform, which will generate one instruction
      // for just the first lane of the vector. We can't scalarize the call in
      // the same way as for fixed-width vectors because we don't know how many
      // lanes there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      // 1. For the assume intrinsic, generating the instruction for the first
      //    lane is still better than not generating any at all. For example,
      //    the input may be a splat across all lanes.
      // 2. For the lifetime start/end intrinsics, the pointer operand only
      //    does anything useful when the input comes from a stack object,
      //    which suggests it should always be uniform. For non-stack objects
      //    the effect is to poison the object, which still allows us to
      //    remove the call.
8759 IsUniform = true; 8760 break; 8761 default: 8762 break; 8763 } 8764 } 8765 VPValue *BlockInMask = nullptr; 8766 if (!IsPredicated) { 8767 // Finalize the recipe for Instr, first if it is not predicated. 8768 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8769 } else { 8770 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8771 // Instructions marked for predication are replicated and a mask operand is 8772 // added initially. Masked replicate recipes will later be placed under an 8773 // if-then construct to prevent side-effects. Generate recipes to compute 8774 // the block mask for this region. 8775 BlockInMask = getBlockInMask(I->getParent()); 8776 } 8777 8778 // Note that there is some custom logic to mark some intrinsics as uniform 8779 // manually above for scalable vectors, which this assert needs to account for 8780 // as well. 8781 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || 8782 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && 8783 "Should not predicate a uniform recipe"); 8784 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()), 8785 IsUniform, BlockInMask); 8786 return Recipe; 8787 } 8788 8789 /// Find all possible partial reductions in the loop and track all of those that 8790 /// are valid so recipes can be formed later. 8791 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { 8792 // Find all possible partial reductions. 8793 SmallVector<std::pair<PartialReductionChain, unsigned>, 1> 8794 PartialReductionChains; 8795 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) 8796 if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair = 8797 getScaledReduction(Phi, RdxDesc, Range)) 8798 PartialReductionChains.push_back(*Pair); 8799 8800 // A partial reduction is invalid if any of its extends are used by 8801 // something that isn't another partial reduction. This is because the 8802 // extends are intended to be lowered along with the reduction itself. 8803 8804 // Build up a set of partial reduction bin ops for efficient use checking. 8805 SmallSet<User *, 4> PartialReductionBinOps; 8806 for (const auto &[PartialRdx, _] : PartialReductionChains) 8807 PartialReductionBinOps.insert(PartialRdx.BinOp); 8808 8809 auto ExtendIsOnlyUsedByPartialReductions = 8810 [&PartialReductionBinOps](Instruction *Extend) { 8811 return all_of(Extend->users(), [&](const User *U) { 8812 return PartialReductionBinOps.contains(U); 8813 }); 8814 }; 8815 8816 // Check if each use of a chain's two extends is a partial reduction 8817 // and only add those that don't have non-partial reduction users. 8818 for (auto Pair : PartialReductionChains) { 8819 PartialReductionChain Chain = Pair.first; 8820 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && 8821 ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) 8822 ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair)); 8823 } 8824 } 8825 8826 std::optional<std::pair<PartialReductionChain, unsigned>> 8827 VPRecipeBuilder::getScaledReduction(PHINode *PHI, 8828 const RecurrenceDescriptor &Rdx, 8829 VFRange &Range) { 8830 // TODO: Allow scaling reductions when predicating. The select at 8831 // the end of the loop chooses between the phi value and most recent 8832 // reduction result, both of which have different VFs to the active lane 8833 // mask when scaling. 
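  // The shape being matched here is roughly (illustrative):
  //   %phi   = phi i32 [ 0, %ph ], [ %add, %loop ]
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %add   = add i32 %phi, %mul
  // with a scale factor of 32 / 8 = 4 between the accumulator and the inputs.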
8834 if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent())) 8835 return std::nullopt; 8836 8837 auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr()); 8838 if (!Update) 8839 return std::nullopt; 8840 8841 Value *Op = Update->getOperand(0); 8842 Value *PhiOp = Update->getOperand(1); 8843 if (Op == PHI) { 8844 Op = Update->getOperand(1); 8845 PhiOp = Update->getOperand(0); 8846 } 8847 if (PhiOp != PHI) 8848 return std::nullopt; 8849 8850 auto *BinOp = dyn_cast<BinaryOperator>(Op); 8851 if (!BinOp || !BinOp->hasOneUse()) 8852 return std::nullopt; 8853 8854 using namespace llvm::PatternMatch; 8855 Value *A, *B; 8856 if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || 8857 !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) 8858 return std::nullopt; 8859 8860 Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0)); 8861 Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1)); 8862 8863 TTI::PartialReductionExtendKind OpAExtend = 8864 TargetTransformInfo::getPartialReductionExtendKind(ExtA); 8865 TTI::PartialReductionExtendKind OpBExtend = 8866 TargetTransformInfo::getPartialReductionExtendKind(ExtB); 8867 8868 PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp); 8869 8870 unsigned TargetScaleFactor = 8871 PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( 8872 A->getType()->getPrimitiveSizeInBits()); 8873 8874 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8875 [&](ElementCount VF) { 8876 InstructionCost Cost = TTI->getPartialReductionCost( 8877 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(), 8878 VF, OpAExtend, OpBExtend, 8879 std::make_optional(BinOp->getOpcode())); 8880 return Cost.isValid(); 8881 }, 8882 Range)) 8883 return std::make_pair(Chain, TargetScaleFactor); 8884 8885 return std::nullopt; 8886 } 8887 8888 VPRecipeBase * 8889 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8890 ArrayRef<VPValue *> Operands, 8891 VFRange &Range, VPBasicBlock *VPBB) { 8892 // First, check for specific widening recipes that deal with inductions, Phi 8893 // nodes, calls and memory operations. 8894 VPRecipeBase *Recipe; 8895 if (auto *Phi = dyn_cast<PHINode>(Instr)) { 8896 if (Phi->getParent() != OrigLoop->getHeader()) 8897 return tryToBlend(Phi, Operands); 8898 8899 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8900 return Recipe; 8901 8902 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8903 assert((Legal->isReductionVariable(Phi) || 8904 Legal->isFixedOrderRecurrence(Phi)) && 8905 "can only widen reductions and fixed-order recurrences here"); 8906 VPValue *StartV = Operands[0]; 8907 if (Legal->isReductionVariable(Phi)) { 8908 const RecurrenceDescriptor &RdxDesc = 8909 Legal->getReductionVars().find(Phi)->second; 8910 assert(RdxDesc.getRecurrenceStartValue() == 8911 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8912 8913 // If the PHI is used by a partial reduction, set the scale factor. 8914 std::optional<std::pair<PartialReductionChain, unsigned>> Pair = 8915 getScaledReductionForInstr(RdxDesc.getLoopExitInstr()); 8916 unsigned ScaleFactor = Pair ? Pair->second : 1; 8917 PhiRecipe = new VPReductionPHIRecipe( 8918 Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), 8919 CM.useOrderedReductions(RdxDesc), ScaleFactor); 8920 } else { 8921 // TODO: Currently fixed-order recurrences are modeled as chains of 8922 // first-order recurrences. 
If there are no users of the intermediate 8923 // recurrences in the chain, the fixed order recurrence should be modeled 8924 // directly, enabling more efficient codegen. 8925 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8926 } 8927 8928 PhisToFix.push_back(PhiRecipe); 8929 return PhiRecipe; 8930 } 8931 8932 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( 8933 cast<TruncInst>(Instr), Operands, Range))) 8934 return Recipe; 8935 8936 // All widen recipes below deal only with VF > 1. 8937 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8938 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8939 return nullptr; 8940 8941 if (auto *CI = dyn_cast<CallInst>(Instr)) 8942 return tryToWidenCall(CI, Operands, Range); 8943 8944 if (StoreInst *SI = dyn_cast<StoreInst>(Instr)) 8945 if (auto HistInfo = Legal->getHistogramInfo(SI)) 8946 return tryToWidenHistogram(*HistInfo, Operands); 8947 8948 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8949 return tryToWidenMemory(Instr, Operands, Range); 8950 8951 if (getScaledReductionForInstr(Instr)) 8952 return tryToCreatePartialReduction(Instr, Operands); 8953 8954 if (!shouldWiden(Instr, Range)) 8955 return nullptr; 8956 8957 if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr)) 8958 return new VPWidenGEPRecipe(GEP, 8959 make_range(Operands.begin(), Operands.end())); 8960 8961 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8962 return new VPWidenSelectRecipe( 8963 *SI, make_range(Operands.begin(), Operands.end())); 8964 } 8965 8966 if (auto *CI = dyn_cast<CastInst>(Instr)) { 8967 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), 8968 *CI); 8969 } 8970 8971 return tryToWiden(Instr, Operands, VPBB); 8972 } 8973 8974 VPRecipeBase * 8975 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, 8976 ArrayRef<VPValue *> Operands) { 8977 assert(Operands.size() == 2 && 8978 "Unexpected number of operands for partial reduction"); 8979 8980 VPValue *BinOp = Operands[0]; 8981 VPValue *Phi = Operands[1]; 8982 if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe())) 8983 std::swap(BinOp, Phi); 8984 8985 return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, 8986 Reduction); 8987 } 8988 8989 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8990 ElementCount MaxVF) { 8991 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8992 8993 auto MaxVFTimes2 = MaxVF * 2; 8994 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8995 VFRange SubRange = {VF, MaxVFTimes2}; 8996 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { 8997 // Now optimize the initial VPlan. 8998 if (!Plan->hasVF(ElementCount::getFixed(1))) 8999 VPlanTransforms::truncateToMinimalBitwidths(*Plan, 9000 CM.getMinimalBitwidths()); 9001 VPlanTransforms::optimize(*Plan); 9002 // TODO: try to put it close to addActiveLaneMask(). 9003 // Discard the plan if it is not EVL-compatible 9004 if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength( 9005 *Plan, CM.getMaxSafeElements())) 9006 break; 9007 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 9008 VPlans.push_back(std::move(Plan)); 9009 } 9010 VF = SubRange.End; 9011 } 9012 } 9013 9014 // Add the necessary canonical IV and branch recipes required to control the 9015 // loop. 
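// The resulting control recipes are roughly (illustrative; nuw is only set
// when HasNUW is true):
//   vector.body:
//     EMIT %index = CANONICAL-INDUCTION ir<0>, %index.next
//     ...
//     EMIT %index.next = add nuw %index, %VFxUF
//     EMIT branch-on-count %index.next, %vector.trip.count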
9016 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, 9017 DebugLoc DL) { 9018 Value *StartIdx = ConstantInt::get(IdxTy, 0); 9019 auto *StartV = Plan.getOrAddLiveIn(StartIdx); 9020 9021 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 9022 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 9023 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 9024 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 9025 Header->insert(CanonicalIVPHI, Header->begin()); 9026 9027 VPBuilder Builder(TopRegion->getExitingBasicBlock()); 9028 // Add a VPInstruction to increment the scalar canonical IV by VF * UF. 9029 auto *CanonicalIVIncrement = Builder.createOverflowingOp( 9030 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, 9031 "index.next"); 9032 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 9033 9034 // Add the BranchOnCount VPInstruction to the latch. 9035 Builder.createNaryOp(VPInstruction::BranchOnCount, 9036 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 9037 } 9038 9039 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the 9040 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute 9041 /// the end value of the induction. 9042 static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, 9043 VPBuilder &VectorPHBuilder, 9044 VPBuilder &ScalarPHBuilder, 9045 VPTypeAnalysis &TypeInfo, 9046 VPValue *VectorTC) { 9047 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV); 9048 // Truncated wide inductions resume from the last lane of their vector value 9049 // in the last vector iteration which is handled elsewhere. 9050 if (WideIntOrFp && WideIntOrFp->getTruncInst()) 9051 return nullptr; 9052 9053 VPValue *Start = WideIV->getStartValue(); 9054 VPValue *Step = WideIV->getStepValue(); 9055 const InductionDescriptor &ID = WideIV->getInductionDescriptor(); 9056 VPValue *EndValue = VectorTC; 9057 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) { 9058 EndValue = VectorPHBuilder.createDerivedIV( 9059 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), 9060 Start, VectorTC, Step); 9061 } 9062 9063 // EndValue is derived from the vector trip count (which has the same type as 9064 // the widest induction) and thus may be wider than the induction here. 9065 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV); 9066 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) { 9067 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue, 9068 ScalarTypeOfWideIV, 9069 WideIV->getDebugLoc()); 9070 } 9071 9072 auto *ResumePhiRecipe = 9073 ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start}, 9074 WideIV->getDebugLoc(), "bc.resume.val"); 9075 return ResumePhiRecipe; 9076 } 9077 9078 /// Create resume phis in the scalar preheader for first-order recurrences, 9079 /// reductions and inductions, and update the VPIRInstructions wrapping the 9080 /// original phis in the scalar header. 
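/// For a reduction phi, for example (illustrative), this creates
///   %bc.merge.rdx = resume-phi %rdx.from.middle.block, %rdx.start.value
/// in the scalar preheader and adds it as an operand of the VPIRInstruction
/// wrapping the original scalar header phi.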
9081 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { 9082 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); 9083 auto *ScalarPH = Plan.getScalarPreheader(); 9084 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor()); 9085 VPBuilder VectorPHBuilder( 9086 cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor())); 9087 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9088 VPBuilder ScalarPHBuilder(ScalarPH); 9089 VPValue *OneVPV = Plan.getOrAddLiveIn( 9090 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); 9091 for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) { 9092 auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR); 9093 auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction()); 9094 if (!ScalarPhiI) 9095 break; 9096 9097 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI)); 9098 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) { 9099 if (VPValue *ResumePhi = addResumePhiRecipeForInduction( 9100 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, 9101 &Plan.getVectorTripCount())) { 9102 ScalarPhiIRI->addOperand(ResumePhi); 9103 continue; 9104 } 9105 // TODO: Also handle truncated inductions here. Computing end-values 9106 // separately should be done as VPlan-to-VPlan optimization, after 9107 // legalizing all resume values to use the last lane from the loop. 9108 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() && 9109 "should only skip truncated wide inductions"); 9110 continue; 9111 } 9112 9113 // The backedge value provides the value to resume coming out of a loop, 9114 // which for FORs is a vector whose last element needs to be extracted. The 9115 // start value provides the value if the loop is bypassed. 9116 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR); 9117 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue(); 9118 if (IsFOR) 9119 ResumeFromVectorLoop = MiddleBuilder.createNaryOp( 9120 VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {}, 9121 "vector.recur.extract"); 9122 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx"; 9123 auto *ResumePhiR = ScalarPHBuilder.createNaryOp( 9124 VPInstruction::ResumePhi, 9125 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name); 9126 ScalarPhiIRI->addOperand(ResumePhiR); 9127 } 9128 } 9129 9130 /// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is 9131 /// either an untruncated wide induction, or if it increments a wide induction 9132 /// by its step. 9133 static bool isOptimizableIVOrUse(VPValue *VPV) { 9134 VPRecipeBase *Def = VPV->getDefiningRecipe(); 9135 if (!Def) 9136 return false; 9137 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def); 9138 if (WideIV) { 9139 // VPV itself is a wide induction, separately compute the end value for exit 9140 // users if it is not a truncated IV. 9141 return isa<VPWidenPointerInductionRecipe>(WideIV) || 9142 !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst(); 9143 } 9144 9145 // Check if VPV is an optimizable induction increment. 
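  // For instance (illustrative), for a wide induction %wide.iv with step
  // %step, an increment such as
  //   %iv.next = add %wide.iv, %step
  // (or the equivalent sub/fadd/fsub/getelementptr forms checked below) is
  // considered optimizable.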
9146 if (Def->getNumOperands() != 2) 9147 return false; 9148 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0)); 9149 if (!WideIV) 9150 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1)); 9151 if (!WideIV) 9152 return false; 9153 9154 using namespace VPlanPatternMatch; 9155 auto &ID = WideIV->getInductionDescriptor(); 9156 9157 // Check if VPV increments the induction by the induction step. 9158 VPValue *IVStep = WideIV->getStepValue(); 9159 switch (ID.getInductionOpcode()) { 9160 case Instruction::Add: 9161 return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV), 9162 m_Specific(IVStep))); 9163 case Instruction::FAdd: 9164 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV), 9165 m_Specific(IVStep))); 9166 case Instruction::FSub: 9167 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV), 9168 m_Specific(IVStep))); 9169 case Instruction::Sub: { 9170 // IVStep will be the negated step of the subtraction. Check if Step == -1 * 9171 // IVStep. 9172 VPValue *Step; 9173 if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) || 9174 !Step->isLiveIn() || !IVStep->isLiveIn()) 9175 return false; 9176 auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue()); 9177 auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue()); 9178 return StepCI && IVStepCI && 9179 StepCI->getValue() == (-1 * IVStepCI->getValue()); 9180 } 9181 default: 9182 return ID.getKind() == InductionDescriptor::IK_PtrInduction && 9183 match(VPV, m_GetElementPtr(m_Specific(WideIV), 9184 m_Specific(WideIV->getStepValue()))); 9185 } 9186 llvm_unreachable("should have been covered by switch above"); 9187 } 9188 9189 // Collect VPIRInstructions for phis in the exit blocks that are modeled 9190 // in VPlan and add the exiting VPValue as operand. Some exiting values are not 9191 // modeled explicitly yet and won't be included. Those are un-truncated 9192 // VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction 9193 // increments. 9194 static SetVector<VPIRInstruction *> 9195 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, 9196 VPlan &Plan) { 9197 auto *MiddleVPBB = Plan.getMiddleBlock(); 9198 SetVector<VPIRInstruction *> ExitUsersToFix; 9199 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { 9200 for (VPRecipeBase &R : *ExitVPBB) { 9201 auto *ExitIRI = dyn_cast<VPIRInstruction>(&R); 9202 if (!ExitIRI) 9203 continue; 9204 auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction()); 9205 if (!ExitPhi) 9206 break; 9207 for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) { 9208 BasicBlock *ExitingBB = OrigLoop->getLoopLatch(); 9209 if (PredVPBB != MiddleVPBB) { 9210 SmallVector<BasicBlock *> ExitingBlocks; 9211 OrigLoop->getExitingBlocks(ExitingBlocks); 9212 assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks"); 9213 ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1] 9214 : ExitingBlocks[0]; 9215 } 9216 Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); 9217 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); 9218 // Exit values for inductions are computed and updated outside of VPlan 9219 // and independent of induction recipes. 9220 // TODO: Compute induction exit values in VPlan. 9221 if (isOptimizableIVOrUse(V) && 9222 ExitVPBB->getSinglePredecessor() == MiddleVPBB) 9223 continue; 9224 ExitUsersToFix.insert(ExitIRI); 9225 ExitIRI->addOperand(V); 9226 } 9227 } 9228 } 9229 return ExitUsersToFix; 9230 } 9231 9232 // Add exit values to \p Plan. 
Extracts are added for each entry in \p 9233 // ExitUsersToFix if needed and their operands are updated. Returns true if all 9234 // exit users can be handled, otherwise return false. 9235 static bool 9236 addUsersInExitBlocks(VPlan &Plan, 9237 const SetVector<VPIRInstruction *> &ExitUsersToFix) { 9238 if (ExitUsersToFix.empty()) 9239 return true; 9240 9241 auto *MiddleVPBB = Plan.getMiddleBlock(); 9242 VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9243 9244 // Introduce extract for exiting values and update the VPIRInstructions 9245 // modeling the corresponding LCSSA phis. 9246 for (VPIRInstruction *ExitIRI : ExitUsersToFix) { 9247 for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) { 9248 // Pass live-in values used by exit phis directly through to their users 9249 // in the exit block. 9250 if (Op->isLiveIn()) 9251 continue; 9252 9253 // Currently only live-ins can be used by exit values from blocks not 9254 // exiting via the vector latch through to the middle block. 9255 if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB) 9256 return false; 9257 9258 LLVMContext &Ctx = ExitIRI->getInstruction().getContext(); 9259 VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd, 9260 {Op, Plan.getOrAddLiveIn(ConstantInt::get( 9261 IntegerType::get(Ctx, 32), 1))}); 9262 ExitIRI->setOperand(Idx, Ext); 9263 } 9264 } 9265 return true; 9266 } 9267 9268 /// Handle users in the exit block for first order reductions in the original 9269 /// exit block. The penultimate value of recurrences is fed to their LCSSA phi 9270 /// users in the original exit block using the VPIRInstruction wrapping to the 9271 /// LCSSA phi. 9272 static void addExitUsersForFirstOrderRecurrences( 9273 VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) { 9274 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); 9275 auto *ScalarPHVPBB = Plan.getScalarPreheader(); 9276 auto *MiddleVPBB = Plan.getMiddleBlock(); 9277 VPBuilder ScalarPHBuilder(ScalarPHVPBB); 9278 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9279 VPValue *TwoVPV = Plan.getOrAddLiveIn( 9280 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2)); 9281 9282 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) { 9283 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi); 9284 if (!FOR) 9285 continue; 9286 9287 // This is the second phase of vectorizing first-order recurrences, creating 9288 // extract for users outside the loop. An overview of the transformation is 9289 // described below. Suppose we have the following loop with some use after 9290 // the loop of the last a[i-1], 9291 // 9292 // for (int i = 0; i < n; ++i) { 9293 // t = a[i - 1]; 9294 // b[i] = a[i] - t; 9295 // } 9296 // use t; 9297 // 9298 // There is a first-order recurrence on "a". For this loop, the shorthand 9299 // scalar IR looks like: 9300 // 9301 // scalar.ph: 9302 // s.init = a[-1] 9303 // br scalar.body 9304 // 9305 // scalar.body: 9306 // i = phi [0, scalar.ph], [i+1, scalar.body] 9307 // s1 = phi [s.init, scalar.ph], [s2, scalar.body] 9308 // s2 = a[i] 9309 // b[i] = s2 - s1 9310 // br cond, scalar.body, exit.block 9311 // 9312 // exit.block: 9313 // use = lcssa.phi [s1, scalar.body] 9314 // 9315 // In this example, s1 is a recurrence because it's value depends on the 9316 // previous iteration. In the first phase of vectorization, we created a 9317 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts 9318 // for users in the scalar preheader and exit block. 
9319 // 9320 // vector.ph: 9321 // v_init = vector(..., ..., ..., a[-1]) 9322 // br vector.body 9323 // 9324 // vector.body 9325 // i = phi [0, vector.ph], [i+4, vector.body] 9326 // v1 = phi [v_init, vector.ph], [v2, vector.body] 9327 // v2 = a[i, i+1, i+2, i+3] 9328 // b[i] = v2 - v1 9329 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) 9330 // b[i, i+1, i+2, i+3] = v2 - v1 9331 // br cond, vector.body, middle.block 9332 // 9333 // middle.block: 9334 // vector.recur.extract.for.phi = v2(2) 9335 // vector.recur.extract = v2(3) 9336 // br cond, scalar.ph, exit.block 9337 // 9338 // scalar.ph: 9339 // scalar.recur.init = phi [vector.recur.extract, middle.block], 9340 // [s.init, otherwise] 9341 // br scalar.body 9342 // 9343 // scalar.body: 9344 // i = phi [0, scalar.ph], [i+1, scalar.body] 9345 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body] 9346 // s2 = a[i] 9347 // b[i] = s2 - s1 9348 // br cond, scalar.body, exit.block 9349 // 9350 // exit.block: 9351 // lo = lcssa.phi [s1, scalar.body], 9352 // [vector.recur.extract.for.phi, middle.block] 9353 // 9354 // Now update VPIRInstructions modeling LCSSA phis in the exit block. 9355 // Extract the penultimate value of the recurrence and use it as operand for 9356 // the VPIRInstruction modeling the phi. 9357 for (VPIRInstruction *ExitIRI : ExitUsersToFix) { 9358 if (ExitIRI->getOperand(0) != FOR) 9359 continue; 9360 VPValue *PenultimateElement = MiddleBuilder.createNaryOp( 9361 VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, 9362 "vector.recur.extract.for.phi"); 9363 ExitIRI->setOperand(0, PenultimateElement); 9364 ExitUsersToFix.remove(ExitIRI); 9365 } 9366 } 9367 } 9368 9369 VPlanPtr 9370 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { 9371 9372 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9373 9374 // --------------------------------------------------------------------------- 9375 // Build initial VPlan: Scan the body of the loop in a topological order to 9376 // visit each basic block after having visited its predecessor basic blocks. 9377 // --------------------------------------------------------------------------- 9378 9379 // Create initial VPlan skeleton, having a basic block for the pre-header 9380 // which contains SCEV expansions that need to happen before the CFG is 9381 // modified; a basic block for the vector pre-header, followed by a region for 9382 // the vector loop, followed by the middle basic block. The skeleton vector 9383 // loop region contains a header and latch basic blocks. 9384 9385 bool RequiresScalarEpilogueCheck = 9386 LoopVectorizationPlanner::getDecisionAndClampRange( 9387 [this](ElementCount VF) { 9388 return !CM.requiresScalarEpilogue(VF.isVector()); 9389 }, 9390 Range); 9391 VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), 9392 PSE, RequiresScalarEpilogueCheck, 9393 CM.foldTailByMasking(), OrigLoop); 9394 9395 // Don't use getDecisionAndClampRange here, because we don't know the UF 9396 // so this function is better to be conservative, rather than to split 9397 // it up into different VPlans. 9398 // TODO: Consider using getDecisionAndClampRange here to split up VPlans. 
  bool IVUpdateMayOverflow = false;
  for (ElementCount VF : Range)
    IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);

  DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
  TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
  // Use NUW for the induction increment if we proved that it won't overflow in
  // the vector loop or when not folding the tail. In the latter case, we know
  // that the canonical induction increment will not overflow as the vector trip
  // count is >= increment and a multiple of the increment.
  bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);

  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto ApplyIG = [IG, this](ElementCount VF) -> bool {
      bool Result = (VF.isVector() && // Query is illegal for VF == 1
                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
                         LoopVectorizationCostModel::CM_Interleave);
      // For scalable vectors, the only interleave factor currently supported
      // is 2 since we require the (de)interleave2 intrinsics instead of
      // shufflevectors.
      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
             "Unsupported interleave factor for scalable vectors");
      return Result;
    };
    if (!getDecisionAndClampRange(ApplyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
  }

  // ---------------------------------------------------------------------------
  // Construct recipes for the instructions in the loop
  // ---------------------------------------------------------------------------

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
  VPBasicBlock *VPBB = HeaderVPBB;
  BasicBlock *HeaderBB = OrigLoop->getHeader();
  bool NeedsMasks =
      CM.foldTailByMasking() ||
      any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
        bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
        return Legal->blockNeedsPredication(BB) || NeedsBlends;
      });

  RecipeBuilder.collectScaledReductions(Range);

  auto *MiddleVPBB = Plan->getMiddleBlock();
  VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
9467 if (VPBB != HeaderVPBB) 9468 VPBB->setName(BB->getName()); 9469 Builder.setInsertPoint(VPBB); 9470 9471 if (VPBB == HeaderVPBB) 9472 RecipeBuilder.createHeaderMask(); 9473 else if (NeedsMasks) 9474 RecipeBuilder.createBlockInMask(BB); 9475 9476 // Introduce each ingredient into VPlan. 9477 // TODO: Model and preserve debug intrinsics in VPlan. 9478 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { 9479 Instruction *Instr = &I; 9480 SmallVector<VPValue *, 4> Operands; 9481 auto *Phi = dyn_cast<PHINode>(Instr); 9482 if (Phi && Phi->getParent() == HeaderBB) { 9483 Operands.push_back(Plan->getOrAddLiveIn( 9484 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9485 } else { 9486 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands()); 9487 Operands = {OpRange.begin(), OpRange.end()}; 9488 } 9489 9490 // The stores with invariant address inside the loop will be deleted, and 9491 // in the exit block, a uniform store recipe will be created for the final 9492 // invariant store of the reduction. 9493 StoreInst *SI; 9494 if ((SI = dyn_cast<StoreInst>(&I)) && 9495 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { 9496 // Only create recipe for the final invariant store of the reduction. 9497 if (!Legal->isInvariantStoreOfReduction(SI)) 9498 continue; 9499 auto *Recipe = new VPReplicateRecipe( 9500 SI, RecipeBuilder.mapToVPValues(Instr->operands()), 9501 true /* IsUniform */); 9502 Recipe->insertBefore(*MiddleVPBB, MBIP); 9503 continue; 9504 } 9505 9506 VPRecipeBase *Recipe = 9507 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB); 9508 if (!Recipe) 9509 Recipe = RecipeBuilder.handleReplication(Instr, Range); 9510 9511 RecipeBuilder.setRecipe(Instr, Recipe); 9512 if (isa<VPHeaderPHIRecipe>(Recipe)) { 9513 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In 9514 // the following cases, VPHeaderPHIRecipes may be created after non-phi 9515 // recipes and need to be moved to the phi section of HeaderVPBB: 9516 // * tail-folding (non-phi recipes computing the header mask are 9517 // introduced earlier than regular header phi recipes, and should appear 9518 // after them) 9519 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 9520 9521 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || 9522 CM.foldTailByMasking() || isa<TruncInst>(Instr)) && 9523 "unexpected recipe needs moving"); 9524 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9525 } else 9526 VPBB->appendRecipe(Recipe); 9527 } 9528 9529 VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB); 9530 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9531 } 9532 9533 // After here, VPBB should not be used. 9534 VPBB = nullptr; 9535 9536 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 9537 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 9538 "entry block must be set to a VPRegionBlock having a non-empty entry " 9539 "VPBasicBlock"); 9540 RecipeBuilder.fixHeaderPhis(); 9541 9542 // Update wide induction increments to use the same step as the corresponding 9543 // wide induction. This enables detecting induction increments directly in 9544 // VPlan and removes redundant splats. 
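  // For example (illustrative): for a header phi %iv with latch increment
  //   %iv.next = add %iv, %inc
  // the recipe for %iv.next gets its second operand replaced with the step
  // VPValue of the corresponding VPWidenInductionRecipe.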
9545 for (const auto &[Phi, ID] : Legal->getInductionVars()) { 9546 auto *IVInc = cast<Instruction>( 9547 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 9548 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add) 9549 continue; 9550 VPWidenInductionRecipe *WideIV = 9551 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi)); 9552 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc); 9553 R->setOperand(1, WideIV->getStepValue()); 9554 } 9555 9556 if (auto *UncountableExitingBlock = 9557 Legal->getUncountableEarlyExitingBlock()) { 9558 VPlanTransforms::handleUncountableEarlyExit( 9559 *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); 9560 } 9561 addScalarResumePhis(RecipeBuilder, *Plan); 9562 SetVector<VPIRInstruction *> ExitUsersToFix = 9563 collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan); 9564 addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); 9565 if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) { 9566 reportVectorizationFailure( 9567 "Some exit values in loop with uncountable exit not supported yet", 9568 "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop); 9569 return nullptr; 9570 } 9571 9572 // --------------------------------------------------------------------------- 9573 // Transform initial VPlan: Apply previously taken decisions, in order, to 9574 // bring the VPlan to its final state. 9575 // --------------------------------------------------------------------------- 9576 9577 // Adjust the recipes for any inloop reductions. 9578 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); 9579 9580 // Interleave memory: for each Interleave Group we marked earlier as relevant 9581 // for this VPlan, replace the Recipes widening its memory instructions with a 9582 // single VPInterleaveRecipe at its insertion point. 9583 VPlanTransforms::createInterleaveGroups( 9584 *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); 9585 9586 for (ElementCount VF : Range) 9587 Plan->addVF(VF); 9588 Plan->setName("Initial VPlan"); 9589 9590 // Replace VPValues for known constant strides guaranteed by predicate scalar 9591 // evolution. 9592 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { 9593 auto *R = cast<VPRecipeBase>(&U); 9594 return R->getParent()->getParent() || 9595 R->getParent() == 9596 Plan->getVectorLoopRegion()->getSinglePredecessor(); 9597 }; 9598 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 9599 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 9600 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 9601 // Only handle constant strides for now. 9602 if (!ScevStride) 9603 continue; 9604 9605 auto *CI = Plan->getOrAddLiveIn( 9606 ConstantInt::get(Stride->getType(), ScevStride->getAPInt())); 9607 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV)) 9608 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); 9609 9610 // The versioned value may not be used in the loop directly but through a 9611 // sext/zext. Add new live-ins in those cases. 9612 for (Value *U : StrideV->users()) { 9613 if (!isa<SExtInst, ZExtInst>(U)) 9614 continue; 9615 VPValue *StrideVPV = Plan->getLiveIn(U); 9616 if (!StrideVPV) 9617 continue; 9618 unsigned BW = U->getType()->getScalarSizeInBits(); 9619 APInt C = isa<SExtInst>(U) ? 
ScevStride->getAPInt().sext(BW) 9620 : ScevStride->getAPInt().zext(BW); 9621 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C)); 9622 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); 9623 } 9624 } 9625 9626 VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) { 9627 return Legal->blockNeedsPredication(BB); 9628 }); 9629 9630 // Sink users of fixed-order recurrence past the recipe defining the previous 9631 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 9632 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) 9633 return nullptr; 9634 9635 if (useActiveLaneMask(Style)) { 9636 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once 9637 // TailFoldingStyle is visible there. 9638 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); 9639 bool WithoutRuntimeCheck = 9640 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 9641 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, 9642 WithoutRuntimeCheck); 9643 } 9644 9645 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 9646 return Plan; 9647 } 9648 9649 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9650 // Outer loop handling: They may require CFG and instruction level 9651 // transformations before even evaluating whether vectorization is profitable. 9652 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9653 // the vectorization pipeline. 9654 assert(!OrigLoop->isInnermost()); 9655 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9656 9657 // Create new empty VPlan 9658 auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE, 9659 true, false, OrigLoop); 9660 9661 // Build hierarchical CFG 9662 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9663 HCFGBuilder.buildHierarchicalCFG(); 9664 9665 for (ElementCount VF : Range) 9666 Plan->addVF(VF); 9667 9668 VPlanTransforms::VPInstructionsToVPRecipes( 9669 Plan, 9670 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9671 *PSE.getSE(), *TLI); 9672 9673 // Remove the existing terminator of the exiting block of the top-most region. 9674 // A BranchOnCount will be added instead when adding the canonical IV recipes. 9675 auto *Term = 9676 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); 9677 Term->eraseFromParent(); 9678 9679 // Tail folding is not supported for outer loops, so the induction increment 9680 // is guaranteed to not wrap. 9681 bool HasNUW = true; 9682 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, 9683 DebugLoc()); 9684 9685 // Collect mapping of IR header phis to header phi recipes, to be used in 9686 // addScalarResumePhis. 9687 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, 9688 Builder); 9689 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9690 if (isa<VPCanonicalIVPHIRecipe>(&R)) 9691 continue; 9692 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R); 9693 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR); 9694 } 9695 addScalarResumePhis(RecipeBuilder, *Plan); 9696 9697 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 9698 return Plan; 9699 } 9700 9701 // Adjust the recipes for reductions. For in-loop reductions the chain of 9702 // instructions leading from the loop exit instr to the phi need to be converted 9703 // to reductions, with one operand being vector and the other being the scalar 9704 // reduction chain. 
For other reductions, a select is introduced between the phi 9705 // and users outside the vector region when folding the tail. 9706 // 9707 // A ComputeReductionResult recipe is added to the middle block, also for 9708 // in-loop reductions which compute their result in-loop, because generating 9709 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 9710 // 9711 // Adjust AnyOf reductions; replace the reduction phi for the selected value 9712 // with a boolean reduction phi node to check if the condition is true in any 9713 // iteration. The final value is selected by the final ComputeReductionResult. 9714 void LoopVectorizationPlanner::adjustRecipesForReductions( 9715 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { 9716 using namespace VPlanPatternMatch; 9717 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); 9718 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); 9719 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock(); 9720 SmallVector<VPRecipeBase *> ToDelete; 9721 9722 for (VPRecipeBase &R : Header->phis()) { 9723 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9724 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) 9725 continue; 9726 9727 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9728 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9729 assert( 9730 !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 9731 !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) && 9732 "AnyOf and FindLast reductions are not allowed for in-loop reductions"); 9733 9734 // Collect the chain of "link" recipes for the reduction starting at PhiR. 9735 SetVector<VPSingleDefRecipe *> Worklist; 9736 Worklist.insert(PhiR); 9737 for (unsigned I = 0; I != Worklist.size(); ++I) { 9738 VPSingleDefRecipe *Cur = Worklist[I]; 9739 for (VPUser *U : Cur->users()) { 9740 auto *UserRecipe = cast<VPSingleDefRecipe>(U); 9741 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { 9742 assert((UserRecipe->getParent() == MiddleVPBB || 9743 UserRecipe->getParent() == Plan->getScalarPreheader()) && 9744 "U must be either in the loop region, the middle block or the " 9745 "scalar preheader."); 9746 continue; 9747 } 9748 Worklist.insert(UserRecipe); 9749 } 9750 } 9751 9752 // Visit operation "Links" along the reduction chain top-down starting from 9753 // the phi until LoopExitValue. We keep track of the previous item 9754 // (PreviousLink) to tell which of the two operands of a Link will remain 9755 // scalar and which will be reduced. For minmax by select(cmp), Link will be 9756 // the select instructions. Blend recipes of in-loop reduction phi's will 9757 // get folded to their non-phi operand, as the reduction recipe handles the 9758 // condition directly. 9759 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. 9760 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { 9761 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); 9762 9763 // Index of the first operand which holds a non-mask vector operand. 9764 unsigned IndexOfFirstOperand; 9765 // Recognize a call to the llvm.fmuladd intrinsic. 
9766 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9767 VPValue *VecOp; 9768 VPBasicBlock *LinkVPBB = CurrentLink->getParent(); 9769 if (IsFMulAdd) { 9770 assert( 9771 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && 9772 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9773 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || 9774 isa<VPWidenIntrinsicRecipe>(CurrentLink)) && 9775 CurrentLink->getOperand(2) == PreviousLink && 9776 "expected a call where the previous link is the added operand"); 9777 9778 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9779 // need to create an fmul recipe (multiplying the first two operands of 9780 // the fmuladd together) to use as the vector operand for the fadd 9781 // reduction. 9782 VPInstruction *FMulRecipe = new VPInstruction( 9783 Instruction::FMul, 9784 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, 9785 CurrentLinkI->getFastMathFlags()); 9786 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); 9787 VecOp = FMulRecipe; 9788 } else { 9789 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink); 9790 if (PhiR->isInLoop() && Blend) { 9791 assert(Blend->getNumIncomingValues() == 2 && 9792 "Blend must have 2 incoming values"); 9793 if (Blend->getIncomingValue(0) == PhiR) 9794 Blend->replaceAllUsesWith(Blend->getIncomingValue(1)); 9795 else { 9796 assert(Blend->getIncomingValue(1) == PhiR && 9797 "PhiR must be an operand of the blend"); 9798 Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); 9799 } 9800 continue; 9801 } 9802 9803 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9804 if (isa<VPWidenRecipe>(CurrentLink)) { 9805 assert(isa<CmpInst>(CurrentLinkI) && 9806 "need to have the compare of the select"); 9807 continue; 9808 } 9809 assert(isa<VPWidenSelectRecipe>(CurrentLink) && 9810 "must be a select recipe"); 9811 IndexOfFirstOperand = 1; 9812 } else { 9813 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && 9814 "Expected to replace a VPWidenSC"); 9815 IndexOfFirstOperand = 0; 9816 } 9817 // Note that for non-commutable operands (cmp-selects), the semantics of 9818 // the cmp-select are captured in the recurrence kind. 9819 unsigned VecOpId = 9820 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink 9821 ? IndexOfFirstOperand + 1 9822 : IndexOfFirstOperand; 9823 VecOp = CurrentLink->getOperand(VecOpId); 9824 assert(VecOp != PreviousLink && 9825 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - 9826 (VecOpId - IndexOfFirstOperand)) == 9827 PreviousLink && 9828 "PreviousLink must be the operand other than VecOp"); 9829 } 9830 9831 BasicBlock *BB = CurrentLinkI->getParent(); 9832 VPValue *CondOp = nullptr; 9833 if (CM.blockNeedsPredicationForAnyReason(BB)) 9834 CondOp = RecipeBuilder.getBlockInMask(BB); 9835 9836 auto *RedRecipe = new VPReductionRecipe( 9837 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp, 9838 CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc()); 9839 // Append the recipe to the end of the VPBasicBlock because we need to 9840 // ensure that it comes after all of it's inputs, including CondOp. 9841 // Delete CurrentLink as it will be invalid if its operand is replaced 9842 // with a reduction defined at the bottom of the block in the next link. 
9843 LinkVPBB->appendRecipe(RedRecipe); 9844 CurrentLink->replaceAllUsesWith(RedRecipe); 9845 ToDelete.push_back(CurrentLink); 9846 PreviousLink = RedRecipe; 9847 } 9848 } 9849 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); 9850 Builder.setInsertPoint(&*LatchVPBB->begin()); 9851 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); 9852 for (VPRecipeBase &R : 9853 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9854 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9855 if (!PhiR) 9856 continue; 9857 9858 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9859 // If tail is folded by masking, introduce selects between the phi 9860 // and the users outside the vector region of each reduction, at the 9861 // beginning of the dedicated latch block. 9862 auto *OrigExitingVPV = PhiR->getBackedgeValue(); 9863 auto *NewExitingVPV = PhiR->getBackedgeValue(); 9864 if (!PhiR->isInLoop() && CM.foldTailByMasking()) { 9865 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader()); 9866 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && 9867 "reduction recipe must be defined before latch"); 9868 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); 9869 std::optional<FastMathFlags> FMFs = 9870 PhiTy->isFloatingPointTy() 9871 ? std::make_optional(RdxDesc.getFastMathFlags()) 9872 : std::nullopt; 9873 NewExitingVPV = 9874 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); 9875 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { 9876 return isa<VPInstruction>(&U) && 9877 cast<VPInstruction>(&U)->getOpcode() == 9878 VPInstruction::ComputeReductionResult; 9879 }); 9880 if (CM.usePredicatedReductionSelect( 9881 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy)) 9882 PhiR->setOperand(1, NewExitingVPV); 9883 } 9884 9885 // If the vector reduction can be performed in a smaller type, we truncate 9886 // then extend the loop exit value to enable InstCombine to evaluate the 9887 // entire expression in the smaller type. 9888 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); 9889 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() && 9890 !RecurrenceDescriptor::isAnyOfRecurrenceKind( 9891 RdxDesc.getRecurrenceKind())) { 9892 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 9893 Type *RdxTy = RdxDesc.getRecurrenceType(); 9894 auto *Trunc = 9895 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); 9896 auto *Extnd = 9897 RdxDesc.isSigned() 9898 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) 9899 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); 9900 9901 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); 9902 Extnd->insertAfter(Trunc); 9903 if (PhiR->getOperand(1) == NewExitingVPV) 9904 PhiR->setOperand(1, Extnd->getVPSingleValue()); 9905 NewExitingVPV = Extnd; 9906 } 9907 9908 // We want code in the middle block to appear to execute on the location of 9909 // the scalar loop's latch terminator because: (a) it is all compiler 9910 // generated, (b) these instructions are always executed after evaluating 9911 // the latch conditional branch, and (c) other passes may add new 9912 // predecessors which terminate on this line. This is the easiest way to 9913 // ensure we don't accidentally cause an extra step back into the loop while 9914 // debugging. 
9915 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); 9916 9917 // TODO: At the moment ComputeReductionResult also drives creation of the 9918 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here 9919 // even for in-loop reductions, until the reduction resume value handling is 9920 // also modeled in VPlan. 9921 auto *FinalReductionResult = new VPInstruction( 9922 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); 9923 // Update all users outside the vector region. 9924 OrigExitingVPV->replaceUsesWithIf( 9925 FinalReductionResult, [](VPUser &User, unsigned) { 9926 auto *Parent = cast<VPRecipeBase>(&User)->getParent(); 9927 return Parent && !Parent->getParent(); 9928 }); 9929 FinalReductionResult->insertBefore(*MiddleVPBB, IP); 9930 9931 // Adjust AnyOf reductions; replace the reduction phi for the selected value 9932 // with a boolean reduction phi node to check if the condition is true in 9933 // any iteration. The final value is selected by the final 9934 // ComputeReductionResult. 9935 if (RecurrenceDescriptor::isAnyOfRecurrenceKind( 9936 RdxDesc.getRecurrenceKind())) { 9937 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) { 9938 return isa<VPWidenSelectRecipe>(U) || 9939 (isa<VPReplicateRecipe>(U) && 9940 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() == 9941 Instruction::Select); 9942 })); 9943 VPValue *Cmp = Select->getOperand(0); 9944 // If the compare is checking the reduction PHI node, adjust it to check 9945 // the start value. 9946 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) { 9947 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I) 9948 if (CmpR->getOperand(I) == PhiR) 9949 CmpR->setOperand(I, PhiR->getStartValue()); 9950 } 9951 VPBuilder::InsertPointGuard Guard(Builder); 9952 Builder.setInsertPoint(Select); 9953 9954 // If the true value of the select is the reduction phi, the new value is 9955 // selected if the negated condition is true in any iteration. 9956 if (Select->getOperand(1) == PhiR) 9957 Cmp = Builder.createNot(Cmp); 9958 VPValue *Or = Builder.createOr(PhiR, Cmp); 9959 Select->getVPSingleValue()->replaceAllUsesWith(Or); 9960 // Delete Select now that it has invalid types. 9961 ToDelete.push_back(Select); 9962 9963 // Convert the reduction phi to operate on bools. 9964 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( 9965 OrigLoop->getHeader()->getContext()))); 9966 continue; 9967 } 9968 9969 if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( 9970 RdxDesc.getRecurrenceKind())) { 9971 // Adjust the start value for FindLastIV recurrences to use the sentinel 9972 // value after generating the ResumePhi recipe, which uses the original 9973 // start value. 9974 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue())); 9975 } 9976 } 9977 9978 VPlanTransforms::clearReductionWrapFlags(*Plan); 9979 for (VPRecipeBase *R : ToDelete) 9980 R->eraseFromParent(); 9981 } 9982 9983 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9984 assert(!State.Lane && "VPDerivedIVRecipe being replicated."); 9985 9986 // Fast-math-flags propagate from the original induction instruction. 
9987 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9988 if (FPBinOp) 9989 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); 9990 9991 Value *Step = State.get(getStepValue(), VPLane(0)); 9992 Value *Index = State.get(getOperand(1), VPLane(0)); 9993 Value *DerivedIV = emitTransformedIndex( 9994 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind, 9995 cast_if_present<BinaryOperator>(FPBinOp)); 9996 DerivedIV->setName(Name); 9997 // If index is the vector trip count, the concrete value will only be set in 9998 // prepareToExecute, leading to missed simplifications, e.g. if it is 0. 9999 // TODO: Remove the special case for the vector trip count once it is computed 10000 // in VPlan and can be used during VPlan simplification. 10001 assert((DerivedIV != Index || 10002 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && 10003 "IV didn't need transforming?"); 10004 State.set(this, DerivedIV, VPLane(0)); 10005 } 10006 10007 void VPReplicateRecipe::execute(VPTransformState &State) { 10008 Instruction *UI = getUnderlyingInstr(); 10009 if (State.Lane) { // Generate a single instance. 10010 assert((State.VF.isScalar() || !isUniform()) && 10011 "uniform recipe shouldn't be predicated"); 10012 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 10013 State.ILV->scalarizeInstruction(UI, this, *State.Lane, State); 10014 // Insert scalar instance packing it into a vector. 10015 if (State.VF.isVector() && shouldPack()) { 10016 // If we're constructing lane 0, initialize to start from poison. 10017 if (State.Lane->isFirstLane()) { 10018 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 10019 Value *Poison = PoisonValue::get( 10020 VectorType::get(UI->getType(), State.VF)); 10021 State.set(this, Poison); 10022 } 10023 State.packScalarIntoVectorValue(this, *State.Lane); 10024 } 10025 return; 10026 } 10027 10028 if (IsUniform) { 10029 // Uniform within VL means we need to generate lane 0. 10030 State.ILV->scalarizeInstruction(UI, this, VPLane(0), State); 10031 return; 10032 } 10033 10034 // A store of a loop varying value to a uniform address only needs the last 10035 // copy of the store. 10036 if (isa<StoreInst>(UI) && 10037 vputils::isUniformAfterVectorization(getOperand(1))) { 10038 auto Lane = VPLane::getLastLaneForVF(State.VF); 10039 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State); 10040 return; 10041 } 10042 10043 // Generate scalar instances for all VF lanes. 10044 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 10045 const unsigned EndLane = State.VF.getKnownMinValue(); 10046 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 10047 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State); 10048 } 10049 10050 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10051 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10052 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10053 // for predication. 10054 static ScalarEpilogueLowering getScalarEpilogueLowering( 10055 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10056 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10057 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 10058 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10059 // don't look at hints or options, and don't request a scalar epilogue. 
10060 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10061 // LoopAccessInfo (due to code dependency and not being able to reliably get 10062 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10063 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10064 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10065 // back to the old way and vectorize with versioning when forced. See D81345.) 10066 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10067 PGSOQueryType::IRPass) && 10068 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10069 return CM_ScalarEpilogueNotAllowedOptSize; 10070 10071 // 2) If set, obey the directives 10072 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10073 switch (PreferPredicateOverEpilogue) { 10074 case PreferPredicateTy::ScalarEpilogue: 10075 return CM_ScalarEpilogueAllowed; 10076 case PreferPredicateTy::PredicateElseScalarEpilogue: 10077 return CM_ScalarEpilogueNotNeededUsePredicate; 10078 case PreferPredicateTy::PredicateOrDontVectorize: 10079 return CM_ScalarEpilogueNotAllowedUsePredicate; 10080 }; 10081 } 10082 10083 // 3) If set, obey the hints 10084 switch (Hints.getPredicate()) { 10085 case LoopVectorizeHints::FK_Enabled: 10086 return CM_ScalarEpilogueNotNeededUsePredicate; 10087 case LoopVectorizeHints::FK_Disabled: 10088 return CM_ScalarEpilogueAllowed; 10089 }; 10090 10091 // 4) if the TTI hook indicates this is profitable, request predication. 10092 TailFoldingInfo TFI(TLI, &LVL, IAI); 10093 if (TTI->preferPredicateOverEpilogue(&TFI)) 10094 return CM_ScalarEpilogueNotNeededUsePredicate; 10095 10096 return CM_ScalarEpilogueAllowed; 10097 } 10098 10099 // Process the loop in the VPlan-native vectorization path. This path builds 10100 // VPlan upfront in the vectorization pipeline, which allows to apply 10101 // VPlan-to-VPlan transformations from the very beginning without modifying the 10102 // input LLVM IR. 10103 static bool processLoopInVPlanNativePath( 10104 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10105 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10106 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10107 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10108 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10109 LoopVectorizationRequirements &Requirements) { 10110 10111 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10112 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10113 return false; 10114 } 10115 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10116 Function *F = L->getHeader()->getParent(); 10117 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10118 10119 ScalarEpilogueLowering SEL = 10120 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 10121 10122 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10123 &Hints, IAI); 10124 // Use the planner for outer loop vectorization. 10125 // TODO: CM is not used at this point inside the planner. Turn CM into an 10126 // optional argument if we don't need it in the future. 10127 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, 10128 ORE); 10129 10130 // Get user vectorization factor. 10131 ElementCount UserVF = Hints.getWidth(); 10132 10133 CM.collectElementTypesForWidening(); 10134 10135 // Plan how to best vectorize, return the best VF and its cost. 
10136 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10137 10138 // If we are stress testing VPlan builds, do not attempt to generate vector 10139 // code. Masked vector code generation support will follow soon. 10140 // Also, do not attempt to vectorize if no vector code will be produced. 10141 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 10142 return false; 10143 10144 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 10145 10146 { 10147 bool AddBranchWeights = 10148 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 10149 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), 10150 AddBranchWeights); 10151 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10152 VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan); 10153 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10154 << L->getHeader()->getParent()->getName() << "\"\n"); 10155 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 10156 } 10157 10158 reportVectorization(ORE, L, VF, 1); 10159 10160 // Mark the loop as already vectorized to avoid vectorizing again. 10161 Hints.setAlreadyVectorized(); 10162 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10163 return true; 10164 } 10165 10166 // Emit a remark if there are stores to floats that required a floating point 10167 // extension. If the vectorized loop was generated with floating point there 10168 // will be a performance penalty from the conversion overhead and the change in 10169 // the vector width. 10170 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10171 SmallVector<Instruction *, 4> Worklist; 10172 for (BasicBlock *BB : L->getBlocks()) { 10173 for (Instruction &Inst : *BB) { 10174 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10175 if (S->getValueOperand()->getType()->isFloatTy()) 10176 Worklist.push_back(S); 10177 } 10178 } 10179 } 10180 10181 // Traverse the floating point stores upwards searching, for floating point 10182 // conversions. 10183 SmallPtrSet<const Instruction *, 4> Visited; 10184 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10185 while (!Worklist.empty()) { 10186 auto *I = Worklist.pop_back_val(); 10187 if (!L->contains(I)) 10188 continue; 10189 if (!Visited.insert(I).second) 10190 continue; 10191 10192 // Emit a remark if the floating point store required a floating 10193 // point conversion. 10194 // TODO: More work could be done to identify the root cause such as a 10195 // constant or a function return type and point the user to it. 10196 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10197 ORE->emit([&]() { 10198 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10199 I->getDebugLoc(), L->getHeader()) 10200 << "floating point conversion changes vector width. " 10201 << "Mixed floating point precision requires an up/down " 10202 << "cast that will negatively impact performance."; 10203 }); 10204 10205 for (Use &Op : I->operands()) 10206 if (auto *OpI = dyn_cast<Instruction>(Op)) 10207 Worklist.push_back(OpI); 10208 } 10209 } 10210 10211 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 10212 VectorizationFactor &VF, Loop *L, 10213 const TargetTransformInfo &TTI, 10214 PredicatedScalarEvolution &PSE, 10215 ScalarEpilogueLowering SEL) { 10216 InstructionCost CheckCost = Checks.getCost(); 10217 if (!CheckCost.isValid()) 10218 return false; 10219 10220 // When interleaving only scalar and vector cost will be equal, which in turn 10221 // would lead to a divide by 0. Fall back to hard threshold. 
10222 if (VF.Width.isScalar()) { 10223 if (CheckCost > VectorizeMemoryCheckThreshold) { 10224 LLVM_DEBUG( 10225 dbgs() 10226 << "LV: Interleaving only is not profitable due to runtime checks\n"); 10227 return false; 10228 } 10229 return true; 10230 } 10231 10232 // The scalar cost should only be 0 when vectorizing with a user-specified VF/IC. In those cases, runtime checks should always be generated. 10233 uint64_t ScalarC = *VF.ScalarCost.getValue(); 10234 if (ScalarC == 0) 10235 return true; 10236 10237 // First, compute the minimum iteration count required so that the vector 10238 // loop outperforms the scalar loop. 10239 // The total cost of the scalar loop is 10240 // ScalarC * TC 10241 // where 10242 // * TC is the actual trip count of the loop. 10243 // * ScalarC is the cost of a single scalar iteration. 10244 // 10245 // The total cost of the vector loop is 10246 // RtC + VecC * (TC / VF) + EpiC 10247 // where 10248 // * RtC is the cost of the generated runtime checks 10249 // * VecC is the cost of a single vector iteration. 10250 // * TC is the actual trip count of the loop 10251 // * VF is the vectorization factor 10252 // * EpiC is the cost of the generated epilogue, including the cost 10253 // of the remaining scalar operations. 10254 // 10255 // Vectorization is profitable once the total vector cost is less than the 10256 // total scalar cost: 10257 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 10258 // 10259 // Now we can compute the minimum required trip count TC as 10260 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC 10261 // 10262 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 10263 // the computations are performed with integer arithmetic and the result 10264 // is rounded up, hence we get an upper estimate of the TC. 10265 unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width); 10266 uint64_t RtC = *CheckCost.getValue(); 10267 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); 10268 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); 10269 10270 // Second, compute a minimum iteration count so that the cost of the 10271 // runtime checks is only a fraction of the total scalar loop cost. This 10272 // adds a loop-dependent bound on the overhead incurred if the runtime 10273 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 10274 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 10275 // cost, compute 10276 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 10277 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC); 10278 10279 // Now pick the larger minimum. If it is not a multiple of VF and a scalar 10280 // epilogue is allowed, choose the next closest multiple of VF. This should 10281 // partly compensate for ignoring the epilogue cost. 10282 uint64_t MinTC = std::max(MinTC1, MinTC2); 10283 if (SEL == CM_ScalarEpilogueAllowed) 10284 MinTC = alignTo(MinTC, IntVF); 10285 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); 10286 10287 LLVM_DEBUG( 10288 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 10289 << VF.MinProfitableTripCount << "\n"); 10290 10291 // Skip vectorization if the expected trip count is less than the minimum 10292 // required trip count.
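  // For illustration only, with hypothetical costs ScalarC = 4, VecC = 20,
  // IntVF = 8 and RtC = 56: MinTC1 = ceil(56 * 8 / (4 * 8 - 20)) = 38 and
  // MinTC2 = ceil(56 * 10 / 4) = 140, so MinTC = 140 (rounded up to 144 when
  // a scalar epilogue is allowed); only loops expected to run at least that
  // many iterations keep their runtime checks.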
10293 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) { 10294 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10295 VF.MinProfitableTripCount)) { 10296 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10297 "trip count < minimum profitable VF (" 10298 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10299 << ")\n"); 10300 10301 return false; 10302 } 10303 } 10304 return true; 10305 } 10306 10307 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10308 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10309 !EnableLoopInterleaving), 10310 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10311 !EnableLoopVectorization) {} 10312 10313 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue 10314 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that 10315 /// don't have a corresponding wide induction in \p EpiPlan. 10316 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { 10317 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those 10318 // will need their resume-values computed in the main vector loop. Others 10319 // can be removed from the main VPlan. 10320 SmallPtrSet<PHINode *, 2> EpiWidenedPhis; 10321 for (VPRecipeBase &R : 10322 EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 10323 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10324 continue; 10325 EpiWidenedPhis.insert( 10326 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue())); 10327 } 10328 for (VPRecipeBase &R : make_early_inc_range( 10329 *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) { 10330 auto *VPIRInst = cast<VPIRInstruction>(&R); 10331 auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction()); 10332 if (!IRI) 10333 break; 10334 if (EpiWidenedPhis.contains(IRI)) 10335 continue; 10336 // There is no corresponding wide induction in the epilogue plan that would 10337 // need a resume value. Remove the VPIRInst wrapping the scalar header phi 10338 // together with the corresponding ResumePhi. The resume values for the 10339 // scalar loop will be created during execution of EpiPlan. 10340 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe(); 10341 VPIRInst->eraseFromParent(); 10342 ResumePhi->eraseFromParent(); 10343 } 10344 VPlanTransforms::removeDeadRecipes(MainPlan); 10345 10346 using namespace VPlanPatternMatch; 10347 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader(); 10348 VPValue *VectorTC = &MainPlan.getVectorTripCount(); 10349 // If there is a suitable resume value for the canonical induction in the 10350 // scalar (which will become vector) epilogue loop we are done. Otherwise 10351 // create it below. 10352 if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) { 10353 return match(&R, m_VPInstruction<VPInstruction::ResumePhi>( 10354 m_Specific(VectorTC), m_SpecificInt(0))); 10355 })) 10356 return; 10357 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin()); 10358 ScalarPHBuilder.createNaryOp( 10359 VPInstruction::ResumePhi, 10360 {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {}, 10361 "vec.epilog.resume.val"); 10362 } 10363 10364 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded 10365 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. 
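/// (Both adjustments are needed because the epilogue VPlan executes after the
/// main vector loop: its live-ins must reuse values that dominate both the
/// scalar and vector epilogue loops, and its header phis must resume from
/// where the main vector loop stopped.)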
10366 static void 10367 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, 10368 const SCEV2ValueTy &ExpandedSCEVs, 10369 const EpilogueLoopVectorizationInfo &EPI) { 10370 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); 10371 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10372 Header->setName("vec.epilog.vector.body"); 10373 10374 // Re-use the trip count and steps expanded for the main loop, as 10375 // skeleton creation needs it as a value that dominates both the scalar 10376 // and vector epilogue loops 10377 // TODO: This is a workaround needed for epilogue vectorization and it 10378 // should be removed once induction resume value creation is done 10379 // directly in VPlan. 10380 for (auto &R : make_early_inc_range(*Plan.getEntry())) { 10381 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R); 10382 if (!ExpandR) 10383 continue; 10384 auto *ExpandedVal = 10385 Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second); 10386 ExpandR->replaceAllUsesWith(ExpandedVal); 10387 if (Plan.getTripCount() == ExpandR) 10388 Plan.resetTripCount(ExpandedVal); 10389 ExpandR->eraseFromParent(); 10390 } 10391 10392 // Ensure that the start values for all header phi recipes are updated before 10393 // vectorizing the epilogue loop. 10394 for (VPRecipeBase &R : Header->phis()) { 10395 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) { 10396 // When vectorizing the epilogue loop, the canonical induction start 10397 // value needs to be changed from zero to the value after the main 10398 // vector loop. Find the resume value created during execution of the main 10399 // VPlan. 10400 // FIXME: Improve modeling for canonical IV start values in the epilogue 10401 // loop. 10402 BasicBlock *MainMiddle = find_singleton<BasicBlock>( 10403 predecessors(L->getLoopPreheader()), 10404 [&EPI](BasicBlock *BB, bool) -> BasicBlock * { 10405 if (BB != EPI.MainLoopIterationCountCheck && 10406 BB != EPI.EpilogueIterationCountCheck && 10407 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck) 10408 return BB; 10409 return nullptr; 10410 }); 10411 using namespace llvm::PatternMatch; 10412 Type *IdxTy = IV->getScalarType(); 10413 PHINode *EPResumeVal = find_singleton<PHINode>( 10414 L->getLoopPreheader()->phis(), 10415 [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * { 10416 if (P.getType() == IdxTy && 10417 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount && 10418 match( 10419 P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck), 10420 m_SpecificInt(0))) 10421 return &P; 10422 return nullptr; 10423 }); 10424 assert(EPResumeVal && "must have a resume value for the canonical IV"); 10425 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); 10426 assert(all_of(IV->users(), 10427 [](const VPUser *U) { 10428 return isa<VPScalarIVStepsRecipe>(U) || 10429 isa<VPScalarCastRecipe>(U) || 10430 isa<VPDerivedIVRecipe>(U) || 10431 cast<VPInstruction>(U)->getOpcode() == 10432 Instruction::Add; 10433 }) && 10434 "the canonical IV should only be used by its increment or " 10435 "ScalarIVSteps when resetting the start value"); 10436 IV->setOperand(0, VPV); 10437 continue; 10438 } 10439 10440 Value *ResumeV = nullptr; 10441 // TODO: Move setting of resume values to prepareToExecute. 
10442 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10443 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr()) 10444 ->getIncomingValueForBlock(L->getLoopPreheader()); 10445 const RecurrenceDescriptor &RdxDesc = 10446 ReductionPhi->getRecurrenceDescriptor(); 10447 RecurKind RK = RdxDesc.getRecurrenceKind(); 10448 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 10449 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as 10450 // start value; compare the final value from the main vector loop 10451 // to the start value. 10452 IRBuilder<> Builder( 10453 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI()); 10454 ResumeV = 10455 Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); 10456 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { 10457 // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment 10458 // to the resume value. The resume value is adjusted to the sentinel 10459 // value when the final value from the main vector loop equals the start 10460 // value. This ensures correctness when the start value might not be 10461 // less than the minimum value of a monotonically increasing induction 10462 // variable. 10463 IRBuilder<> Builder( 10464 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI()); 10465 Value *Cmp = 10466 Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue()); 10467 ResumeV = 10468 Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV); 10469 } 10470 } else { 10471 // Retrieve the induction resume values for wide inductions from 10472 // their original phi nodes in the scalar loop. 10473 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode(); 10474 // Hook up to the PHINode generated by a ResumePhi recipe of main 10475 // loop VPlan, which feeds the scalar loop. 10476 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader()); 10477 } 10478 assert(ResumeV && "Must have a resume value"); 10479 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV); 10480 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10481 } 10482 } 10483 10484 bool LoopVectorizePass::processLoop(Loop *L) { 10485 assert((EnableVPlanNativePath || L->isInnermost()) && 10486 "VPlan-native path is not enabled. Only process inner loops."); 10487 10488 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10489 << L->getHeader()->getParent()->getName() << "' from " 10490 << L->getLocStr() << "\n"); 10491 10492 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10493 10494 LLVM_DEBUG( 10495 dbgs() << "LV: Loop hints:" 10496 << " force=" 10497 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10498 ? "disabled" 10499 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10500 ? "enabled" 10501 : "?")) 10502 << " width=" << Hints.getWidth() 10503 << " interleave=" << Hints.getInterleave() << "\n"); 10504 10505 // Function containing loop 10506 Function *F = L->getHeader()->getParent(); 10507 10508 // Looking at the diagnostic output is the only way to determine if a loop 10509 // was vectorized (other than looking at the IR or machine code), so it 10510 // is important to generate an optimization remark for each loop. Most of 10511 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10512 // generated as OptimizationRemark and OptimizationRemarkMissed are 10513 // less verbose reporting vectorized loops and unvectorized loops that may 10514 // benefit from vectorization, respectively. 
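  // (These remarks can be surfaced with, e.g., -Rpass=loop-vectorize,
  // -Rpass-missed=loop-vectorize and -Rpass-analysis=loop-vectorize in clang,
  // or the corresponding -pass-remarks* options in opt.)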
10515 10516 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10517 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10518 return false; 10519 } 10520 10521 PredicatedScalarEvolution PSE(*SE, *L); 10522 10523 // Check if it is legal to vectorize the loop. 10524 LoopVectorizationRequirements Requirements; 10525 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 10526 &Requirements, &Hints, DB, AC, BFI, PSI); 10527 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10528 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10529 Hints.emitRemarkWithHints(); 10530 return false; 10531 } 10532 10533 if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { 10534 reportVectorizationFailure("Auto-vectorization of loops with uncountable " 10535 "early exit is not enabled", 10536 "UncountableEarlyExitLoopsDisabled", ORE, L); 10537 return false; 10538 } 10539 10540 if (LVL.hasStructVectorCall()) { 10541 reportVectorizationFailure("Auto-vectorization of calls that return struct " 10542 "types is not yet supported", 10543 "StructCallVectorizationUnsupported", ORE, L); 10544 return false; 10545 } 10546 10547 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10548 // here. They may require CFG and instruction level transformations before 10549 // even evaluating whether vectorization is profitable. Since we cannot modify 10550 // the incoming IR, we need to build VPlan upfront in the vectorization 10551 // pipeline. 10552 if (!L->isInnermost()) 10553 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10554 ORE, BFI, PSI, Hints, Requirements); 10555 10556 assert(L->isInnermost() && "Inner loop expected."); 10557 10558 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10559 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10560 10561 // If an override option has been passed in for interleaved accesses, use it. 10562 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10563 UseInterleaved = EnableInterleavedMemAccesses; 10564 10565 // Analyze interleaved memory accesses. 10566 if (UseInterleaved) 10567 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10568 10569 if (LVL.hasUncountableEarlyExit()) { 10570 BasicBlock *LoopLatch = L->getLoopLatch(); 10571 if (IAI.requiresScalarEpilogue() || 10572 any_of(LVL.getCountableExitingBlocks(), 10573 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) { 10574 reportVectorizationFailure("Auto-vectorization of early exit loops " 10575 "requiring a scalar epilogue is unsupported", 10576 "UncountableEarlyExitUnsupported", ORE, L); 10577 return false; 10578 } 10579 } 10580 10581 // Check the function attributes and profiles to find out if this function 10582 // should be optimized for size. 10583 ScalarEpilogueLowering SEL = 10584 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); 10585 10586 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10587 // count by optimizing for size, to minimize overheads. 10588 auto ExpectedTC = getSmallBestKnownTC(PSE, L); 10589 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10590 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10591 << "This loop is worth vectorizing only if no scalar " 10592 << "iteration overheads are incurred."); 10593 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10594 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10595 else { 10596 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { 10597 LLVM_DEBUG(dbgs() << "\n"); 10598 // Predicate tail-folded loops are efficient even when the loop 10599 // iteration count is low. However, setting the epilogue policy to 10600 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops 10601 // with runtime checks. It's more effective to let 10602 // `areRuntimeChecksProfitable` determine if vectorization is beneficial 10603 // for the loop. 10604 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) 10605 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10606 } else { 10607 LLVM_DEBUG(dbgs() << " But the target considers the trip count too " 10608 "small to consider vectorizing.\n"); 10609 reportVectorizationFailure( 10610 "The trip count is below the minial threshold value.", 10611 "loop trip count is too low, avoiding vectorization", 10612 "LowTripCount", ORE, L); 10613 Hints.emitRemarkWithHints(); 10614 return false; 10615 } 10616 } 10617 } 10618 10619 // Check the function attributes to see if implicit floats or vectors are 10620 // allowed. 10621 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10622 reportVectorizationFailure( 10623 "Can't vectorize when the NoImplicitFloat attribute is used", 10624 "loop not vectorized due to NoImplicitFloat attribute", 10625 "NoImplicitFloat", ORE, L); 10626 Hints.emitRemarkWithHints(); 10627 return false; 10628 } 10629 10630 // Check if the target supports potentially unsafe FP vectorization. 10631 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10632 // for the target we're vectorizing for, to make sure none of the 10633 // additional fp-math flags can help. 10634 if (Hints.isPotentiallyUnsafe() && 10635 TTI->isFPVectorizationPotentiallyUnsafe()) { 10636 reportVectorizationFailure( 10637 "Potentially unsafe FP op prevents vectorization", 10638 "loop not vectorized due to unsafe FP support.", 10639 "UnsafeFP", ORE, L); 10640 Hints.emitRemarkWithHints(); 10641 return false; 10642 } 10643 10644 bool AllowOrderedReductions; 10645 // If the flag is set, use that instead and override the TTI behaviour. 10646 if (ForceOrderedReductions.getNumOccurrences() > 0) 10647 AllowOrderedReductions = ForceOrderedReductions; 10648 else 10649 AllowOrderedReductions = TTI->enableOrderedReductions(); 10650 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10651 ORE->emit([&]() { 10652 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10653 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10654 ExactFPMathInst->getDebugLoc(), 10655 ExactFPMathInst->getParent()) 10656 << "loop not vectorized: cannot prove it is safe to reorder " 10657 "floating-point operations"; 10658 }); 10659 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10660 "reorder floating-point operations\n"); 10661 Hints.emitRemarkWithHints(); 10662 return false; 10663 } 10664 10665 // Use the cost model. 10666 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10667 F, &Hints, IAI); 10668 // Use the planner for vectorization. 10669 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, 10670 ORE); 10671 10672 // Get user vectorization factor and interleave count. 
10673 ElementCount UserVF = Hints.getWidth(); 10674 unsigned UserIC = Hints.getInterleave(); 10675 10676 // Plan how to best vectorize. 10677 LVP.plan(UserVF, UserIC); 10678 VectorizationFactor VF = LVP.computeBestVF(); 10679 unsigned IC = 1; 10680 10681 if (ORE->allowExtraAnalysis(LV_NAME)) 10682 LVP.emitInvalidCostRemarks(ORE); 10683 10684 bool AddBranchWeights = 10685 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 10686 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), 10687 AddBranchWeights); 10688 if (LVP.hasPlanWithVF(VF.Width)) { 10689 // Select the interleave count. 10690 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 10691 10692 unsigned SelectedIC = std::max(IC, UserIC); 10693 // Optimistically generate runtime checks if they are needed. Drop them if 10694 // they turn out not to be profitable. 10695 if (VF.Width.isVector() || SelectedIC > 1) 10696 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10697 10698 // Check if it is profitable to vectorize with runtime checks. 10699 bool ForceVectorization = 10700 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10701 if (!ForceVectorization && 10702 !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) { 10703 ORE->emit([&]() { 10704 return OptimizationRemarkAnalysisAliasing( 10705 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10706 L->getHeader()) 10707 << "loop not vectorized: cannot prove it is safe to reorder " 10708 "memory operations"; 10709 }); 10710 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10711 Hints.emitRemarkWithHints(); 10712 return false; 10713 } 10714 } 10715 10716 // Identify the diagnostic messages that should be produced. 10717 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10718 bool VectorizeLoop = true, InterleaveLoop = true; 10719 if (VF.Width.isScalar()) { 10720 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10721 VecDiagMsg = std::make_pair( 10722 "VectorizationNotBeneficial", 10723 "the cost-model indicates that vectorization is not beneficial"); 10724 VectorizeLoop = false; 10725 } 10726 10727 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { 10728 // Tell the user interleaving was avoided up-front, despite being explicitly 10729 // requested. 10730 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10731 "interleaving should be avoided up front\n"); 10732 IntDiagMsg = std::make_pair( 10733 "InterleavingAvoided", 10734 "Ignoring UserIC, because interleaving was avoided up front"); 10735 InterleaveLoop = false; 10736 } else if (IC == 1 && UserIC <= 1) { 10737 // Tell the user interleaving is not beneficial. 10738 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10739 IntDiagMsg = std::make_pair( 10740 "InterleavingNotBeneficial", 10741 "the cost-model indicates that interleaving is not beneficial"); 10742 InterleaveLoop = false; 10743 if (UserIC == 1) { 10744 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10745 IntDiagMsg.second += 10746 " and is explicitly disabled or interleave count is set to 1"; 10747 } 10748 } else if (IC > 1 && UserIC == 1) { 10749 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10750 LLVM_DEBUG( 10751 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10752 IntDiagMsg = std::make_pair( 10753 "InterleavingBeneficialButDisabled", 10754 "the cost-model indicates that interleaving is beneficial " 10755 "but is explicitly disabled or interleave count is set to 1"); 10756 InterleaveLoop = false; 10757 } 10758 10759 // If there is a histogram in the loop, do not just interleave without 10760 // vectorizing. The order of operations will be incorrect without the 10761 // histogram intrinsics, which are only used for recipes with VF > 1. 10762 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) { 10763 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due " 10764 << "to histogram operations.\n"); 10765 IntDiagMsg = std::make_pair( 10766 "HistogramPreventsScalarInterleaving", 10767 "Unable to interleave without vectorization due to constraints on " 10768 "the order of histogram operations"); 10769 InterleaveLoop = false; 10770 } 10771 10772 // Override IC if user provided an interleave count. 10773 IC = UserIC > 0 ? UserIC : IC; 10774 10775 // Emit diagnostic messages, if any. 10776 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10777 if (!VectorizeLoop && !InterleaveLoop) { 10778 // Do not vectorize or interleave the loop. 10779 ORE->emit([&]() { 10780 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10781 L->getStartLoc(), L->getHeader()) 10782 << VecDiagMsg.second; 10783 }); 10784 ORE->emit([&]() { 10785 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10786 L->getStartLoc(), L->getHeader()) 10787 << IntDiagMsg.second; 10788 }); 10789 return false; 10790 } 10791 10792 if (!VectorizeLoop && InterleaveLoop) { 10793 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10794 ORE->emit([&]() { 10795 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10796 L->getStartLoc(), L->getHeader()) 10797 << VecDiagMsg.second; 10798 }); 10799 } else if (VectorizeLoop && !InterleaveLoop) { 10800 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10801 << ") in " << L->getLocStr() << '\n'); 10802 ORE->emit([&]() { 10803 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10804 L->getStartLoc(), L->getHeader()) 10805 << IntDiagMsg.second; 10806 }); 10807 } else if (VectorizeLoop && InterleaveLoop) { 10808 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10809 << ") in " << L->getLocStr() << '\n'); 10810 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10811 } 10812 10813 bool DisableRuntimeUnroll = false; 10814 MDNode *OrigLoopID = L->getLoopID(); 10815 { 10816 using namespace ore; 10817 if (!VectorizeLoop) { 10818 assert(IC > 1 && "interleave count should not be 1 or 0"); 10819 // If we decided that it is not profitable to vectorize the loop, then 10820 // interleave it. 10821 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 10822 InnerLoopVectorizer Unroller( 10823 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), 10824 ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan); 10825 10826 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10827 10828 ORE->emit([&]() { 10829 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10830 L->getHeader()) 10831 << "interleaved loop (interleaved count: " 10832 << NV("InterleaveCount", IC) << ")"; 10833 }); 10834 } else { 10835 // If we decided that it is *profitable* to vectorize the loop, then do it.
10836 10837 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 10838 // Consider vectorizing the epilogue too if it's profitable. 10839 VectorizationFactor EpilogueVF = 10840 LVP.selectEpilogueVectorizationFactor(VF.Width, IC); 10841 if (EpilogueVF.Width.isVector()) { 10842 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate()); 10843 10844 // The first pass vectorizes the main loop and creates a scalar epilogue 10845 // to be vectorized by executing the plan (potentially with a different 10846 // factor) again shortly afterwards. 10847 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); 10848 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); 10849 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, 10850 BestEpiPlan); 10851 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10852 EPI, &LVL, &CM, BFI, PSI, Checks, 10853 *BestMainPlan); 10854 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, 10855 *BestMainPlan, MainILV, DT, false); 10856 ++LoopsVectorized; 10857 10858 // Second pass vectorizes the epilogue and adjusts the control flow 10859 // edges from the first pass. 10860 EPI.MainLoopVF = EPI.EpilogueVF; 10861 EPI.MainLoopUF = EPI.EpilogueUF; 10862 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10863 ORE, EPI, &LVL, &CM, BFI, PSI, 10864 Checks, BestEpiPlan); 10865 EpilogILV.setTripCount(MainILV.getTripCount()); 10866 preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); 10867 10868 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10869 DT, true, &ExpandedSCEVs); 10870 ++LoopsEpilogueVectorized; 10871 10872 if (!MainILV.areSafetyChecksAdded()) 10873 DisableRuntimeUnroll = true; 10874 } else { 10875 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10876 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10877 PSI, Checks, BestPlan); 10878 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10879 ++LoopsVectorized; 10880 10881 // Add metadata to disable runtime unrolling a scalar loop when there 10882 // are no runtime checks about strides and memory. A scalar loop that is 10883 // rarely used is not worth unrolling. 10884 if (!LB.areSafetyChecksAdded()) 10885 DisableRuntimeUnroll = true; 10886 } 10887 // Report the vectorization decision. 10888 reportVectorization(ORE, L, VF, IC); 10889 } 10890 10891 if (ORE->allowExtraAnalysis(LV_NAME)) 10892 checkMixedPrecision(L, ORE); 10893 } 10894 10895 assert(DT->verify(DominatorTree::VerificationLevel::Fast) && 10896 "DT not preserved correctly"); 10897 10898 std::optional<MDNode *> RemainderLoopID = 10899 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10900 LLVMLoopVectorizeFollowupEpilogue}); 10901 if (RemainderLoopID) { 10902 L->setLoopID(*RemainderLoopID); 10903 } else { 10904 if (DisableRuntimeUnroll) 10905 addRuntimeUnrollDisableMetaData(L); 10906 10907 // Mark the loop as already vectorized to avoid vectorizing again. 10908 Hints.setAlreadyVectorized(); 10909 } 10910 10911 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10912 return true; 10913 } 10914 10915 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) { 10916 10917 // Don't attempt if 10918 // 1. the target claims to have no vector registers, and 10919 // 2. interleaving won't help ILP. 10920 // 10921 // The second condition is necessary because, even if the target has no 10922 // vector registers, loop vectorization may still enable scalar 10923 // interleaving. 
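  // (For example, interleaving a scalar reduction by two yields independent
  // partial sums whose dependence chains can issue in parallel, even on a
  // target without any vector unit.)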
10924 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10925 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2) 10926 return LoopVectorizeResult(false, false); 10927 10928 bool Changed = false, CFGChanged = false; 10929 10930 // The vectorizer requires loops to be in simplified form. 10931 // Since simplification may add new inner loops, it has to run before the 10932 // legality and profitability checks. This means running the loop vectorizer 10933 // will simplify all loops, regardless of whether anything ends up being 10934 // vectorized. 10935 for (const auto &L : *LI) 10936 Changed |= CFGChanged |= 10937 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10938 10939 // Build up a worklist of inner-loops to vectorize. This is necessary as 10940 // the act of vectorizing or partially unrolling a loop creates new loops 10941 // and can invalidate iterators across the loops. 10942 SmallVector<Loop *, 8> Worklist; 10943 10944 for (Loop *L : *LI) 10945 collectSupportedLoops(*L, LI, ORE, Worklist); 10946 10947 LoopsAnalyzed += Worklist.size(); 10948 10949 // Now walk the identified inner loops. 10950 while (!Worklist.empty()) { 10951 Loop *L = Worklist.pop_back_val(); 10952 10953 // For the inner loops we actually process, form LCSSA to simplify the 10954 // transform. 10955 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10956 10957 Changed |= CFGChanged |= processLoop(L); 10958 10959 if (Changed) { 10960 LAIs->clear(); 10961 10962 #ifndef NDEBUG 10963 if (VerifySCEV) 10964 SE->verify(); 10965 #endif 10966 } 10967 } 10968 10969 // Process each loop nest in the function. 10970 return LoopVectorizeResult(Changed, CFGChanged); 10971 } 10972 10973 PreservedAnalyses LoopVectorizePass::run(Function &F, 10974 FunctionAnalysisManager &AM) { 10975 LI = &AM.getResult<LoopAnalysis>(F); 10976 // There are no loops in the function. Return before computing other 10977 // expensive analyses. 10978 if (LI->empty()) 10979 return PreservedAnalyses::all(); 10980 SE = &AM.getResult<ScalarEvolutionAnalysis>(F); 10981 TTI = &AM.getResult<TargetIRAnalysis>(F); 10982 DT = &AM.getResult<DominatorTreeAnalysis>(F); 10983 TLI = &AM.getResult<TargetLibraryAnalysis>(F); 10984 AC = &AM.getResult<AssumptionAnalysis>(F); 10985 DB = &AM.getResult<DemandedBitsAnalysis>(F); 10986 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10987 LAIs = &AM.getResult<LoopAccessAnalysis>(F); 10988 10989 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10990 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10991 BFI = nullptr; 10992 if (PSI && PSI->hasProfileSummary()) 10993 BFI = &AM.getResult<BlockFrequencyAnalysis>(F); 10994 LoopVectorizeResult Result = runImpl(F); 10995 if (!Result.MadeAnyChange) 10996 return PreservedAnalyses::all(); 10997 PreservedAnalyses PA; 10998 10999 if (isAssignmentTrackingEnabled(*F.getParent())) { 11000 for (auto &BB : F) 11001 RemoveRedundantDbgInstrs(&BB); 11002 } 11003 11004 PA.preserve<LoopAnalysis>(); 11005 PA.preserve<DominatorTreeAnalysis>(); 11006 PA.preserve<ScalarEvolutionAnalysis>(); 11007 PA.preserve<LoopAccessAnalysis>(); 11008 11009 if (Result.MadeCFGChange) { 11010 // Making CFG changes likely means a loop got vectorized. Indicate that 11011 // extra simplification passes should be run. 11012 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only 11013 // be run if runtime checks have been added.
11014 AM.getResult<ShouldRunExtraVectorPasses>(F); 11015 PA.preserve<ShouldRunExtraVectorPasses>(); 11016 } else { 11017 PA.preserveSet<CFGAnalyses>(); 11018 } 11019 return PA; 11020 } 11021 11022 void LoopVectorizePass::printPipeline( 11023 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 11024 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 11025 OS, MapClassName2PassName); 11026 11027 OS << '<'; 11028 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 11029 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 11030 OS << '>'; 11031 } 11032
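// With the default options, -print-pipeline-passes would render this pass as
// something like
// "loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>".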