//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
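//
// As a rough illustration of the 'wide' iteration transformation described
// above (a hand-written sketch in C-like pseudocode, not output produced by
// this pass; it assumes a vectorization factor of 4 and no aliasing), a loop
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually turned into a vector loop plus a scalar remainder:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     a[i:i+4] = b[i:i+4] + c[i:i+4];   // one SIMD add per wide iteration
//   for (; i < n; ++i)                  // scalar epilogue
//     a[i] = b[i] + c[i];
//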
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
"llvm/Transforms/Utils/SizeOpts.h" 144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 145 #include <algorithm> 146 #include <cassert> 147 #include <cstdint> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold( 202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks")); 204 205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 206 // that predication is preferred, and this lists all options. I.e., the 207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 208 // and predicate the instructions accordingly. 
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
                   "data-and-control-without-rt-check",
                   "Similar to data-and-control, but remove the runtime check"),
        clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
                   "Use predicated EVL instructions for tail folding. If EVL "
                   "is unsupported, fall back to data-without-lane-mask.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
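/// As an illustrative example (not a case handled specially here): a loop that
/// loads only fields 'a' and 'c' of consecutive 'struct S { int a, b, c; }'
/// elements forms an interleave group with a gap at 'b'; the wide load covers
/// the unused 'b' lanes, so those lanes must be masked away (or the group must
/// not be formed) to avoid accessing memory the original loop never touches.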
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in "
             "a loop"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
} // namespace llvm

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

static cl::opt<bool> EnableEarlyExitVectorization(
    "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
    cl::desc(
        "Enable vectorization of early exit loops with uncountable exits."));

// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned>
getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
                    bool CanUseConstantMax = true) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  if (!CanUseConstantMax)
    return std::nullopt;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
    return ExpectedTC;

  return std::nullopt;
}

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
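///
/// The control flow generated around the vectorized loop is roughly the
/// following (a simplified sketch; which check blocks exist depends on the
/// runtime checks that are actually required). Failing checks branch to the
/// scalar preheader, bypassing the vector loop:
///
///     [iteration count check]
///     [SCEV checks]
///     [memory runtime checks]
///     [vector preheader]
///     [vector loop body]
///     [middle block]
///     [scalar preheader]
///     [scalar (epilogue) loop]
///     [exit block]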
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
                      VPlan &Plan)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth),
        MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks), Plan(Plan),
        VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop. In the case of epilogue vectorization, this function is overridden
  /// to handle the more complex control flow around the loops. \p ExpandedSCEVs
  /// is used to look up SCEV expansions for expressions needed during skeleton
  /// creation.
  virtual BasicBlock *
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);

  /// Fix the vectorized code, taking care of header phi's, and more.
  void fixVectorizedLoop(VPTransformState &State);

  /// Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for lane \p Lane. Uses the
  /// VPValue operands from \p RepRecipe instead of \p Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe, const VPLane &Lane,
                            VPTransformState &State);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

  /// Retrieve the additional bypass value associated with an original
  /// induction header phi.
  Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
    return Induction2AdditionalBypassValue.at(OrigPhi);
  }

  /// Return the additional bypass block which targets the scalar loop by
  /// skipping the epilogue loop after completing the main loop.
  BasicBlock *getAdditionalBypassBlock() const {
    assert(AdditionalBypassBlock &&
           "Trying to access AdditionalBypassBlock but it has not been set");
    return AdditionalBypassBlock;
  }

protected:
  friend class LoopVectorizationPlanner;

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create and record the values for induction variables to resume coming
  /// from the additional bypass block.
  void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
                                             Value *MainVectorTripCount);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
  /// vector preheader and its predecessor, also connecting the new block to the
  /// scalar preheader.
  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  /// Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  /// Whether this loop should be optimized for size based on profile guided
  /// size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  /// Mapping of induction phis to their additional bypass values. They
  /// need to be added as operands to phi nodes in the scalar loop preheader
  /// after the epilogue skeleton has been created.
  DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;

  /// The additional bypass block which conditionally skips over the epilogue
  /// loop after executing the main loop. Needed to resume inductions and
  /// reductions during epilogue vectorization.
  BasicBlock *AdditionalBypassBlock = nullptr;

  VPlan &Plan;

  /// The vector preheader block of \p Plan, used as target for check blocks
  /// introduced during skeleton creation.
  VPBlockBase *VectorPHVPB;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;
  VPlan &EpiloguePlan;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF,
                                VPlan &EpiloguePlan)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
        EpiloguePlan(EpiloguePlan) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
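///
/// With both passes applied, the generated code is laid out roughly as follows
/// (a simplified sketch; the runtime safety checks created for the main loop
/// are reused rather than re-emitted for the epilogue):
///
///     [main loop iteration count check]
///     [main vector loop (MainLoopVF x MainLoopUF)]
///     [epilogue iteration count check]
///     [epilogue vector loop (EpilogueVF x EpilogueUF)]
///     [scalar remainder loop]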
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks, Plan),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check, Plan) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks, Plan) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its operands.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return DebugLoc();

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I->getDebugLoc();

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst->getDebugLoc();
  }

  return I->getDebugLoc();
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. If \p DL is passed, use it as debug location
/// for the remark. \return the remark object that can be streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I, DebugLoc DL = {}) {
  Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
  // If debug location is attached to the instruction, use it. Otherwise if DL
  // was not provided, use the loop's.
  if (I && I->getDebugLoc())
    DL = I->getDebugLoc();
  else if (!DL)
    DL = TheLoop->getStartLoc();

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
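/// For example (illustrative only): with a fixed VF of 4 and Step 2 this
/// returns the constant 8, while with a scalable VF of <vscale x 4> it returns
/// the runtime value vscale * 8.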
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr,
                                    DebugLoc DL = {}) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
                             I, DL)
            << Msg);
}

/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}

} // end namespace llvm

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

using InstructionVFPair = std::pair<Instruction *, ElementCount>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
  friend class LoopVectorizationPlanner;

public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Set up cost-based decisions for the user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(I))
      return false;

    if (VF.isScalar())
      return true;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
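  /// (Note the distinction from uniformity: a scalarized, predicated div, for
  /// instance, is scalar after vectorization but produces VF distinct scalar
  /// values per wide iteration, whereas a uniform value is a single scalar
  /// shared by all lanes.)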
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    if (VF.isScalar())
      return true;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.contains(I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize,
    CM_VectorCall,
    CM_IntrinsicCall
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// When interleaving, the cost will only be assigned to one instruction,
    /// the insert position. For other cases, add the appropriate fraction of
    /// the total cost to each instruction. This ensures accurate costs are
    /// used, even if the insert position instruction is not used.
    InstructionCost InsertPosCost = Cost;
    InstructionCost OtherMemberCost = 0;
    if (W != CM_Interleave)
      OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
    for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
      if (auto *I = Grp->getMember(Idx)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] =
              std::make_pair(W, InsertPosCost);
        else
          WideningDecisions[std::make_pair(I, VF)] =
              std::make_pair(W, OtherMemberCost);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  struct CallWideningDecision {
    InstWidening Kind;
    Function *Variant;
    Intrinsic::ID IID;
    std::optional<unsigned> MaskPos;
    InstructionCost Cost;
  };

  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
                                                     MaskPos, Cost};
  }

  CallWideningDecision getCallWideningDecision(CallInst *CI,
                                               ElementCount VF) const {
    assert(!VF.isScalar() && "Expected vector VF");
    return CallWideningDecisions.at(std::make_pair(CI, VF));
  }

  /// Return true if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.contains(VF))
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    setVectorizedCallDecision(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    if (VF.isVector())
      Ty = VectorType::get(Ty, VF);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Given costs for both strategies, return true if the scalar predication
  /// lowering should be used for div/rem. This incorporates an override
  /// option so it is not simply a cost comparison.
  bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
                                     InstructionCost SafeDivisorCost) const {
    switch (ForceSafeDivisor) {
    case cl::BOU_UNSET:
      return ScalarCost < SafeDivisorCost;
    case cl::BOU_TRUE:
      return false;
    case cl::BOU_FALSE:
      return true;
    }
    llvm_unreachable("impossible case value");
  }

  /// Returns true if \p I is an instruction which requires predication and
  /// for which our chosen predication strategy is scalarization (i.e. we
  /// don't have an alternate strategy such as masking available).
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  bool isScalarWithPredication(Instruction *I, ElementCount VF) const;

  /// Returns true if \p I is an instruction that needs to be predicated
  /// at runtime. The result is independent of the predication mechanism.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) const;

  /// Return the costs for our two available strategies for lowering a
  /// div/rem operation which requires speculating at least one lane.
  /// First result is for scalarization (will be invalid for scalable
  /// vectors); second is for the safe-divisor strategy.
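  /// (Illustrative note: the safe-divisor strategy conceptually rewrites the
  /// divisor as select(mask, divisor, 1) so the vector div/rem can execute
  /// unconditionally, whereas scalarization emits a guarded scalar div/rem per
  /// lane.)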
1337 std::pair<InstructionCost, InstructionCost> 1338 getDivRemSpeculationCost(Instruction *I, 1339 ElementCount VF) const; 1340 1341 /// Returns true if \p I is a memory instruction with consecutive memory 1342 /// access that can be widened. 1343 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1344 1345 /// Returns true if \p I is a memory instruction in an interleaved-group 1346 /// of memory accesses that can be vectorized with wide vector loads/stores 1347 /// and shuffles. 1348 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const; 1349 1350 /// Check if \p Instr belongs to any interleaved access group. 1351 bool isAccessInterleaved(Instruction *Instr) const { 1352 return InterleaveInfo.isInterleaved(Instr); 1353 } 1354 1355 /// Get the interleaved access group that \p Instr belongs to. 1356 const InterleaveGroup<Instruction> * 1357 getInterleavedAccessGroup(Instruction *Instr) const { 1358 return InterleaveInfo.getInterleaveGroup(Instr); 1359 } 1360 1361 /// Returns true if we're required to use a scalar epilogue for at least 1362 /// the final iteration of the original loop. 1363 bool requiresScalarEpilogue(bool IsVectorizing) const { 1364 if (!isScalarEpilogueAllowed()) { 1365 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); 1366 return false; 1367 } 1368 // If we might exit from anywhere but the latch and early exit vectorization 1369 // is disabled, we must run the exiting iteration in scalar form. 1370 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && 1371 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) { 1372 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting " 1373 "from latch block\n"); 1374 return true; 1375 } 1376 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { 1377 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " 1378 "interleaved group requires scalar epilogue\n"); 1379 return true; 1380 } 1381 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); 1382 return false; 1383 } 1384 1385 /// Returns true if we're required to use a scalar epilogue for at least 1386 /// the final iteration of the original loop for all VFs in \p Range. 1387 /// A scalar epilogue must either be required for all VFs in \p Range or for 1388 /// none. 1389 bool requiresScalarEpilogue(VFRange Range) const { 1390 auto RequiresScalarEpilogue = [this](ElementCount VF) { 1391 return requiresScalarEpilogue(VF.isVector()); 1392 }; 1393 bool IsRequired = all_of(Range, RequiresScalarEpilogue); 1394 assert( 1395 (IsRequired || none_of(Range, RequiresScalarEpilogue)) && 1396 "all VFs in range must agree on whether a scalar epilogue is required"); 1397 return IsRequired; 1398 } 1399 1400 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1401 /// loop hint annotation. 1402 bool isScalarEpilogueAllowed() const { 1403 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1404 } 1405 1406 /// Returns the TailFoldingStyle that is best for the current loop. 1407 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { 1408 if (!ChosenTailFoldingStyle) 1409 return TailFoldingStyle::None; 1410 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first 1411 : ChosenTailFoldingStyle->second; 1412 } 1413 1414 /// Selects and saves TailFoldingStyle for 2 options - if IV update may 1415 /// overflow or not. 1416 /// \param IsScalableVF true if scalable vector factors enabled. 
1417 /// \param UserIC User specific interleave count. 1418 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) { 1419 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet."); 1420 if (!Legal->canFoldTailByMasking()) { 1421 ChosenTailFoldingStyle = 1422 std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); 1423 return; 1424 } 1425 1426 if (!ForceTailFoldingStyle.getNumOccurrences()) { 1427 ChosenTailFoldingStyle = std::make_pair( 1428 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true), 1429 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)); 1430 return; 1431 } 1432 1433 // Set styles when forced. 1434 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(), 1435 ForceTailFoldingStyle.getValue()); 1436 if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) 1437 return; 1438 // Override forced styles if needed. 1439 // FIXME: use actual opcode/data type for analysis here. 1440 // FIXME: Investigate opportunity for fixed vector factor. 1441 // FIXME: support fixed-order recurrences by fixing splice of non VFxUF 1442 // penultimate EVL. 1443 bool EVLIsLegal = 1444 UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) && 1445 !EnableVPlanNativePath && Legal->getFixedOrderRecurrences().empty(); 1446 if (!EVLIsLegal) { 1447 // If for some reason EVL mode is unsupported, fallback to 1448 // DataWithoutLaneMask to try to vectorize the loop with folded tail 1449 // in a generic way. 1450 ChosenTailFoldingStyle = 1451 std::make_pair(TailFoldingStyle::DataWithoutLaneMask, 1452 TailFoldingStyle::DataWithoutLaneMask); 1453 LLVM_DEBUG( 1454 dbgs() 1455 << "LV: Preference for VP intrinsics indicated. Will " 1456 "not try to generate VP Intrinsics " 1457 << (UserIC > 1 1458 ? "since interleave count specified is greater than 1.\n" 1459 : "due to non-interleaving reasons.\n")); 1460 } 1461 } 1462 1463 /// Returns true if all loop blocks should be masked to fold tail loop. 1464 bool foldTailByMasking() const { 1465 // TODO: check if it is possible to check for None style independent of 1466 // IVUpdateMayOverflow flag in getTailFoldingStyle. 1467 return getTailFoldingStyle() != TailFoldingStyle::None; 1468 } 1469 1470 /// Return maximum safe number of elements to be processed per vector 1471 /// iteration, which do not prevent store-load forwarding and are safe with 1472 /// regard to the memory dependencies. Required for EVL-based VPlans to 1473 /// correctly calculate AVL (application vector length) as min(remaining AVL, 1474 /// MaxSafeElements). 1475 /// TODO: need to consider adjusting cost model to use this value as a 1476 /// vectorization factor for EVL-based vectorization. 1477 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; } 1478 1479 /// Returns true if the instructions in this block requires predication 1480 /// for any reason, e.g. because tail folding now requires a predicate 1481 /// or because the block in the original loop was predicated. 1482 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1483 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1484 } 1485 1486 /// Returns true if VP intrinsics with explicit vector length support should 1487 /// be generated in the tail folded loop. 1488 bool foldTailWithEVL() const { 1489 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL; 1490 } 1491 1492 /// Returns true if the Phi is part of an inloop reduction. 
1493 bool isInLoopReduction(PHINode *Phi) const {
1494 return InLoopReductions.contains(Phi);
1495 }
1496
1497 /// Returns true if the predicated reduction select should be used to set the
1498 /// incoming value for the reduction phi.
1499 bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1500 // Force to use predicated reduction select since the EVL of the
1501 // second-to-last iteration might not be VF*UF.
1502 if (foldTailWithEVL())
1503 return true;
1504 return PreferPredicatedReductionSelect ||
1505 TTI.preferPredicatedReductionSelect(
1506 Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1507 }
1508
1509 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1510 /// with factor VF. Return the cost of the instruction, including
1511 /// scalarization overhead if it's needed.
1512 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1513
1514 /// Estimate cost of a call instruction CI if it were vectorized with factor
1515 /// VF. Return the cost of the instruction, including scalarization overhead
1516 /// if it's needed.
1517 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1518
1519 /// Invalidates decisions already taken by the cost model.
1520 void invalidateCostModelingDecisions() {
1521 WideningDecisions.clear();
1522 CallWideningDecisions.clear();
1523 Uniforms.clear();
1524 Scalars.clear();
1525 }
1526
1527 /// Returns the expected execution cost. The unit of the cost does
1528 /// not matter because we use the 'cost' units to compare different
1529 /// vector widths. The cost that is returned is *not* normalized by
1530 /// the factor width.
1531 InstructionCost expectedCost(ElementCount VF);
1532
1533 bool hasPredStores() const { return NumPredStores > 0; }
1534
1535 /// Returns true if epilogue vectorization is considered profitable, and
1536 /// false otherwise.
1537 /// \p VF is the vectorization factor chosen for the original loop.
1538 /// \p IC is an additional scaling factor applied to VF before
1539 /// comparing to EpilogueVectorizationMinVF.
1540 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1541 const unsigned IC) const;
1542
1543 /// Returns the execution time cost of an instruction for a given vector
1544 /// width. Vector width of one means scalar.
1545 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1546
1547 /// Return the cost of instructions in an inloop reduction pattern, if I is
1548 /// part of that pattern.
1549 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1550 ElementCount VF,
1551 Type *VectorTy) const;
1552
1553 /// Returns true if \p Op should be considered invariant and if it is
1554 /// trivially hoistable.
1555 bool shouldConsiderInvariant(Value *Op);
1556
1557 private:
1558 unsigned NumPredStores = 0;
1559
1560 /// \return An upper bound for the vectorization factors for both
1561 /// fixed and scalable vectorization, where the minimum-known number of
1562 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1563 /// disabled or unsupported, then the scalable part will be equal to
1564 /// ElementCount::getScalable(0).
1565 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1566 ElementCount UserVF,
1567 bool FoldTailByMasking);
1568
1569 /// \return the maximized element count based on the target's vector
1570 /// registers and the loop trip-count, but limited to a maximum safe VF.
1571 /// This is a helper function of computeFeasibleMaxVF.
1572 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1573 unsigned SmallestType,
1574 unsigned WidestType,
1575 ElementCount MaxSafeVF,
1576 bool FoldTailByMasking);
1577
1578 /// Checks if scalable vectorization is supported and enabled. Caches the
1579 /// result to avoid repeated debug dumps for repeated queries.
1580 bool isScalableVectorizationAllowed();
1581
1582 /// \return the maximum legal scalable VF, based on the safe max number
1583 /// of elements.
1584 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1585
1586 /// Calculate vectorization cost of memory instruction \p I.
1587 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1588
1589 /// The cost computation for scalarized memory instruction.
1590 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1591
1592 /// The cost computation for interleaving group of memory instructions.
1593 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1594
1595 /// The cost computation for Gather/Scatter instruction.
1596 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1597
1598 /// The cost computation for widening instruction \p I with consecutive
1599 /// memory access.
1600 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1601
1602 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1603 /// Load: scalar load + broadcast.
1604 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1605 /// element)
1606 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1607
1608 /// Estimate the overhead of scalarizing an instruction. This is a
1609 /// convenience wrapper for the type-based getScalarizationOverhead API.
1610 InstructionCost getScalarizationOverhead(Instruction *I,
1611 ElementCount VF) const;
1612
1613 /// Returns true if an artificially high cost for emulated masked memrefs
1614 /// should be used.
1615 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1616
1617 /// Map of scalar integer values to the smallest bitwidth they can be legally
1618 /// represented as. The vector equivalents of these values should be truncated
1619 /// to this type.
1620 MapVector<Instruction *, uint64_t> MinBWs;
1621
1622 /// A type representing the costs for instructions if they were to be
1623 /// scalarized rather than vectorized. The entries are Instruction-Cost
1624 /// pairs.
1625 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1626
1627 /// A set, per vectorization factor, of the BasicBlocks that are known to be
1628 /// present after vectorization as predicated blocks.
1629 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1630 PredicatedBBsAfterVectorization;
1631
1632 /// Records whether it is allowed to have the original scalar loop execute at
1633 /// least once. This may be needed as a fallback loop in case runtime
1634 /// aliasing/dependence checks fail, or to handle the tail/remainder
1635 /// iterations when the trip count is unknown or doesn't divide by the VF,
1636 /// or as a peel-loop to handle gaps in interleave-groups.
1637 /// Under optsize and when the trip count is very small we don't allow any
1638 /// iterations to execute in the scalar loop.
1639 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1640
1641 /// The finally chosen tail folding style. The first element is used if the
1642 /// IV update may overflow, the second element if it does not.
1643 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1644 ChosenTailFoldingStyle;
1645
1646 /// true if scalable vectorization is supported and enabled.
1647 std::optional<bool> IsScalableVectorizationAllowed;
1648
1649 /// Maximum safe number of elements to be processed per vector iteration,
1650 /// which do not prevent store-load forwarding and are safe with regard to the
1651 /// memory dependencies. Required for EVL-based vectorization, where this
1652 /// value is used as the upper bound of the safe AVL.
1653 std::optional<unsigned> MaxSafeElements;
1654
1655 /// A map holding scalar costs for different vectorization factors. The
1656 /// presence of a cost for an instruction in the mapping indicates that the
1657 /// instruction will be scalarized when vectorizing with the associated
1658 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1659 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1660
1661 /// Holds the instructions known to be uniform after vectorization.
1662 /// The data is collected per VF.
1663 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1664
1665 /// Holds the instructions known to be scalar after vectorization.
1666 /// The data is collected per VF.
1667 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1668
1669 /// Holds the instructions (address computations) that are forced to be
1670 /// scalarized.
1671 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1672
1673 /// PHINodes of the reductions that should be expanded in-loop.
1674 SmallPtrSet<PHINode *, 4> InLoopReductions;
1675
1676 /// A map of inloop reduction operations and their immediate chain operand.
1677 /// FIXME: This can be removed once reductions can be costed correctly in
1678 /// VPlan. This was added to allow quick lookup of the inloop operations.
1679 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1680
1681 /// Returns the expected difference in cost from scalarizing the expression
1682 /// feeding a predicated instruction \p PredInst. The instructions to
1683 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1684 /// non-negative return value implies the expression will be scalarized.
1685 /// Currently, only single-use chains are considered for scalarization.
1686 InstructionCost computePredInstDiscount(Instruction *PredInst,
1687 ScalarCostsTy &ScalarCosts,
1688 ElementCount VF);
1689
1690 /// Collect the instructions that are uniform after vectorization. An
1691 /// instruction is uniform if we represent it with a single scalar value in
1692 /// the vectorized loop corresponding to each vector iteration. Examples of
1693 /// uniform instructions include pointer operands of consecutive or
1694 /// interleaved memory accesses. Note that although uniformity implies an
1695 /// instruction will be scalar, the reverse is not true. In general, a
1696 /// scalarized instruction will be represented by VF scalar values in the
1697 /// vectorized loop, each corresponding to an iteration of the original
1698 /// scalar loop.
1699 void collectLoopUniforms(ElementCount VF);
1700
1701 /// Collect the instructions that are scalar after vectorization. An
1702 /// instruction is scalar if it is known to be uniform or will be scalarized
1703 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1704 /// to the list if they are used by a load/store instruction that is marked as
1705 /// CM_Scalarize.
Non-uniform scalarized instructions will be represented by 1706 /// VF values in the vectorized loop, each corresponding to an iteration of 1707 /// the original scalar loop. 1708 void collectLoopScalars(ElementCount VF); 1709 1710 /// Keeps cost model vectorization decision and cost for instructions. 1711 /// Right now it is used for memory instructions only. 1712 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1713 std::pair<InstWidening, InstructionCost>>; 1714 1715 DecisionList WideningDecisions; 1716 1717 using CallDecisionList = 1718 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; 1719 1720 CallDecisionList CallWideningDecisions; 1721 1722 /// Returns true if \p V is expected to be vectorized and it needs to be 1723 /// extracted. 1724 bool needsExtract(Value *V, ElementCount VF) const { 1725 Instruction *I = dyn_cast<Instruction>(V); 1726 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1727 TheLoop->isLoopInvariant(I) || 1728 getWideningDecision(I, VF) == CM_Scalarize) 1729 return false; 1730 1731 // Assume we can vectorize V (and hence we need extraction) if the 1732 // scalars are not computed yet. This can happen, because it is called 1733 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1734 // the scalars are collected. That should be a safe assumption in most 1735 // cases, because we check if the operands have vectorizable types 1736 // beforehand in LoopVectorizationLegality. 1737 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); 1738 }; 1739 1740 /// Returns a range containing only operands needing to be extracted. 1741 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1742 ElementCount VF) const { 1743 return SmallVector<Value *, 4>(make_filter_range( 1744 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1745 } 1746 1747 public: 1748 /// The loop that we evaluate. 1749 Loop *TheLoop; 1750 1751 /// Predicated scalar evolution analysis. 1752 PredicatedScalarEvolution &PSE; 1753 1754 /// Loop Info analysis. 1755 LoopInfo *LI; 1756 1757 /// Vectorization legality. 1758 LoopVectorizationLegality *Legal; 1759 1760 /// Vector target information. 1761 const TargetTransformInfo &TTI; 1762 1763 /// Target Library Info. 1764 const TargetLibraryInfo *TLI; 1765 1766 /// Demanded bits analysis. 1767 DemandedBits *DB; 1768 1769 /// Assumption cache. 1770 AssumptionCache *AC; 1771 1772 /// Interface to emit optimization remarks. 1773 OptimizationRemarkEmitter *ORE; 1774 1775 const Function *TheFunction; 1776 1777 /// Loop Vectorize Hint. 1778 const LoopVectorizeHints *Hints; 1779 1780 /// The interleave access information contains groups of interleaved accesses 1781 /// with the same stride and close to each other. 1782 InterleavedAccessInfo &InterleaveInfo; 1783 1784 /// Values to ignore in the cost model. 1785 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1786 1787 /// Values to ignore in the cost model when VF > 1. 1788 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1789 1790 /// All element types found in the loop. 1791 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1792 1793 /// The kind of cost that we are calculating 1794 TTI::TargetCostKind CostKind; 1795 }; 1796 } // end namespace llvm 1797 1798 namespace { 1799 /// Helper struct to manage generating runtime checks for vectorization. 1800 /// 1801 /// The runtime checks are created up-front in temporary blocks to allow better 1802 /// estimating the cost and un-linked from the existing IR. 
After deciding to 1803 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1804 /// temporary blocks are completely removed. 1805 class GeneratedRTChecks { 1806 /// Basic block which contains the generated SCEV checks, if any. 1807 BasicBlock *SCEVCheckBlock = nullptr; 1808 1809 /// The value representing the result of the generated SCEV checks. If it is 1810 /// nullptr, either no SCEV checks have been generated or they have been used. 1811 Value *SCEVCheckCond = nullptr; 1812 1813 /// Basic block which contains the generated memory runtime checks, if any. 1814 BasicBlock *MemCheckBlock = nullptr; 1815 1816 /// The value representing the result of the generated memory runtime checks. 1817 /// If it is nullptr, either no memory runtime checks have been generated or 1818 /// they have been used. 1819 Value *MemRuntimeCheckCond = nullptr; 1820 1821 DominatorTree *DT; 1822 LoopInfo *LI; 1823 TargetTransformInfo *TTI; 1824 1825 SCEVExpander SCEVExp; 1826 SCEVExpander MemCheckExp; 1827 1828 bool CostTooHigh = false; 1829 const bool AddBranchWeights; 1830 1831 Loop *OuterLoop = nullptr; 1832 1833 PredicatedScalarEvolution &PSE; 1834 1835 /// The kind of cost that we are calculating 1836 TTI::TargetCostKind CostKind; 1837 1838 public: 1839 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, 1840 LoopInfo *LI, TargetTransformInfo *TTI, 1841 const DataLayout &DL, bool AddBranchWeights, 1842 TTI::TargetCostKind CostKind) 1843 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), 1844 MemCheckExp(*PSE.getSE(), DL, "scev.check"), 1845 AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {} 1846 1847 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1848 /// accurately estimate the cost of the runtime checks. The blocks are 1849 /// un-linked from the IR and are added back during vector code generation. If 1850 /// there is no vector code generation, the check blocks are removed 1851 /// completely. 1852 void create(Loop *L, const LoopAccessInfo &LAI, 1853 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1854 1855 // Hard cutoff to limit compile-time increase in case a very large number of 1856 // runtime checks needs to be generated. 1857 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1858 // profile info. 1859 CostTooHigh = 1860 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1861 if (CostTooHigh) 1862 return; 1863 1864 BasicBlock *LoopHeader = L->getHeader(); 1865 BasicBlock *Preheader = L->getLoopPreheader(); 1866 1867 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1868 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1869 // may be used by SCEVExpander. The blocks will be un-linked from their 1870 // predecessors and removed from LI & DT at the end of the function. 1871 if (!UnionPred.isAlwaysTrue()) { 1872 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1873 nullptr, "vector.scevcheck"); 1874 1875 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1876 &UnionPred, SCEVCheckBlock->getTerminator()); 1877 } 1878 1879 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1880 if (RtPtrChecking.Need) { 1881 auto *Pred = SCEVCheckBlock ? 
SCEVCheckBlock : Preheader; 1882 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1883 "vector.memcheck"); 1884 1885 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1886 if (DiffChecks) { 1887 Value *RuntimeVF = nullptr; 1888 MemRuntimeCheckCond = addDiffRuntimeChecks( 1889 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 1890 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 1891 if (!RuntimeVF) 1892 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 1893 return RuntimeVF; 1894 }, 1895 IC); 1896 } else { 1897 MemRuntimeCheckCond = addRuntimeChecks( 1898 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), 1899 MemCheckExp, VectorizerParams::HoistRuntimeChecks); 1900 } 1901 assert(MemRuntimeCheckCond && 1902 "no RT checks generated although RtPtrChecking " 1903 "claimed checks are required"); 1904 } 1905 1906 if (!MemCheckBlock && !SCEVCheckBlock) 1907 return; 1908 1909 // Unhook the temporary block with the checks, update various places 1910 // accordingly. 1911 if (SCEVCheckBlock) 1912 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1913 if (MemCheckBlock) 1914 MemCheckBlock->replaceAllUsesWith(Preheader); 1915 1916 if (SCEVCheckBlock) { 1917 SCEVCheckBlock->getTerminator()->moveBefore( 1918 Preheader->getTerminator()->getIterator()); 1919 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1920 Preheader->getTerminator()->eraseFromParent(); 1921 } 1922 if (MemCheckBlock) { 1923 MemCheckBlock->getTerminator()->moveBefore( 1924 Preheader->getTerminator()->getIterator()); 1925 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1926 Preheader->getTerminator()->eraseFromParent(); 1927 } 1928 1929 DT->changeImmediateDominator(LoopHeader, Preheader); 1930 if (MemCheckBlock) { 1931 DT->eraseNode(MemCheckBlock); 1932 LI->removeBlock(MemCheckBlock); 1933 } 1934 if (SCEVCheckBlock) { 1935 DT->eraseNode(SCEVCheckBlock); 1936 LI->removeBlock(SCEVCheckBlock); 1937 } 1938 1939 // Outer loop is used as part of the later cost calculations. 1940 OuterLoop = L->getParentLoop(); 1941 } 1942 1943 InstructionCost getCost() { 1944 if (SCEVCheckBlock || MemCheckBlock) 1945 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 1946 1947 if (CostTooHigh) { 1948 InstructionCost Cost; 1949 Cost.setInvalid(); 1950 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 1951 return Cost; 1952 } 1953 1954 InstructionCost RTCheckCost = 0; 1955 if (SCEVCheckBlock) 1956 for (Instruction &I : *SCEVCheckBlock) { 1957 if (SCEVCheckBlock->getTerminator() == &I) 1958 continue; 1959 InstructionCost C = TTI->getInstructionCost(&I, CostKind); 1960 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1961 RTCheckCost += C; 1962 } 1963 if (MemCheckBlock) { 1964 InstructionCost MemCheckCost = 0; 1965 for (Instruction &I : *MemCheckBlock) { 1966 if (MemCheckBlock->getTerminator() == &I) 1967 continue; 1968 InstructionCost C = TTI->getInstructionCost(&I, CostKind); 1969 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1970 MemCheckCost += C; 1971 } 1972 1973 // If the runtime memory checks are being created inside an outer loop 1974 // we should find out if these checks are outer loop invariant. If so, 1975 // the checks will likely be hoisted out and so the effective cost will 1976 // reduce according to the outer loop trip count. 
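// Illustrative numbers only: with MemCheckCost = 20 and a best known outer
// trip count of 4, the adjustment below yields max(20 / 4, 1) = 5.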
1977 if (OuterLoop) { 1978 ScalarEvolution *SE = MemCheckExp.getSE(); 1979 // TODO: If profitable, we could refine this further by analysing every 1980 // individual memory check, since there could be a mixture of loop 1981 // variant and invariant checks that mean the final condition is 1982 // variant. 1983 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond); 1984 if (SE->isLoopInvariant(Cond, OuterLoop)) { 1985 // It seems reasonable to assume that we can reduce the effective 1986 // cost of the checks even when we know nothing about the trip 1987 // count. Assume that the outer loop executes at least twice. 1988 unsigned BestTripCount = 2; 1989 1990 // Get the best known TC estimate. 1991 if (auto EstimatedTC = getSmallBestKnownTC( 1992 PSE, OuterLoop, /* CanUseConstantMax = */ false)) 1993 BestTripCount = *EstimatedTC; 1994 1995 BestTripCount = std::max(BestTripCount, 1U); 1996 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; 1997 1998 // Let's ensure the cost is always at least 1. 1999 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), 2000 (InstructionCost::CostType)1); 2001 2002 if (BestTripCount > 1) 2003 LLVM_DEBUG(dbgs() 2004 << "We expect runtime memory checks to be hoisted " 2005 << "out of the outer loop. Cost reduced from " 2006 << MemCheckCost << " to " << NewMemCheckCost << '\n'); 2007 2008 MemCheckCost = NewMemCheckCost; 2009 } 2010 } 2011 2012 RTCheckCost += MemCheckCost; 2013 } 2014 2015 if (SCEVCheckBlock || MemCheckBlock) 2016 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2017 << "\n"); 2018 2019 return RTCheckCost; 2020 } 2021 2022 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2023 /// unused. 2024 ~GeneratedRTChecks() { 2025 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2026 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2027 if (!SCEVCheckCond) 2028 SCEVCleaner.markResultUsed(); 2029 2030 if (!MemRuntimeCheckCond) 2031 MemCheckCleaner.markResultUsed(); 2032 2033 if (MemRuntimeCheckCond) { 2034 auto &SE = *MemCheckExp.getSE(); 2035 // Memory runtime check generation creates compares that use expanded 2036 // values. Remove them before running the SCEVExpanderCleaners. 2037 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2038 if (MemCheckExp.isInsertedInstruction(&I)) 2039 continue; 2040 SE.forgetValue(&I); 2041 I.eraseFromParent(); 2042 } 2043 } 2044 MemCheckCleaner.cleanup(); 2045 SCEVCleaner.cleanup(); 2046 2047 if (SCEVCheckCond) 2048 SCEVCheckBlock->eraseFromParent(); 2049 if (MemRuntimeCheckCond) 2050 MemCheckBlock->eraseFromParent(); 2051 } 2052 2053 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2054 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2055 /// depending on the generated condition. 2056 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2057 BasicBlock *LoopVectorPreHeader) { 2058 if (!SCEVCheckCond) 2059 return nullptr; 2060 2061 Value *Cond = SCEVCheckCond; 2062 // Mark the check as used, to prevent it from being removed during cleanup. 2063 SCEVCheckCond = nullptr; 2064 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2065 if (C->isZero()) 2066 return nullptr; 2067 2068 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2069 2070 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2071 // Create new preheader for vector loop. 
2072 if (OuterLoop) 2073 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2074 2075 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2076 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2077 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2078 SCEVCheckBlock); 2079 2080 DT->addNewBlock(SCEVCheckBlock, Pred); 2081 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2082 2083 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); 2084 if (AddBranchWeights) 2085 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false); 2086 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); 2087 return SCEVCheckBlock; 2088 } 2089 2090 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2091 /// the branches to branch to the vector preheader or \p Bypass, depending on 2092 /// the generated condition. 2093 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2094 BasicBlock *LoopVectorPreHeader) { 2095 // Check if we generated code that checks in runtime if arrays overlap. 2096 if (!MemRuntimeCheckCond) 2097 return nullptr; 2098 2099 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2100 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2101 MemCheckBlock); 2102 2103 DT->addNewBlock(MemCheckBlock, Pred); 2104 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2105 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2106 2107 if (OuterLoop) 2108 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); 2109 2110 BranchInst &BI = 2111 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); 2112 if (AddBranchWeights) { 2113 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false); 2114 } 2115 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); 2116 MemCheckBlock->getTerminator()->setDebugLoc( 2117 Pred->getTerminator()->getDebugLoc()); 2118 2119 // Mark the check as used, to prevent it from being removed during cleanup. 2120 MemRuntimeCheckCond = nullptr; 2121 return MemCheckBlock; 2122 } 2123 }; 2124 } // namespace 2125 2126 static bool useActiveLaneMask(TailFoldingStyle Style) { 2127 return Style == TailFoldingStyle::Data || 2128 Style == TailFoldingStyle::DataAndControlFlow || 2129 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2130 } 2131 2132 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2133 return Style == TailFoldingStyle::DataAndControlFlow || 2134 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2135 } 2136 2137 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2138 // vectorization. The loop needs to be annotated with #pragma omp simd 2139 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2140 // vector length information is not provided, vectorization is not considered 2141 // explicit. Interleave hints are not allowed either. These limitations will be 2142 // relaxed in the future. 2143 // Please, note that we are currently forced to abuse the pragma 'clang 2144 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2145 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2146 // provides *explicit vectorization hints* (LV can bypass legal checks and 2147 // assume that vectorization is legal). However, both hints are implemented 2148 // using the same metadata (llvm.loop.vectorize, processed by 2149 // LoopVectorizeHints). 
This will be fixed in the future when the native IR 2150 // representation for pragma 'omp simd' is introduced. 2151 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2152 OptimizationRemarkEmitter *ORE) { 2153 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2154 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2155 2156 // Only outer loops with an explicit vectorization hint are supported. 2157 // Unannotated outer loops are ignored. 2158 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2159 return false; 2160 2161 Function *Fn = OuterLp->getHeader()->getParent(); 2162 if (!Hints.allowVectorization(Fn, OuterLp, 2163 true /*VectorizeOnlyWhenForced*/)) { 2164 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2165 return false; 2166 } 2167 2168 if (Hints.getInterleave() > 1) { 2169 // TODO: Interleave support is future work. 2170 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2171 "outer loops.\n"); 2172 Hints.emitRemarkWithHints(); 2173 return false; 2174 } 2175 2176 return true; 2177 } 2178 2179 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2180 OptimizationRemarkEmitter *ORE, 2181 SmallVectorImpl<Loop *> &V) { 2182 // Collect inner loops and outer loops without irreducible control flow. For 2183 // now, only collect outer loops that have explicit vectorization hints. If we 2184 // are stress testing the VPlan H-CFG construction, we collect the outermost 2185 // loop of every loop nest. 2186 if (L.isInnermost() || VPlanBuildStressTest || 2187 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2188 LoopBlocksRPO RPOT(&L); 2189 RPOT.perform(LI); 2190 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2191 V.push_back(&L); 2192 // TODO: Collect inner loops inside marked outer loops in case 2193 // vectorization fails for the outer loop. Do not invoke 2194 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2195 // already known to be reducible. We can use an inherited attribute for 2196 // that. 2197 return; 2198 } 2199 } 2200 for (Loop *InnerL : L) 2201 collectSupportedLoops(*InnerL, LI, ORE, V); 2202 } 2203 2204 //===----------------------------------------------------------------------===// 2205 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2206 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2207 //===----------------------------------------------------------------------===// 2208 2209 /// Compute the transformed value of Index at offset StartValue using step 2210 /// StepValue. 2211 /// For integer induction, returns StartValue + Index * StepValue. 2212 /// For pointer induction, returns StartValue[Index * StepValue]. 2213 /// FIXME: The newly created binary instructions should contain nsw/nuw 2214 /// flags, which can be found from the original scalar operations. 2215 static Value * 2216 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, 2217 Value *Step, 2218 InductionDescriptor::InductionKind InductionKind, 2219 const BinaryOperator *InductionBinOp) { 2220 Type *StepTy = Step->getType(); 2221 Value *CastedIndex = StepTy->isIntegerTy() 2222 ? B.CreateSExtOrTrunc(Index, StepTy) 2223 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2224 if (CastedIndex != Index) { 2225 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2226 Index = CastedIndex; 2227 } 2228 2229 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2230 // SCEV and then expand it, hoping that SCEV's simplification will give us 2231 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2232 // lead to various SCEV crashes. So all we can do is to use builder and rely 2233 // on InstCombine for future simplifications. Here we handle some trivial 2234 // cases only. 2235 auto CreateAdd = [&B](Value *X, Value *Y) { 2236 assert(X->getType() == Y->getType() && "Types don't match!"); 2237 if (auto *CX = dyn_cast<ConstantInt>(X)) 2238 if (CX->isZero()) 2239 return Y; 2240 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2241 if (CY->isZero()) 2242 return X; 2243 return B.CreateAdd(X, Y); 2244 }; 2245 2246 // We allow X to be a vector type, in which case Y will potentially be 2247 // splatted into a vector with the same element count. 2248 auto CreateMul = [&B](Value *X, Value *Y) { 2249 assert(X->getType()->getScalarType() == Y->getType() && 2250 "Types don't match!"); 2251 if (auto *CX = dyn_cast<ConstantInt>(X)) 2252 if (CX->isOne()) 2253 return Y; 2254 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2255 if (CY->isOne()) 2256 return X; 2257 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2258 if (XVTy && !isa<VectorType>(Y->getType())) 2259 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2260 return B.CreateMul(X, Y); 2261 }; 2262 2263 switch (InductionKind) { 2264 case InductionDescriptor::IK_IntInduction: { 2265 assert(!isa<VectorType>(Index->getType()) && 2266 "Vector indices not supported for integer inductions yet"); 2267 assert(Index->getType() == StartValue->getType() && 2268 "Index type does not match StartValue type"); 2269 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2270 return B.CreateSub(StartValue, Index); 2271 auto *Offset = CreateMul(Index, Step); 2272 return CreateAdd(StartValue, Offset); 2273 } 2274 case InductionDescriptor::IK_PtrInduction: 2275 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step)); 2276 case InductionDescriptor::IK_FpInduction: { 2277 assert(!isa<VectorType>(Index->getType()) && 2278 "Vector indices not supported for FP inductions yet"); 2279 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2280 assert(InductionBinOp && 2281 (InductionBinOp->getOpcode() == Instruction::FAdd || 2282 InductionBinOp->getOpcode() == Instruction::FSub) && 2283 "Original bin op should be defined for FP induction"); 2284 2285 Value *MulExp = B.CreateFMul(Step, Index); 2286 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2287 "induction"); 2288 } 2289 case InductionDescriptor::IK_NoInduction: 2290 return nullptr; 2291 } 2292 llvm_unreachable("invalid enum"); 2293 } 2294 2295 std::optional<unsigned> getMaxVScale(const Function &F, 2296 const TargetTransformInfo &TTI) { 2297 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) 2298 return MaxVScale; 2299 2300 if (F.hasFnAttribute(Attribute::VScaleRange)) 2301 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2302 2303 return std::nullopt; 2304 } 2305 2306 /// For the given VF and UF and maximum trip count computed for the loop, return 2307 /// whether the induction variable might overflow in the vectorized loop. If not, 2308 /// then we know a runtime overflow check always evaluates to false and can be 2309 /// removed. 
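/// Illustrative example (hypothetical numbers): with an i8 widest induction
/// type, a known maximum trip count of 200 and VF * UF = 32, we have
/// 255 - 200 = 55 > 32, so the wide IV cannot wrap and the overflow check is
/// known to be false.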
2310 static bool isIndvarOverflowCheckKnownFalse( 2311 const LoopVectorizationCostModel *Cost, 2312 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2313 // Always be conservative if we don't know the exact unroll factor. 2314 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); 2315 2316 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2317 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2318 2319 // We know the runtime overflow check is known false iff the (max) trip-count 2320 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2321 // the vector loop induction variable. 2322 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) { 2323 uint64_t MaxVF = VF.getKnownMinValue(); 2324 if (VF.isScalable()) { 2325 std::optional<unsigned> MaxVScale = 2326 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2327 if (!MaxVScale) 2328 return false; 2329 MaxVF *= *MaxVScale; 2330 } 2331 2332 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2333 } 2334 2335 return false; 2336 } 2337 2338 // Return whether we allow using masked interleave-groups (for dealing with 2339 // strided loads/stores that reside in predicated blocks, or for dealing 2340 // with gaps). 2341 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2342 // If an override option has been passed in for interleaved accesses, use it. 2343 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2344 return EnableMaskedInterleavedMemAccesses; 2345 2346 return TTI.enableMaskedInterleavedAccessVectorization(); 2347 } 2348 2349 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2350 VPReplicateRecipe *RepRecipe, 2351 const VPLane &Lane, 2352 VPTransformState &State) { 2353 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2354 2355 // Does this instruction return a value ? 2356 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2357 2358 Instruction *Cloned = Instr->clone(); 2359 if (!IsVoidRetTy) { 2360 Cloned->setName(Instr->getName() + ".cloned"); 2361 #if !defined(NDEBUG) 2362 // Verify that VPlan type inference results agree with the type of the 2363 // generated values. 2364 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && 2365 "inferred type and type from generated instructions do not match"); 2366 #endif 2367 } 2368 2369 RepRecipe->setFlags(Cloned); 2370 2371 if (auto DL = Instr->getDebugLoc()) 2372 State.setDebugLocFrom(DL); 2373 2374 // Replace the operands of the cloned instructions with their scalar 2375 // equivalents in the new loop. 2376 for (const auto &I : enumerate(RepRecipe->operands())) { 2377 auto InputLane = Lane; 2378 VPValue *Operand = I.value(); 2379 if (vputils::isUniformAfterVectorization(Operand)) 2380 InputLane = VPLane::getFirstLane(); 2381 Cloned->setOperand(I.index(), State.get(Operand, InputLane)); 2382 } 2383 State.addNewMetadata(Cloned, Instr); 2384 2385 // Place the cloned scalar in the new loop. 2386 State.Builder.Insert(Cloned); 2387 2388 State.set(RepRecipe, Cloned, Lane); 2389 2390 // If we just cloned a new assumption, add it the assumption cache. 2391 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2392 AC->registerAssumption(II); 2393 2394 // End if-block. 2395 VPRegionBlock *Parent = RepRecipe->getParent()->getParent(); 2396 bool IfPredicateInstr = Parent ? 
Parent->isReplicator() : false; 2397 assert( 2398 (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || 2399 all_of(RepRecipe->operands(), 2400 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && 2401 "Expected a recipe is either within a region or all of its operands " 2402 "are defined outside the vectorized region."); 2403 if (IfPredicateInstr) 2404 PredicatedInstructions.push_back(Cloned); 2405 } 2406 2407 Value * 2408 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2409 if (VectorTripCount) 2410 return VectorTripCount; 2411 2412 Value *TC = getTripCount(); 2413 IRBuilder<> Builder(InsertBlock->getTerminator()); 2414 2415 Type *Ty = TC->getType(); 2416 // This is where we can make the step a runtime constant. 2417 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2418 2419 // If the tail is to be folded by masking, round the number of iterations N 2420 // up to a multiple of Step instead of rounding down. This is done by first 2421 // adding Step-1 and then rounding down. Note that it's ok if this addition 2422 // overflows: the vector induction variable will eventually wrap to zero given 2423 // that it starts at zero and its Step is a power of two; the loop will then 2424 // exit, with the last early-exit vector comparison also producing all-true. 2425 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2426 // is accounted for in emitIterationCountCheck that adds an overflow check. 2427 if (Cost->foldTailByMasking()) { 2428 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2429 "VF*UF must be a power of 2 when folding tail by masking"); 2430 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)), 2431 "n.rnd.up"); 2432 } 2433 2434 // Now we need to generate the expression for the part of the loop that the 2435 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2436 // iterations are not required for correctness, or N - Step, otherwise. Step 2437 // is equal to the vectorization factor (number of SIMD elements) times the 2438 // unroll factor (number of SIMD instructions). 2439 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2440 2441 // There are cases where we *must* run at least one iteration in the remainder 2442 // loop. See the cost model for when this can happen. If the step evenly 2443 // divides the trip count, we set the remainder to be equal to the step. If 2444 // the step does not evenly divide the trip count, no adjustment is necessary 2445 // since there will already be scalar iterations. Note that the minimum 2446 // iterations check ensures that N >= Step. 
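// Illustrative numbers only: with TC = 12 and Step = VF * UF = 4, R is
// 12 urem 4 = 0; if a scalar epilogue is required, R is bumped to Step below,
// giving n.vec = 12 - 4 = 8 and leaving 4 iterations for the scalar
// remainder. With TC = 10, R = 2 already leaves scalar iterations and
// n.vec = 8.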
2447 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2448 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2449 R = Builder.CreateSelect(IsZero, Step, R); 2450 } 2451 2452 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2453 2454 return VectorTripCount; 2455 } 2456 2457 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { 2458 VPBlockBase *ScalarPH = Plan.getScalarPreheader(); 2459 VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor(); 2460 if (PreVectorPH->getNumSuccessors() != 1) { 2461 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); 2462 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && 2463 "Unexpected successor"); 2464 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB); 2465 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB); 2466 PreVectorPH = CheckVPIRBB; 2467 } 2468 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); 2469 PreVectorPH->swapSuccessors(); 2470 } 2471 2472 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2473 Value *Count = getTripCount(); 2474 // Reuse existing vector loop preheader for TC checks. 2475 // Note that new preheader block is generated for vector loop. 2476 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2477 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2478 2479 // Generate code to check if the loop's trip count is less than VF * UF, or 2480 // equal to it in case a scalar epilogue is required; this implies that the 2481 // vector trip count is zero. This check also covers the case where adding one 2482 // to the backedge-taken count overflowed leading to an incorrect trip count 2483 // of zero. In this case we will also jump to the scalar loop. 2484 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2485 : ICmpInst::ICMP_ULT; 2486 2487 // If tail is to be folded, vector loop takes care of all iterations. 2488 Type *CountTy = Count->getType(); 2489 Value *CheckMinIters = Builder.getFalse(); 2490 auto CreateStep = [&]() -> Value * { 2491 // Create step with max(MinProTripCount, UF * VF). 2492 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2493 return createStepForVF(Builder, CountTy, VF, UF); 2494 2495 Value *MinProfTC = 2496 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2497 if (!VF.isScalable()) 2498 return MinProfTC; 2499 return Builder.CreateBinaryIntrinsic( 2500 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2501 }; 2502 2503 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2504 if (Style == TailFoldingStyle::None) { 2505 Value *Step = CreateStep(); 2506 ScalarEvolution &SE = *PSE.getSE(); 2507 // TODO: Emit unconditional branch to vector preheader instead of 2508 // conditional branch with known condition. 2509 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop); 2510 // Check if the trip count is < the step. 2511 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) { 2512 // TODO: Ensure step is at most the trip count when determining max VF and 2513 // UF, w/o tail folding. 2514 CheckMinIters = Builder.getTrue(); 2515 } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P), 2516 TripCountSCEV, SE.getSCEV(Step))) { 2517 // Generate the minimum iteration check only if we cannot prove the 2518 // check is known to be true, or known to be false. 
2519 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2520 } // else step known to be < trip count, use CheckMinIters preset to false. 2521 } else if (VF.isScalable() && 2522 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2523 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2524 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2525 // an overflow to zero when updating induction variables and so an 2526 // additional overflow check is required before entering the vector loop. 2527 2528 // Get the maximum unsigned value for the type. 2529 Value *MaxUIntTripCount = 2530 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2531 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2532 2533 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2534 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2535 } 2536 2537 // Create new preheader for vector loop. 2538 LoopVectorPreHeader = 2539 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2540 "vector.ph"); 2541 2542 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2543 DT->getNode(Bypass)->getIDom()) && 2544 "TC check is expected to dominate Bypass"); 2545 2546 BranchInst &BI = 2547 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 2548 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 2549 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); 2550 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 2551 LoopBypassBlocks.push_back(TCCheckBlock); 2552 2553 // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. 2554 introduceCheckBlockInVPlan(TCCheckBlock); 2555 } 2556 2557 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2558 BasicBlock *const SCEVCheckBlock = 2559 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader); 2560 if (!SCEVCheckBlock) 2561 return nullptr; 2562 2563 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2564 (OptForSizeBasedOnProfile && 2565 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2566 "Cannot SCEV check stride or overflow when optimizing for size"); 2567 assert(!LoopBypassBlocks.empty() && 2568 "Should already be a bypass block due to iteration count check"); 2569 LoopBypassBlocks.push_back(SCEVCheckBlock); 2570 AddedSafetyChecks = true; 2571 2572 introduceCheckBlockInVPlan(SCEVCheckBlock); 2573 return SCEVCheckBlock; 2574 } 2575 2576 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2577 // VPlan-native path does not do any analysis for runtime checks currently. 2578 if (EnableVPlanNativePath) 2579 return nullptr; 2580 2581 BasicBlock *const MemCheckBlock = 2582 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2583 2584 // Check if we generated code that checks in runtime if arrays overlap. We put 2585 // the checks into a separate block to make the more common case of few 2586 // elements faster. 
2587 if (!MemCheckBlock) 2588 return nullptr; 2589 2590 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2591 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2592 "Cannot emit memory checks when optimizing for size, unless forced " 2593 "to vectorize."); 2594 ORE->emit([&]() { 2595 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2596 OrigLoop->getStartLoc(), 2597 OrigLoop->getHeader()) 2598 << "Code-size may be reduced by not forcing " 2599 "vectorization, or by source-code modifications " 2600 "eliminating the need for runtime checks " 2601 "(e.g., adding 'restrict')."; 2602 }); 2603 } 2604 2605 LoopBypassBlocks.push_back(MemCheckBlock); 2606 2607 AddedSafetyChecks = true; 2608 2609 introduceCheckBlockInVPlan(MemCheckBlock); 2610 return MemCheckBlock; 2611 } 2612 2613 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p 2614 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must 2615 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All 2616 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. 2617 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { 2618 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); 2619 for (auto &R : make_early_inc_range(*VPBB)) { 2620 assert(!R.isPhi() && "Tried to move phi recipe to end of block"); 2621 R.moveBefore(*IRVPBB, IRVPBB->end()); 2622 } 2623 2624 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); 2625 // VPBB is now dead and will be cleaned up when the plan gets destroyed. 2626 } 2627 2628 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 2629 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2630 assert(LoopVectorPreHeader && "Invalid loop structure"); 2631 assert((OrigLoop->getUniqueLatchExitBlock() || 2632 Cost->requiresScalarEpilogue(VF.isVector())) && 2633 "loops not exiting via the latch without required epilogue?"); 2634 2635 LoopMiddleBlock = 2636 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2637 LI, nullptr, Twine(Prefix) + "middle.block"); 2638 replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock); 2639 LoopScalarPreHeader = 2640 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2641 nullptr, Twine(Prefix) + "scalar.ph"); 2642 replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); 2643 } 2644 2645 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 2646 /// expansion results. 2647 static Value *getExpandedStep(const InductionDescriptor &ID, 2648 const SCEV2ValueTy &ExpandedSCEVs) { 2649 const SCEV *Step = ID.getStep(); 2650 if (auto *C = dyn_cast<SCEVConstant>(Step)) 2651 return C->getValue(); 2652 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 2653 return U->getValue(); 2654 auto I = ExpandedSCEVs.find(Step); 2655 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 2656 return I->second; 2657 } 2658 2659 /// Knowing that loop \p L executes a single vector iteration, add instructions 2660 /// that will get simplified and thus should not have any cost to \p 2661 /// InstsToIgnore. 
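/// For example (illustrative): when the loop runs exactly one vector
/// iteration, the latch compare and any induction increment whose only users
/// are the corresponding phi and that compare will fold away, so their cost
/// should not be counted.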
2662 static void addFullyUnrolledInstructionsToIgnore( 2663 Loop *L, const LoopVectorizationLegality::InductionList &IL, 2664 SmallPtrSetImpl<Instruction *> &InstsToIgnore) { 2665 auto *Cmp = L->getLatchCmpInst(); 2666 if (Cmp) 2667 InstsToIgnore.insert(Cmp); 2668 for (const auto &KV : IL) { 2669 // Extract the key by hand so that it can be used in the lambda below. Note 2670 // that captured structured bindings are a C++20 extension. 2671 const PHINode *IV = KV.first; 2672 2673 // Get next iteration value of the induction variable. 2674 Instruction *IVInst = 2675 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch())); 2676 if (all_of(IVInst->users(), 2677 [&](const User *U) { return U == IV || U == Cmp; })) 2678 InstsToIgnore.insert(IVInst); 2679 } 2680 } 2681 2682 void InnerLoopVectorizer::createInductionAdditionalBypassValues( 2683 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) { 2684 assert(MainVectorTripCount && "Must have bypass information"); 2685 2686 Instruction *OldInduction = Legal->getPrimaryInduction(); 2687 IRBuilder<> BypassBuilder(getAdditionalBypassBlock(), 2688 getAdditionalBypassBlock()->getFirstInsertionPt()); 2689 for (const auto &InductionEntry : Legal->getInductionVars()) { 2690 PHINode *OrigPhi = InductionEntry.first; 2691 const InductionDescriptor &II = InductionEntry.second; 2692 Value *Step = getExpandedStep(II, ExpandedSCEVs); 2693 // For the primary induction the additional bypass end value is known. 2694 // Otherwise it is computed. 2695 Value *EndValueFromAdditionalBypass = MainVectorTripCount; 2696 if (OrigPhi != OldInduction) { 2697 auto *BinOp = II.getInductionBinOp(); 2698 // Fast-math-flags propagate from the original induction instruction. 2699 if (isa_and_nonnull<FPMathOperator>(BinOp)) 2700 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags()); 2701 2702 // Compute the end value for the additional bypass. 2703 EndValueFromAdditionalBypass = 2704 emitTransformedIndex(BypassBuilder, MainVectorTripCount, 2705 II.getStartValue(), Step, II.getKind(), BinOp); 2706 EndValueFromAdditionalBypass->setName("ind.end"); 2707 } 2708 2709 // Store the bypass value here, as it needs to be added as operand to its 2710 // scalar preheader phi node after the epilogue skeleton has been created. 2711 // TODO: Directly add as extra operand to the VPResumePHI recipe. 2712 assert(!Induction2AdditionalBypassValue.contains(OrigPhi) && 2713 "entry for OrigPhi already exits"); 2714 Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass; 2715 } 2716 } 2717 2718 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton( 2719 const SCEV2ValueTy &ExpandedSCEVs) { 2720 /* 2721 In this function we generate a new loop. The new loop will contain 2722 the vectorized instructions while the old loop will continue to run the 2723 scalar remainder. 2724 2725 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 2726 / | preheader are expanded here. Eventually all required SCEV 2727 / | expansion should happen here. 2728 / v 2729 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2730 | / | 2731 | / v 2732 || [ ] <-- vector pre header. 2733 |/ | 2734 | v 2735 | [ ] \ 2736 | [ ]_| <-- vector loop (created during VPlan execution). 2737 | | 2738 | v 2739 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to 2740 | | successors created during VPlan execution) 2741 \/ | 2742 /\ v 2743 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). 
2744 | | 2745 (opt) v <-- edge from middle to exit iff epilogue is not required. 2746 | [ ] \ 2747 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header 2748 | | wrapped in VPIRBasicBlock). 2749 \ | 2750 \ v 2751 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) 2752 ... 2753 */ 2754 2755 // Create an empty vector loop, and prepare basic blocks for the runtime 2756 // checks. 2757 createVectorLoopSkeleton(""); 2758 2759 // Now, compare the new count to zero. If it is zero skip the vector loop and 2760 // jump to the scalar loop. This check also covers the case where the 2761 // backedge-taken count is uint##_max: adding one to it will overflow leading 2762 // to an incorrect trip count of zero. In this (rare) case we will also jump 2763 // to the scalar loop. 2764 emitIterationCountCheck(LoopScalarPreHeader); 2765 2766 // Generate the code to check any assumptions that we've made for SCEV 2767 // expressions. 2768 emitSCEVChecks(LoopScalarPreHeader); 2769 2770 // Generate the code that checks in runtime if arrays overlap. We put the 2771 // checks into a separate block to make the more common case of few elements 2772 // faster. 2773 emitMemRuntimeChecks(LoopScalarPreHeader); 2774 2775 return LoopVectorPreHeader; 2776 } 2777 2778 namespace { 2779 2780 struct CSEDenseMapInfo { 2781 static bool canHandle(const Instruction *I) { 2782 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 2783 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 2784 } 2785 2786 static inline Instruction *getEmptyKey() { 2787 return DenseMapInfo<Instruction *>::getEmptyKey(); 2788 } 2789 2790 static inline Instruction *getTombstoneKey() { 2791 return DenseMapInfo<Instruction *>::getTombstoneKey(); 2792 } 2793 2794 static unsigned getHashValue(const Instruction *I) { 2795 assert(canHandle(I) && "Unknown instruction!"); 2796 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 2797 I->value_op_end())); 2798 } 2799 2800 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 2801 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 2802 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 2803 return LHS == RHS; 2804 return LHS->isIdenticalTo(RHS); 2805 } 2806 }; 2807 2808 } // end anonymous namespace 2809 2810 ///Perform cse of induction variable instructions. 2811 static void cse(BasicBlock *BB) { 2812 // Perform simple cse. 2813 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 2814 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 2815 if (!CSEDenseMapInfo::canHandle(&In)) 2816 continue; 2817 2818 // Check if we can replace this instruction with any of the 2819 // visited instructions. 2820 if (Instruction *V = CSEMap.lookup(&In)) { 2821 In.replaceAllUsesWith(V); 2822 In.eraseFromParent(); 2823 continue; 2824 } 2825 2826 CSEMap[&In] = &In; 2827 } 2828 } 2829 2830 InstructionCost 2831 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 2832 ElementCount VF) const { 2833 // We only need to calculate a cost if the VF is scalar; for actual vectors 2834 // we should already have a pre-calculated cost at each VF. 
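  // For example, for VF=4 the cost of the widened call was already recorded in
  // CallWideningDecisions when the call widening decisions were collected for
  // that VF, so it is simply looked up below; only the VF=1 (scalar) cost
  // needs to be computed from scratch here.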
2835 if (!VF.isScalar()) 2836 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; 2837 2838 Type *RetTy = CI->getType(); 2839 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 2840 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) 2841 return *RedCost; 2842 2843 SmallVector<Type *, 4> Tys; 2844 for (auto &ArgOp : CI->args()) 2845 Tys.push_back(ArgOp->getType()); 2846 2847 InstructionCost ScalarCallCost = 2848 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); 2849 2850 // If this is an intrinsic we may have a lower cost for it. 2851 if (getVectorIntrinsicIDForCall(CI, TLI)) { 2852 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 2853 return std::min(ScalarCallCost, IntrinsicCost); 2854 } 2855 return ScalarCallCost; 2856 } 2857 2858 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) { 2859 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 2860 return Elt; 2861 return VectorType::get(Elt, VF); 2862 } 2863 2864 InstructionCost 2865 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 2866 ElementCount VF) const { 2867 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 2868 assert(ID && "Expected intrinsic call!"); 2869 Type *RetTy = maybeVectorizeType(CI->getType(), VF); 2870 FastMathFlags FMF; 2871 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 2872 FMF = FPMO->getFastMathFlags(); 2873 2874 SmallVector<const Value *> Arguments(CI->args()); 2875 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 2876 SmallVector<Type *> ParamTys; 2877 std::transform(FTy->param_begin(), FTy->param_end(), 2878 std::back_inserter(ParamTys), 2879 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); }); 2880 2881 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 2882 dyn_cast<IntrinsicInst>(CI)); 2883 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind); 2884 } 2885 2886 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 2887 // Fix widened non-induction PHIs by setting up the PHI operands. 2888 if (EnableVPlanNativePath) 2889 fixNonInductionPHIs(State); 2890 2891 // Forget the original basic block. 2892 PSE.getSE()->forgetLoop(OrigLoop); 2893 PSE.getSE()->forgetBlockAndLoopDispositions(); 2894 2895 // After vectorization, the exit blocks of the original loop will have 2896 // additional predecessors. Invalidate SCEVs for the exit phis in case SE 2897 // looked through single-entry phis. 2898 SmallVector<BasicBlock *> ExitBlocks; 2899 OrigLoop->getExitBlocks(ExitBlocks); 2900 for (BasicBlock *Exit : ExitBlocks) 2901 for (PHINode &PN : Exit->phis()) 2902 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); 2903 2904 // Don't apply optimizations below when no vector region remains, as they all 2905 // require a vector loop at the moment. 2906 if (!State.Plan->getVectorLoopRegion()) 2907 return; 2908 2909 for (Instruction *PI : PredicatedInstructions) 2910 sinkScalarOperands(&*PI); 2911 2912 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); 2913 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); 2914 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; 2915 2916 // Remove redundant induction instructions. 2917 cse(HeaderBB); 2918 2919 // Set/update profile weights for the vector and remainder loops as original 2920 // loop iterations are now distributed among them. Note that original loop 2921 // becomes the scalar remainder loop after vectorization. 
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly coarser result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of the vector code caused by legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  Loop *VectorLoop = LI->getLoopFor(HeaderBB);
  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
                               VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know whether it can be sunk or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends once a full
  // pass over the worklist fails to sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // may have side effects or may read from memory.
      // TODO: Could do more granular checking to allow sinking
      // a load past non-store instructions.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects() || I->mayReadFromMemory())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
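      // For example, given a store that VPlan already sunk into its predicated
      // block pred.store.if (names are illustrative):
      //   %gep = getelementptr inbounds i32, ptr %p, i64 %idx ; only user is
      //   store i32 %val, ptr %gep                            ; the store
      // the scalarized %gep can be sunk into pred.store.if once its only user
      // resides there; if some user still lives outside that block, the GEP is
      // queued for reanalysis instead.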
2996 if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) { 2997 InstsToReanalyze.push_back(I); 2998 continue; 2999 } 3000 3001 // Move the instruction to the beginning of the predicated block, and add 3002 // it's operands to the worklist. 3003 I->moveBefore(PredBB->getFirstInsertionPt()); 3004 Worklist.insert(I->op_begin(), I->op_end()); 3005 3006 // The sinking may have enabled other instructions to be sunk, so we will 3007 // need to iterate. 3008 Changed = true; 3009 } 3010 } while (Changed); 3011 } 3012 3013 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 3014 auto Iter = vp_depth_first_deep(Plan.getEntry()); 3015 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 3016 for (VPRecipeBase &P : VPBB->phis()) { 3017 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 3018 if (!VPPhi) 3019 continue; 3020 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi)); 3021 // Make sure the builder has a valid insert point. 3022 Builder.SetInsertPoint(NewPhi); 3023 for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) { 3024 VPValue *Inc = VPPhi->getIncomingValue(Idx); 3025 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx); 3026 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]); 3027 } 3028 } 3029 } 3030 } 3031 3032 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 3033 // We should not collect Scalars more than once per VF. Right now, this 3034 // function is called from collectUniformsAndScalars(), which already does 3035 // this check. Collecting Scalars for VF=1 does not make any sense. 3036 assert(VF.isVector() && !Scalars.contains(VF) && 3037 "This function should not be visited twice for the same VF"); 3038 3039 // This avoids any chances of creating a REPLICATE recipe during planning 3040 // since that would result in generation of scalarized code during execution, 3041 // which is not supported for scalable vectors. 3042 if (VF.isScalable()) { 3043 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3044 return; 3045 } 3046 3047 SmallSetVector<Instruction *, 8> Worklist; 3048 3049 // These sets are used to seed the analysis with pointers used by memory 3050 // accesses that will remain scalar. 3051 SmallSetVector<Instruction *, 8> ScalarPtrs; 3052 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 3053 auto *Latch = TheLoop->getLoopLatch(); 3054 3055 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 3056 // The pointer operands of loads and stores will be scalar as long as the 3057 // memory access is not a gather or scatter operation. The value operand of a 3058 // store will remain scalar if the store is scalarized. 3059 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 3060 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 3061 assert(WideningDecision != CM_Unknown && 3062 "Widening decision should be ready at this moment"); 3063 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 3064 if (Ptr == Store->getValueOperand()) 3065 return WideningDecision == CM_Scalarize; 3066 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 3067 "Ptr is neither a value or pointer operand"); 3068 return WideningDecision != CM_GatherScatter; 3069 }; 3070 3071 // A helper that returns true if the given value is a getelementptr 3072 // instruction contained in the loop. 
  auto IsLoopVaryingGEP = [&](Value *V) {
    return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about getelementptr instructions contained in the loop.
    if (!IsLoopVaryingGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (IsScalarUse(MemAccess, Ptr) &&
        all_of(I->users(), IsaPred<LoadInst, StoreInst>))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) getelementptr
  // instructions used by memory accesses requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all getelementptr instructions used by memory
  // accesses requiring a scalar use. The pointer operands of loads and stores
  // will be scalar unless the operation is a gather or scatter. The value
  // operand of a store will remain scalar if the store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        EvaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        EvaluatePtrUse(Store, Store->getPointerOperand());
        EvaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I
                        << "\n");
      Worklist.insert(I);
    }

  // Expand the worklist by looking through any getelementptr instructions we
  // have already identified as scalar. This is similar to the expansion step
  // in collectLoopUniforms(); however, here we're only expanding to include
  // additional getelementptr instructions.
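  // A minimal sketch of the expansion below (names are illustrative): if %gep
  // is already in the worklist and its pointer operand is another loop-varying
  // GEP whose only in-loop users are worklist members or scalarized memory
  // accesses, e.g.
  //   %gep.src = getelementptr inbounds i32, ptr %base, i64 %iv
  //   %gep     = getelementptr inbounds i32, ptr %gep.src, i64 4
  // then %gep.src is added to the worklist as a scalar instruction too.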
3145 unsigned Idx = 0; 3146 while (Idx != Worklist.size()) { 3147 Instruction *Dst = Worklist[Idx++]; 3148 if (!IsLoopVaryingGEP(Dst->getOperand(0))) 3149 continue; 3150 auto *Src = cast<Instruction>(Dst->getOperand(0)); 3151 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 3152 auto *J = cast<Instruction>(U); 3153 return !TheLoop->contains(J) || Worklist.count(J) || 3154 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 3155 IsScalarUse(J, Src)); 3156 })) { 3157 Worklist.insert(Src); 3158 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 3159 } 3160 } 3161 3162 // An induction variable will remain scalar if all users of the induction 3163 // variable and induction variable update remain scalar. 3164 for (const auto &Induction : Legal->getInductionVars()) { 3165 auto *Ind = Induction.first; 3166 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3167 3168 // If tail-folding is applied, the primary induction variable will be used 3169 // to feed a vector compare. 3170 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 3171 continue; 3172 3173 // Returns true if \p Indvar is a pointer induction that is used directly by 3174 // load/store instruction \p I. 3175 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 3176 Instruction *I) { 3177 return Induction.second.getKind() == 3178 InductionDescriptor::IK_PtrInduction && 3179 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 3180 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar); 3181 }; 3182 3183 // Determine if all users of the induction variable are scalar after 3184 // vectorization. 3185 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { 3186 auto *I = cast<Instruction>(U); 3187 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3188 IsDirectLoadStoreFromPtrIndvar(Ind, I); 3189 }); 3190 if (!ScalarInd) 3191 continue; 3192 3193 // If the induction variable update is a fixed-order recurrence, neither the 3194 // induction variable or its update should be marked scalar after 3195 // vectorization. 3196 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate); 3197 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi)) 3198 continue; 3199 3200 // Determine if all users of the induction variable update instruction are 3201 // scalar after vectorization. 3202 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { 3203 auto *I = cast<Instruction>(U); 3204 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 3205 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 3206 }); 3207 if (!ScalarIndUpdate) 3208 continue; 3209 3210 // The induction variable and its update instruction will remain scalar. 3211 Worklist.insert(Ind); 3212 Worklist.insert(IndUpdate); 3213 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 3214 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 3215 << "\n"); 3216 } 3217 3218 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 3219 } 3220 3221 bool LoopVectorizationCostModel::isScalarWithPredication( 3222 Instruction *I, ElementCount VF) const { 3223 if (!isPredicatedInst(I)) 3224 return false; 3225 3226 // Do we have a non-scalar lowering for this predicated 3227 // instruction? No - it is scalar with predication. 
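  // For instance, a predicated load only ends up scalar-with-predication if
  // the target can lower it neither as a masked load nor as a gather at this
  // VF, and a predicated udiv/sdiv/urem/srem can instead be widened using the
  // safe-divisor idiom when that is the cheaper option.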
  switch (I->getOpcode()) {
  default:
    return true;
  case Instruction::Call:
    if (VF.isScalar())
      return true;
    return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
               .Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getLoadStoreType(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(Ty, VF);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedGather(VTy, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedScatter(VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}

// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
  // If predication is not needed, avoid it.
  // TODO: We can use the loop-preheader as context point here and get
  // context sensitive reasoning for isSafeToSpeculativelyExecute.
  if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
      isSafeToSpeculativelyExecute(I) ||
      (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
      isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
    return false;

  // If the instruction was executed conditionally in the original scalar loop,
  // predication is needed with a mask whose lanes are all possibly inactive.
  if (Legal->blockNeedsPredication(I->getParent()))
    return true;

  // All that remain are instructions with side-effects originally executed in
  // the loop unconditionally, but now execute under a tail-fold mask (only)
  // having at least one active lane (the first). If the side-effects of the
  // instruction are invariant, executing it without the (tail-folding) mask is
  // safe; it will cause the same side-effects as when masked.
  switch (I->getOpcode()) {
  default:
    llvm_unreachable(
        "instruction should have been considered by earlier checks");
  case Instruction::Call:
    // Side-effects of a Call are assumed to be non-invariant, needing a
    // (fold-tail) mask.
    assert(Legal->isMaskRequired(I) &&
           "should have returned earlier for calls not needing a mask");
    return true;
  case Instruction::Load:
    // If the address is loop invariant no predication is needed.
    return !Legal->isInvariant(getLoadStorePointerOperand(I));
  case Instruction::Store: {
    // For stores, we need to prove both speculation safety (which follows from
    // the same argument as loads) and that the value being stored is correct.
    // The easiest form of the latter is to require that all values stored are
    // the same.
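    // For example, under tail folding `A[0] = 42` (invariant address and
    // invariant value) has the same effect whether or not the masked-off lanes
    // execute it, so it needs no mask, whereas `A[0] = i` stores a value that
    // depends on the iteration, so executing the inactive lanes would clobber
    // the correct final value and the store must stay predicated.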
3301 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) && 3302 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand())); 3303 } 3304 case Instruction::UDiv: 3305 case Instruction::SDiv: 3306 case Instruction::SRem: 3307 case Instruction::URem: 3308 // If the divisor is loop-invariant no predication is needed. 3309 return !TheLoop->isLoopInvariant(I->getOperand(1)); 3310 } 3311 } 3312 3313 std::pair<InstructionCost, InstructionCost> 3314 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 3315 ElementCount VF) const { 3316 assert(I->getOpcode() == Instruction::UDiv || 3317 I->getOpcode() == Instruction::SDiv || 3318 I->getOpcode() == Instruction::SRem || 3319 I->getOpcode() == Instruction::URem); 3320 assert(!isSafeToSpeculativelyExecute(I)); 3321 3322 // Scalarization isn't legal for scalable vector types 3323 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 3324 if (!VF.isScalable()) { 3325 // Get the scalarization cost and scale this amount by the probability of 3326 // executing the predicated block. If the instruction is not predicated, 3327 // we fall through to the next case. 3328 ScalarizationCost = 0; 3329 3330 // These instructions have a non-void type, so account for the phi nodes 3331 // that we will create. This cost is likely to be zero. The phi node 3332 // cost, if any, should be scaled by the block probability because it 3333 // models a copy at the end of each predicated block. 3334 ScalarizationCost += VF.getKnownMinValue() * 3335 TTI.getCFInstrCost(Instruction::PHI, CostKind); 3336 3337 // The cost of the non-predicated instruction. 3338 ScalarizationCost += VF.getKnownMinValue() * 3339 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 3340 3341 // The cost of insertelement and extractelement instructions needed for 3342 // scalarization. 3343 ScalarizationCost += getScalarizationOverhead(I, VF); 3344 3345 // Scale the cost by the probability of executing the predicated blocks. 3346 // This assumes the predicated block for each vector lane is equally 3347 // likely. 3348 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 3349 } 3350 InstructionCost SafeDivisorCost = 0; 3351 3352 auto *VecTy = toVectorTy(I->getType(), VF); 3353 3354 // The cost of the select guard to ensure all lanes are well defined 3355 // after we speculate above any internal control flow. 3356 SafeDivisorCost += 3357 TTI.getCmpSelInstrCost(Instruction::Select, VecTy, 3358 toVectorTy(Type::getInt1Ty(I->getContext()), VF), 3359 CmpInst::BAD_ICMP_PREDICATE, CostKind); 3360 3361 // Certain instructions can be cheaper to vectorize if they have a constant 3362 // second vector operand. One example of this are shifts on x86. 
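  // For illustration (target-dependent): a udiv by a loop-invariant or
  // constant splat divisor, e.g. `udiv <4 x i32> %x, <i32 8, i32 8, i32 8,
  // i32 8>`, may be lowered to a shift and is typically cheaper than a fully
  // general vector divide, which is why the operand info for Op2 is refined
  // below before querying TTI.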
3363 Value *Op2 = I->getOperand(1); 3364 auto Op2Info = TTI.getOperandInfo(Op2); 3365 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 3366 Legal->isInvariant(Op2)) 3367 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 3368 3369 SmallVector<const Value *, 4> Operands(I->operand_values()); 3370 SafeDivisorCost += TTI.getArithmeticInstrCost( 3371 I->getOpcode(), VecTy, CostKind, 3372 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 3373 Op2Info, Operands, I); 3374 return {ScalarizationCost, SafeDivisorCost}; 3375 } 3376 3377 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 3378 Instruction *I, ElementCount VF) const { 3379 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 3380 assert(getWideningDecision(I, VF) == CM_Unknown && 3381 "Decision should not be set yet."); 3382 auto *Group = getInterleavedAccessGroup(I); 3383 assert(Group && "Must have a group."); 3384 unsigned InterleaveFactor = Group->getFactor(); 3385 3386 // If the instruction's allocated size doesn't equal its type size, it 3387 // requires padding and will be scalarized. 3388 auto &DL = I->getDataLayout(); 3389 auto *ScalarTy = getLoadStoreType(I); 3390 if (hasIrregularType(ScalarTy, DL)) 3391 return false; 3392 3393 // We currently only know how to emit interleave/deinterleave with 3394 // Factor=2 for scalable vectors. This is purely an implementation 3395 // limit. 3396 if (VF.isScalable() && InterleaveFactor != 2) 3397 return false; 3398 3399 // If the group involves a non-integral pointer, we may not be able to 3400 // losslessly cast all values to a common type. 3401 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 3402 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) { 3403 Instruction *Member = Group->getMember(Idx); 3404 if (!Member) 3405 continue; 3406 auto *MemberTy = getLoadStoreType(Member); 3407 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 3408 // Don't coerce non-integral pointers to integers or vice versa. 3409 if (MemberNI != ScalarNI) 3410 // TODO: Consider adding special nullptr value case here 3411 return false; 3412 if (MemberNI && ScalarNI && 3413 ScalarTy->getPointerAddressSpace() != 3414 MemberTy->getPointerAddressSpace()) 3415 return false; 3416 } 3417 3418 // Check if masking is required. 3419 // A Group may need masking for one of two reasons: it resides in a block that 3420 // needs predication, or it was decided to use masking to deal with gaps 3421 // (either a gap at the end of a load-access that may result in a speculative 3422 // load, or any gaps in a store-access). 3423 bool PredicatedAccessRequiresMasking = 3424 blockNeedsPredicationForAnyReason(I->getParent()) && 3425 Legal->isMaskRequired(I); 3426 bool LoadAccessWithGapsRequiresEpilogMasking = 3427 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 3428 !isScalarEpilogueAllowed(); 3429 bool StoreAccessWithGapsRequiresMasking = 3430 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 3431 if (!PredicatedAccessRequiresMasking && 3432 !LoadAccessWithGapsRequiresEpilogMasking && 3433 !StoreAccessWithGapsRequiresMasking) 3434 return true; 3435 3436 // If masked interleaving is required, we expect that the user/target had 3437 // enabled it, because otherwise it either wouldn't have been created or 3438 // it should have been invalidated by the CostModel. 
3439 assert(useMaskedInterleavedAccesses(TTI) && 3440 "Masked interleave-groups for predicated accesses are not enabled."); 3441 3442 if (Group->isReverse()) 3443 return false; 3444 3445 auto *Ty = getLoadStoreType(I); 3446 const Align Alignment = getLoadStoreAlignment(I); 3447 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 3448 : TTI.isLegalMaskedStore(Ty, Alignment); 3449 } 3450 3451 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 3452 Instruction *I, ElementCount VF) { 3453 // Get and ensure we have a valid memory instruction. 3454 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 3455 3456 auto *Ptr = getLoadStorePointerOperand(I); 3457 auto *ScalarTy = getLoadStoreType(I); 3458 3459 // In order to be widened, the pointer should be consecutive, first of all. 3460 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 3461 return false; 3462 3463 // If the instruction is a store located in a predicated block, it will be 3464 // scalarized. 3465 if (isScalarWithPredication(I, VF)) 3466 return false; 3467 3468 // If the instruction's allocated size doesn't equal it's type size, it 3469 // requires padding and will be scalarized. 3470 auto &DL = I->getDataLayout(); 3471 if (hasIrregularType(ScalarTy, DL)) 3472 return false; 3473 3474 return true; 3475 } 3476 3477 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 3478 // We should not collect Uniforms more than once per VF. Right now, 3479 // this function is called from collectUniformsAndScalars(), which 3480 // already does this check. Collecting Uniforms for VF=1 does not make any 3481 // sense. 3482 3483 assert(VF.isVector() && !Uniforms.contains(VF) && 3484 "This function should not be visited twice for the same VF"); 3485 3486 // Visit the list of Uniforms. If we find no uniform value, we won't 3487 // analyze again. Uniforms.count(VF) will return 1. 3488 Uniforms[VF].clear(); 3489 3490 // Now we know that the loop is vectorizable! 3491 // Collect instructions inside the loop that will remain uniform after 3492 // vectorization. 3493 3494 // Global values, params and instructions outside of current loop are out of 3495 // scope. 3496 auto IsOutOfScope = [&](Value *V) -> bool { 3497 Instruction *I = dyn_cast<Instruction>(V); 3498 return (!I || !TheLoop->contains(I)); 3499 }; 3500 3501 // Worklist containing uniform instructions demanding lane 0. 3502 SetVector<Instruction *> Worklist; 3503 3504 // Add uniform instructions demanding lane 0 to the worklist. Instructions 3505 // that require predication must not be considered uniform after 3506 // vectorization, because that would create an erroneous replicating region 3507 // where only a single instance out of VF should be formed. 3508 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void { 3509 if (IsOutOfScope(I)) { 3510 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 3511 << *I << "\n"); 3512 return; 3513 } 3514 if (isPredicatedInst(I)) { 3515 LLVM_DEBUG( 3516 dbgs() << "LV: Found not uniform due to requiring predication: " << *I 3517 << "\n"); 3518 return; 3519 } 3520 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 3521 Worklist.insert(I); 3522 }; 3523 3524 // Start with the conditional branches exiting the loop. If the branch 3525 // condition is an instruction contained in the loop that is only used by the 3526 // branch, it is uniform. Note conditions from uncountable early exits are not 3527 // uniform. 
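  // For example, a latch exit condition such as
  //   %exitcond = icmp eq i64 %iv.next, %n
  // that is used only by the latch branch stays a single scalar compare per
  // vector iteration, so it is seeded into the worklist as uniform here.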
3528 SmallVector<BasicBlock *> Exiting; 3529 TheLoop->getExitingBlocks(Exiting); 3530 for (BasicBlock *E : Exiting) { 3531 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E) 3532 continue; 3533 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0)); 3534 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 3535 AddToWorklistIfAllowed(Cmp); 3536 } 3537 3538 auto PrevVF = VF.divideCoefficientBy(2); 3539 // Return true if all lanes perform the same memory operation, and we can 3540 // thus choose to execute only one. 3541 auto IsUniformMemOpUse = [&](Instruction *I) { 3542 // If the value was already known to not be uniform for the previous 3543 // (smaller VF), it cannot be uniform for the larger VF. 3544 if (PrevVF.isVector()) { 3545 auto Iter = Uniforms.find(PrevVF); 3546 if (Iter != Uniforms.end() && !Iter->second.contains(I)) 3547 return false; 3548 } 3549 if (!Legal->isUniformMemOp(*I, VF)) 3550 return false; 3551 if (isa<LoadInst>(I)) 3552 // Loading the same address always produces the same result - at least 3553 // assuming aliasing and ordering which have already been checked. 3554 return true; 3555 // Storing the same value on every iteration. 3556 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 3557 }; 3558 3559 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) { 3560 InstWidening WideningDecision = getWideningDecision(I, VF); 3561 assert(WideningDecision != CM_Unknown && 3562 "Widening decision should be ready at this moment"); 3563 3564 if (IsUniformMemOpUse(I)) 3565 return true; 3566 3567 return (WideningDecision == CM_Widen || 3568 WideningDecision == CM_Widen_Reverse || 3569 WideningDecision == CM_Interleave); 3570 }; 3571 3572 // Returns true if Ptr is the pointer operand of a memory access instruction 3573 // I, I is known to not require scalarization, and the pointer is not also 3574 // stored. 3575 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 3576 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) 3577 return false; 3578 return getLoadStorePointerOperand(I) == Ptr && 3579 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr)); 3580 }; 3581 3582 // Holds a list of values which are known to have at least one uniform use. 3583 // Note that there may be other uses which aren't uniform. A "uniform use" 3584 // here is something which only demands lane 0 of the unrolled iterations; 3585 // it does not imply that all lanes produce the same value (e.g. this is not 3586 // the usual meaning of uniform) 3587 SetVector<Value *> HasUniformUse; 3588 3589 // Scan the loop for instructions which are either a) known to have only 3590 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 3591 for (auto *BB : TheLoop->blocks()) 3592 for (auto &I : *BB) { 3593 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 3594 switch (II->getIntrinsicID()) { 3595 case Intrinsic::sideeffect: 3596 case Intrinsic::experimental_noalias_scope_decl: 3597 case Intrinsic::assume: 3598 case Intrinsic::lifetime_start: 3599 case Intrinsic::lifetime_end: 3600 if (TheLoop->hasLoopInvariantOperands(&I)) 3601 AddToWorklistIfAllowed(&I); 3602 break; 3603 default: 3604 break; 3605 } 3606 } 3607 3608 // ExtractValue instructions must be uniform, because the operands are 3609 // known to be loop-invariant. 
3610 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 3611 assert(IsOutOfScope(EVI->getAggregateOperand()) && 3612 "Expected aggregate value to be loop invariant"); 3613 AddToWorklistIfAllowed(EVI); 3614 continue; 3615 } 3616 3617 // If there's no pointer operand, there's nothing to do. 3618 auto *Ptr = getLoadStorePointerOperand(&I); 3619 if (!Ptr) 3620 continue; 3621 3622 if (IsUniformMemOpUse(&I)) 3623 AddToWorklistIfAllowed(&I); 3624 3625 if (IsVectorizedMemAccessUse(&I, Ptr)) 3626 HasUniformUse.insert(Ptr); 3627 } 3628 3629 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 3630 // demanding) users. Since loops are assumed to be in LCSSA form, this 3631 // disallows uses outside the loop as well. 3632 for (auto *V : HasUniformUse) { 3633 if (IsOutOfScope(V)) 3634 continue; 3635 auto *I = cast<Instruction>(V); 3636 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool { 3637 auto *UI = cast<Instruction>(U); 3638 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V); 3639 }); 3640 if (UsersAreMemAccesses) 3641 AddToWorklistIfAllowed(I); 3642 } 3643 3644 // Expand Worklist in topological order: whenever a new instruction 3645 // is added , its users should be already inside Worklist. It ensures 3646 // a uniform instruction will only be used by uniform instructions. 3647 unsigned Idx = 0; 3648 while (Idx != Worklist.size()) { 3649 Instruction *I = Worklist[Idx++]; 3650 3651 for (auto *OV : I->operand_values()) { 3652 // isOutOfScope operands cannot be uniform instructions. 3653 if (IsOutOfScope(OV)) 3654 continue; 3655 // First order recurrence Phi's should typically be considered 3656 // non-uniform. 3657 auto *OP = dyn_cast<PHINode>(OV); 3658 if (OP && Legal->isFixedOrderRecurrence(OP)) 3659 continue; 3660 // If all the users of the operand are uniform, then add the 3661 // operand into the uniform worklist. 3662 auto *OI = cast<Instruction>(OV); 3663 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 3664 auto *J = cast<Instruction>(U); 3665 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI); 3666 })) 3667 AddToWorklistIfAllowed(OI); 3668 } 3669 } 3670 3671 // For an instruction to be added into Worklist above, all its users inside 3672 // the loop should also be in Worklist. However, this condition cannot be 3673 // true for phi nodes that form a cyclic dependence. We must process phi 3674 // nodes separately. An induction variable will remain uniform if all users 3675 // of the induction variable and induction variable update remain uniform. 3676 // The code below handles both pointer and non-pointer induction variables. 3677 BasicBlock *Latch = TheLoop->getLoopLatch(); 3678 for (const auto &Induction : Legal->getInductionVars()) { 3679 auto *Ind = Induction.first; 3680 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3681 3682 // Determine if all users of the induction variable are uniform after 3683 // vectorization. 3684 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool { 3685 auto *I = cast<Instruction>(U); 3686 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3687 IsVectorizedMemAccessUse(I, Ind); 3688 }); 3689 if (!UniformInd) 3690 continue; 3691 3692 // Determine if all users of the induction variable update instruction are 3693 // uniform after vectorization. 
3694 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { 3695 auto *I = cast<Instruction>(U); 3696 return I == Ind || Worklist.count(I) || 3697 IsVectorizedMemAccessUse(I, IndUpdate); 3698 }); 3699 if (!UniformIndUpdate) 3700 continue; 3701 3702 // The induction variable and its update instruction will remain uniform. 3703 AddToWorklistIfAllowed(Ind); 3704 AddToWorklistIfAllowed(IndUpdate); 3705 } 3706 3707 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 3708 } 3709 3710 bool LoopVectorizationCostModel::runtimeChecksRequired() { 3711 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 3712 3713 if (Legal->getRuntimePointerChecking()->Need) { 3714 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 3715 "runtime pointer checks needed. Enable vectorization of this " 3716 "loop with '#pragma clang loop vectorize(enable)' when " 3717 "compiling with -Os/-Oz", 3718 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3719 return true; 3720 } 3721 3722 if (!PSE.getPredicate().isAlwaysTrue()) { 3723 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 3724 "runtime SCEV checks needed. Enable vectorization of this " 3725 "loop with '#pragma clang loop vectorize(enable)' when " 3726 "compiling with -Os/-Oz", 3727 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3728 return true; 3729 } 3730 3731 // FIXME: Avoid specializing for stride==1 instead of bailing out. 3732 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 3733 reportVectorizationFailure("Runtime stride check for small trip count", 3734 "runtime stride == 1 checks needed. Enable vectorization of " 3735 "this loop without such check by compiling with -Os/-Oz", 3736 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3737 return true; 3738 } 3739 3740 return false; 3741 } 3742 3743 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { 3744 if (IsScalableVectorizationAllowed) 3745 return *IsScalableVectorizationAllowed; 3746 3747 IsScalableVectorizationAllowed = false; 3748 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 3749 return false; 3750 3751 if (Hints->isScalableVectorizationDisabled()) { 3752 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 3753 "ScalableVectorizationDisabled", ORE, TheLoop); 3754 return false; 3755 } 3756 3757 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 3758 3759 auto MaxScalableVF = ElementCount::getScalable( 3760 std::numeric_limits<ElementCount::ScalarTy>::max()); 3761 3762 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 3763 // FIXME: While for scalable vectors this is currently sufficient, this should 3764 // be replaced by a more detailed mechanism that filters out specific VFs, 3765 // instead of invalidating vectorization for a whole set of VFs based on the 3766 // MaxVF. 3767 3768 // Disable scalable vectorization if the loop contains unsupported reductions. 3769 if (!canVectorizeReductions(MaxScalableVF)) { 3770 reportVectorizationInfo( 3771 "Scalable vectorization not supported for the reduction " 3772 "operations found in this loop.", 3773 "ScalableVFUnfeasible", ORE, TheLoop); 3774 return false; 3775 } 3776 3777 // Disable scalable vectorization if the loop contains any instructions 3778 // with element types not supported for scalable vectors. 
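  // For example (target-dependent): if a target's scalable vectors only
  // support the common 8/16/32/64-bit integer and half/float/double element
  // types, a loop computing on fp128 or i128 values is rejected here.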
3779 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 3780 return !Ty->isVoidTy() && 3781 !this->TTI.isElementTypeLegalForScalableVector(Ty); 3782 })) { 3783 reportVectorizationInfo("Scalable vectorization is not supported " 3784 "for all element types found in this loop.", 3785 "ScalableVFUnfeasible", ORE, TheLoop); 3786 return false; 3787 } 3788 3789 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) { 3790 reportVectorizationInfo("The target does not provide maximum vscale value " 3791 "for safe distance analysis.", 3792 "ScalableVFUnfeasible", ORE, TheLoop); 3793 return false; 3794 } 3795 3796 IsScalableVectorizationAllowed = true; 3797 return true; 3798 } 3799 3800 ElementCount 3801 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 3802 if (!isScalableVectorizationAllowed()) 3803 return ElementCount::getScalable(0); 3804 3805 auto MaxScalableVF = ElementCount::getScalable( 3806 std::numeric_limits<ElementCount::ScalarTy>::max()); 3807 if (Legal->isSafeForAnyVectorWidth()) 3808 return MaxScalableVF; 3809 3810 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 3811 // Limit MaxScalableVF by the maximum safe dependence distance. 3812 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); 3813 3814 if (!MaxScalableVF) 3815 reportVectorizationInfo( 3816 "Max legal vector width too small, scalable vectorization " 3817 "unfeasible.", 3818 "ScalableVFUnfeasible", ORE, TheLoop); 3819 3820 return MaxScalableVF; 3821 } 3822 3823 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 3824 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { 3825 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 3826 unsigned SmallestType, WidestType; 3827 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 3828 3829 // Get the maximum safe dependence distance in bits computed by LAA. 3830 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 3831 // the memory accesses that is most restrictive (involved in the smallest 3832 // dependence distance). 3833 unsigned MaxSafeElements = 3834 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 3835 3836 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 3837 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 3838 if (!Legal->isSafeForAnyVectorWidth()) 3839 this->MaxSafeElements = MaxSafeElements; 3840 3841 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 3842 << ".\n"); 3843 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 3844 << ".\n"); 3845 3846 // First analyze the UserVF, fall back if the UserVF should be ignored. 3847 if (UserVF) { 3848 auto MaxSafeUserVF = 3849 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 3850 3851 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 3852 // If `VF=vscale x N` is safe, then so is `VF=N` 3853 if (UserVF.isScalable()) 3854 return FixedScalableVFPair( 3855 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 3856 3857 return UserVF; 3858 } 3859 3860 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 3861 3862 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 3863 // is better to ignore the hint and let the compiler choose a suitable VF. 
3864 if (!UserVF.isScalable()) { 3865 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3866 << " is unsafe, clamping to max safe VF=" 3867 << MaxSafeFixedVF << ".\n"); 3868 ORE->emit([&]() { 3869 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3870 TheLoop->getStartLoc(), 3871 TheLoop->getHeader()) 3872 << "User-specified vectorization factor " 3873 << ore::NV("UserVectorizationFactor", UserVF) 3874 << " is unsafe, clamping to maximum safe vectorization factor " 3875 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 3876 }); 3877 return MaxSafeFixedVF; 3878 } 3879 3880 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 3881 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3882 << " is ignored because scalable vectors are not " 3883 "available.\n"); 3884 ORE->emit([&]() { 3885 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3886 TheLoop->getStartLoc(), 3887 TheLoop->getHeader()) 3888 << "User-specified vectorization factor " 3889 << ore::NV("UserVectorizationFactor", UserVF) 3890 << " is ignored because the target does not support scalable " 3891 "vectors. The compiler will pick a more suitable value."; 3892 }); 3893 } else { 3894 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3895 << " is unsafe. Ignoring scalable UserVF.\n"); 3896 ORE->emit([&]() { 3897 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3898 TheLoop->getStartLoc(), 3899 TheLoop->getHeader()) 3900 << "User-specified vectorization factor " 3901 << ore::NV("UserVectorizationFactor", UserVF) 3902 << " is unsafe. Ignoring the hint to let the compiler pick a " 3903 "more suitable value."; 3904 }); 3905 } 3906 } 3907 3908 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 3909 << " / " << WidestType << " bits.\n"); 3910 3911 FixedScalableVFPair Result(ElementCount::getFixed(1), 3912 ElementCount::getScalable(0)); 3913 if (auto MaxVF = 3914 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 3915 MaxSafeFixedVF, FoldTailByMasking)) 3916 Result.FixedVF = MaxVF; 3917 3918 if (auto MaxVF = 3919 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 3920 MaxSafeScalableVF, FoldTailByMasking)) 3921 if (MaxVF.isScalable()) { 3922 Result.ScalableVF = MaxVF; 3923 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 3924 << "\n"); 3925 } 3926 3927 return Result; 3928 } 3929 3930 FixedScalableVFPair 3931 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 3932 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 3933 // TODO: It may be useful to do since it's still likely to be dynamically 3934 // uniform if the target can skip. 3935 reportVectorizationFailure( 3936 "Not inserting runtime ptr check for divergent target", 3937 "runtime pointer checks needed. 
Not enabled for divergent target", 3938 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 3939 return FixedScalableVFPair::getNone(); 3940 } 3941 3942 ScalarEvolution *SE = PSE.getSE(); 3943 unsigned TC = SE->getSmallConstantTripCount(TheLoop); 3944 unsigned MaxTC = PSE.getSmallConstantMaxTripCount(); 3945 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 3946 if (TC != MaxTC) 3947 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n'); 3948 if (TC == 1) { 3949 reportVectorizationFailure("Single iteration (non) loop", 3950 "loop trip count is one, irrelevant for vectorization", 3951 "SingleIterationLoop", ORE, TheLoop); 3952 return FixedScalableVFPair::getNone(); 3953 } 3954 3955 // If BTC matches the widest induction type and is -1 then the trip count 3956 // computation will wrap to 0 and the vector trip count will be 0. Do not try 3957 // to vectorize. 3958 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop); 3959 if (!isa<SCEVCouldNotCompute>(BTC) && 3960 BTC->getType()->getScalarSizeInBits() >= 3961 Legal->getWidestInductionType()->getScalarSizeInBits() && 3962 SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC, 3963 SE->getMinusOne(BTC->getType()))) { 3964 reportVectorizationFailure( 3965 "Trip count computation wrapped", 3966 "backedge-taken count is -1, loop trip count wrapped to 0", 3967 "TripCountWrapped", ORE, TheLoop); 3968 return FixedScalableVFPair::getNone(); 3969 } 3970 3971 switch (ScalarEpilogueStatus) { 3972 case CM_ScalarEpilogueAllowed: 3973 return computeFeasibleMaxVF(MaxTC, UserVF, false); 3974 case CM_ScalarEpilogueNotAllowedUsePredicate: 3975 [[fallthrough]]; 3976 case CM_ScalarEpilogueNotNeededUsePredicate: 3977 LLVM_DEBUG( 3978 dbgs() << "LV: vector predicate hint/switch found.\n" 3979 << "LV: Not allowing scalar epilogue, creating predicated " 3980 << "vector loop.\n"); 3981 break; 3982 case CM_ScalarEpilogueNotAllowedLowTripLoop: 3983 // fallthrough as a special case of OptForSize 3984 case CM_ScalarEpilogueNotAllowedOptSize: 3985 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 3986 LLVM_DEBUG( 3987 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 3988 else 3989 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 3990 << "count.\n"); 3991 3992 // Bail if runtime checks are required, which are not good when optimising 3993 // for size. 3994 if (runtimeChecksRequired()) 3995 return FixedScalableVFPair::getNone(); 3996 3997 break; 3998 } 3999 4000 // The only loops we can vectorize without a scalar epilogue, are loops with 4001 // a bottom-test and a single exiting block. We'd have to handle the fact 4002 // that not every instruction executes on the last iteration. This will 4003 // require a lane mask which varies through the vector loop body. (TODO) 4004 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 4005 // If there was a tail-folding hint/switch, but we can't fold the tail by 4006 // masking, fallback to a vectorization with a scalar epilogue. 4007 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4008 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4009 "scalar epilogue instead.\n"); 4010 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4011 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4012 } 4013 return FixedScalableVFPair::getNone(); 4014 } 4015 4016 // Now try the tail folding 4017 4018 // Invalidate interleave groups that require an epilogue if we can't mask 4019 // the interleave-group. 
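  // For example, a load-only interleave group covering A[3*i] and A[3*i+1] but
  // not A[3*i+2] normally relies on a scalar epilogue so the widened load
  // cannot run past the last accessed element; with the tail folded there is
  // no such epilogue, so unless the group can be masked it has to be released
  // here.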
4020 if (!useMaskedInterleavedAccesses(TTI)) { 4021 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 4022 "No decisions should have been taken at this point"); 4023 // Note: There is no need to invalidate any cost modeling decisions here, as 4024 // none were taken so far. 4025 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4026 } 4027 4028 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true); 4029 4030 // Avoid tail folding if the trip count is known to be a multiple of any VF 4031 // we choose. 4032 std::optional<unsigned> MaxPowerOf2RuntimeVF = 4033 MaxFactors.FixedVF.getFixedValue(); 4034 if (MaxFactors.ScalableVF) { 4035 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 4036 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { 4037 MaxPowerOf2RuntimeVF = std::max<unsigned>( 4038 *MaxPowerOf2RuntimeVF, 4039 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); 4040 } else 4041 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. 4042 } 4043 4044 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { 4045 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && 4046 "MaxFixedVF must be a power of 2"); 4047 unsigned MaxVFtimesIC = 4048 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; 4049 ScalarEvolution *SE = PSE.getSE(); 4050 // Currently only loops with countable exits are vectorized, but calling 4051 // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with 4052 // uncountable exits whilst also ensuring the symbolic maximum and known 4053 // back-edge taken count remain identical for loops with countable exits. 4054 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount(); 4055 assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() && 4056 "Invalid loop count"); 4057 const SCEV *ExitCount = SE->getAddExpr( 4058 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 4059 const SCEV *Rem = SE->getURemExpr( 4060 SE->applyLoopGuards(ExitCount, TheLoop), 4061 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 4062 if (Rem->isZero()) { 4063 // Accept MaxFixedVF if we do not have a tail. 4064 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4065 return MaxFactors; 4066 } 4067 } 4068 4069 // If we don't know the precise trip count, or if the trip count that we 4070 // found modulo the vectorization factor is not zero, try to fold the tail 4071 // by masking. 4072 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4073 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC); 4074 if (foldTailByMasking()) { 4075 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { 4076 LLVM_DEBUG( 4077 dbgs() 4078 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " 4079 "try to generate VP Intrinsics with scalable vector " 4080 "factors only.\n"); 4081 // Tail folded loop using VP intrinsics restricts the VF to be scalable 4082 // for now. 4083 // TODO: extend it for fixed vectors, if required. 4084 assert(MaxFactors.ScalableVF.isScalable() && 4085 "Expected scalable vector factor."); 4086 4087 MaxFactors.FixedVF = ElementCount::getFixed(1); 4088 } 4089 return MaxFactors; 4090 } 4091 4092 // If there was a tail-folding hint/switch, but we can't fold the tail by 4093 // masking, fallback to a vectorization with a scalar epilogue. 
4094 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4095 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4096 "scalar epilogue instead.\n"); 4097 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4098 return MaxFactors; 4099 } 4100 4101 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 4102 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 4103 return FixedScalableVFPair::getNone(); 4104 } 4105 4106 if (TC == 0) { 4107 reportVectorizationFailure( 4108 "unable to calculate the loop count due to complex control flow", 4109 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4110 return FixedScalableVFPair::getNone(); 4111 } 4112 4113 reportVectorizationFailure( 4114 "Cannot optimize for size and vectorize at the same time.", 4115 "cannot optimize for size and vectorize at the same time. " 4116 "Enable vectorization of this loop with '#pragma clang loop " 4117 "vectorize(enable)' when compiling with -Os/-Oz", 4118 "NoTailLoopWithOptForSize", ORE, TheLoop); 4119 return FixedScalableVFPair::getNone(); 4120 } 4121 4122 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 4123 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, 4124 ElementCount MaxSafeVF, bool FoldTailByMasking) { 4125 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 4126 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 4127 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4128 : TargetTransformInfo::RGK_FixedWidthVector); 4129 4130 // Convenience function to return the minimum of two ElementCounts. 4131 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 4132 assert((LHS.isScalable() == RHS.isScalable()) && 4133 "Scalable flags must match"); 4134 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 4135 }; 4136 4137 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 4138 // Note that both WidestRegister and WidestType may not be a powers of 2. 4139 auto MaxVectorElementCount = ElementCount::get( 4140 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), 4141 ComputeScalableMaxVF); 4142 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 4143 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4144 << (MaxVectorElementCount * WidestType) << " bits.\n"); 4145 4146 if (!MaxVectorElementCount) { 4147 LLVM_DEBUG(dbgs() << "LV: The target has no " 4148 << (ComputeScalableMaxVF ? "scalable" : "fixed") 4149 << " vector registers.\n"); 4150 return ElementCount::getFixed(1); 4151 } 4152 4153 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 4154 if (MaxVectorElementCount.isScalable() && 4155 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 4156 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 4157 auto Min = Attr.getVScaleRangeMin(); 4158 WidestRegisterMinEC *= Min; 4159 } 4160 4161 // When a scalar epilogue is required, at least one iteration of the scalar 4162 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a 4163 // max VF that results in a dead vector loop. 4164 if (MaxTripCount > 0 && requiresScalarEpilogue(true)) 4165 MaxTripCount -= 1; 4166 4167 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && 4168 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { 4169 // If upper bound loop trip count (TC) is known at compile time there is no 4170 // point in choosing VF greater than TC (as done in the loop below). 
Select 4171 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is 4172 // scalable, we only fall back on a fixed VF when the TC is less than or 4173 // equal to the known number of lanes. 4174 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); 4175 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 4176 "exceeding the constant trip count: " 4177 << ClampedUpperTripCount << "\n"); 4178 return ElementCount::get( 4179 ClampedUpperTripCount, 4180 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); 4181 } 4182 4183 TargetTransformInfo::RegisterKind RegKind = 4184 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4185 : TargetTransformInfo::RGK_FixedWidthVector; 4186 ElementCount MaxVF = MaxVectorElementCount; 4187 if (MaximizeBandwidth || 4188 (MaximizeBandwidth.getNumOccurrences() == 0 && 4189 (TTI.shouldMaximizeVectorBandwidth(RegKind) || 4190 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { 4191 auto MaxVectorElementCountMaxBW = ElementCount::get( 4192 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), 4193 ComputeScalableMaxVF); 4194 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 4195 4196 // Collect all viable vectorization factors larger than the default MaxVF 4197 // (i.e. MaxVectorElementCount). 4198 SmallVector<ElementCount, 8> VFs; 4199 for (ElementCount VS = MaxVectorElementCount * 2; 4200 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 4201 VFs.push_back(VS); 4202 4203 // For each VF calculate its register usage. 4204 auto RUs = calculateRegisterUsage(VFs); 4205 4206 // Select the largest VF which doesn't require more registers than existing 4207 // ones. 4208 for (int I = RUs.size() - 1; I >= 0; --I) { 4209 const auto &MLU = RUs[I].MaxLocalUsers; 4210 if (all_of(MLU, [&](decltype(MLU.front()) &LU) { 4211 return LU.second <= TTI.getNumberOfRegisters(LU.first); 4212 })) { 4213 MaxVF = VFs[I]; 4214 break; 4215 } 4216 } 4217 if (ElementCount MinVF = 4218 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 4219 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 4220 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 4221 << ") with target's minimum: " << MinVF << '\n'); 4222 MaxVF = MinVF; 4223 } 4224 } 4225 4226 // Invalidate any widening decisions we might have made, in case the loop 4227 // requires prediction (decided later), but we have already made some 4228 // load/store widening decisions. 4229 invalidateCostModelingDecisions(); 4230 } 4231 return MaxVF; 4232 } 4233 4234 /// Convenience function that returns the value of vscale_range iff 4235 /// vscale_range.min == vscale_range.max or otherwise returns the value 4236 /// returned by the corresponding TTI method. 4237 static std::optional<unsigned> 4238 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 4239 const Function *Fn = L->getHeader()->getParent(); 4240 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 4241 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 4242 auto Min = Attr.getVScaleRangeMin(); 4243 auto Max = Attr.getVScaleRangeMax(); 4244 if (Max && Min == Max) 4245 return Max; 4246 } 4247 4248 return TTI.getVScaleForTuning(); 4249 } 4250 4251 /// This function attempts to return a value that represents the vectorization 4252 /// factor at runtime. For fixed-width VFs we know this precisely at compile 4253 /// time, but for scalable VFs we calculate it based on an estimate of the 4254 /// vscale value. 
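/// For example (illustrative values only): a scalable VF of vscale x 4 with an
/// estimated vscale of 2 yields an estimate of 8 lanes, while a fixed VF is
/// returned unchanged.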
4255 static unsigned getEstimatedRuntimeVF(const Loop *L, 4256 const TargetTransformInfo &TTI, 4257 ElementCount VF) { 4258 unsigned EstimatedVF = VF.getKnownMinValue(); 4259 if (VF.isScalable()) 4260 if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI)) 4261 EstimatedVF *= *VScale; 4262 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); 4263 return EstimatedVF; 4264 } 4265 4266 bool LoopVectorizationPlanner::isMoreProfitable( 4267 const VectorizationFactor &A, const VectorizationFactor &B, 4268 const unsigned MaxTripCount) const { 4269 InstructionCost CostA = A.Cost; 4270 InstructionCost CostB = B.Cost; 4271 4272 // Improve estimate for the vector width if it is scalable. 4273 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 4274 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 4275 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 4276 if (A.Width.isScalable()) 4277 EstimatedWidthA *= *VScale; 4278 if (B.Width.isScalable()) 4279 EstimatedWidthB *= *VScale; 4280 } 4281 4282 // Assume vscale may be larger than 1 (or the value being tuned for), 4283 // so that scalable vectorization is slightly favorable over fixed-width 4284 // vectorization. 4285 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && 4286 A.Width.isScalable() && !B.Width.isScalable(); 4287 4288 auto CmpFn = [PreferScalable](const InstructionCost &LHS, 4289 const InstructionCost &RHS) { 4290 return PreferScalable ? LHS <= RHS : LHS < RHS; 4291 }; 4292 4293 // To avoid the need for FP division: 4294 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB) 4295 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA) 4296 if (!MaxTripCount) 4297 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); 4298 4299 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 4300 InstructionCost VectorCost, 4301 InstructionCost ScalarCost) { 4302 // If the trip count is a known (possibly small) constant, the trip count 4303 // will be rounded up to an integer number of iterations under 4304 // FoldTailByMasking. The total cost in that case will be 4305 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 4306 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 4307 // some extra overheads, but for the purpose of comparing the costs of 4308 // different VFs we can use this to compare the total loop-body cost 4309 // expected after vectorization. 
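    // Illustrative example: with MaxTripCount = 10 and VF = 4, folding the
    // tail costs VectorCost * ceil(10/4) = VectorCost * 3, while using a
    // scalar epilogue costs VectorCost * 2 + ScalarCost * 2.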
4310 if (CM.foldTailByMasking()) 4311 return VectorCost * divideCeil(MaxTripCount, VF); 4312 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF); 4313 }; 4314 4315 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); 4316 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost); 4317 return CmpFn(RTCostA, RTCostB); 4318 } 4319 4320 bool LoopVectorizationPlanner::isMoreProfitable( 4321 const VectorizationFactor &A, const VectorizationFactor &B) const { 4322 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); 4323 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount); 4324 } 4325 4326 void LoopVectorizationPlanner::emitInvalidCostRemarks( 4327 OptimizationRemarkEmitter *ORE) { 4328 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>; 4329 SmallVector<RecipeVFPair> InvalidCosts; 4330 for (const auto &Plan : VPlans) { 4331 for (ElementCount VF : Plan->vectorFactors()) { 4332 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), 4333 CM, CM.CostKind); 4334 precomputeCosts(*Plan, VF, CostCtx); 4335 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); 4336 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4337 for (auto &R : *VPBB) { 4338 if (!R.cost(VF, CostCtx).isValid()) 4339 InvalidCosts.emplace_back(&R, VF); 4340 } 4341 } 4342 } 4343 } 4344 if (InvalidCosts.empty()) 4345 return; 4346 4347 // Emit a report of VFs with invalid costs in the loop. 4348 4349 // Group the remarks per recipe, keeping the recipe order from InvalidCosts. 4350 DenseMap<VPRecipeBase *, unsigned> Numbering; 4351 unsigned I = 0; 4352 for (auto &Pair : InvalidCosts) 4353 if (!Numbering.count(Pair.first)) 4354 Numbering[Pair.first] = I++; 4355 4356 // Sort the list, first on recipe(number) then on VF. 4357 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) { 4358 if (Numbering[A.first] != Numbering[B.first]) 4359 return Numbering[A.first] < Numbering[B.first]; 4360 const auto &LHS = A.second; 4361 const auto &RHS = B.second; 4362 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 4363 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 4364 }); 4365 4366 // For a list of ordered recipe-VF pairs: 4367 // [(load, VF1), (load, VF2), (store, VF1)] 4368 // group the recipes together to emit separate remarks for: 4369 // load (VF1, VF2) 4370 // store (VF1) 4371 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts); 4372 auto Subset = ArrayRef<RecipeVFPair>(); 4373 do { 4374 if (Subset.empty()) 4375 Subset = Tail.take_front(1); 4376 4377 VPRecipeBase *R = Subset.front().first; 4378 4379 unsigned Opcode = 4380 TypeSwitch<const VPRecipeBase *, unsigned>(R) 4381 .Case<VPHeaderPHIRecipe>( 4382 [](const auto *R) { return Instruction::PHI; }) 4383 .Case<VPWidenSelectRecipe>( 4384 [](const auto *R) { return Instruction::Select; }) 4385 .Case<VPWidenStoreRecipe>( 4386 [](const auto *R) { return Instruction::Store; }) 4387 .Case<VPWidenLoadRecipe>( 4388 [](const auto *R) { return Instruction::Load; }) 4389 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>( 4390 [](const auto *R) { return Instruction::Call; }) 4391 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe, 4392 VPWidenCastRecipe>( 4393 [](const auto *R) { return R->getOpcode(); }) 4394 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) { 4395 return R->getStoredValues().empty() ? 
Instruction::Load 4396 : Instruction::Store; 4397 }); 4398 4399 // If the next recipe is different, or if there are no other pairs, 4400 // emit a remark for the collated subset. e.g. 4401 // [(load, VF1), (load, VF2))] 4402 // to emit: 4403 // remark: invalid costs for 'load' at VF=(VF1, VF2) 4404 if (Subset == Tail || Tail[Subset.size()].first != R) { 4405 std::string OutString; 4406 raw_string_ostream OS(OutString); 4407 assert(!Subset.empty() && "Unexpected empty range"); 4408 OS << "Recipe with invalid costs prevented vectorization at VF=("; 4409 for (const auto &Pair : Subset) 4410 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; 4411 OS << "):"; 4412 if (Opcode == Instruction::Call) { 4413 StringRef Name = ""; 4414 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) { 4415 Name = Int->getIntrinsicName(); 4416 } else { 4417 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R); 4418 Function *CalledFn = 4419 WidenCall ? WidenCall->getCalledScalarFunction() 4420 : cast<Function>(R->getOperand(R->getNumOperands() - 1) 4421 ->getLiveInIRValue()); 4422 Name = CalledFn->getName(); 4423 } 4424 OS << " call to " << Name; 4425 } else 4426 OS << " " << Instruction::getOpcodeName(Opcode); 4427 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr, 4428 R->getDebugLoc()); 4429 Tail = Tail.drop_front(Subset.size()); 4430 Subset = {}; 4431 } else 4432 // Grow the subset by one element 4433 Subset = Tail.take_front(Subset.size() + 1); 4434 } while (!Tail.empty()); 4435 } 4436 4437 /// Check if any recipe of \p Plan will generate a vector value, which will be 4438 /// assigned a vector register. 4439 static bool willGenerateVectors(VPlan &Plan, ElementCount VF, 4440 const TargetTransformInfo &TTI) { 4441 assert(VF.isVector() && "Checking a scalar VF?"); 4442 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); 4443 DenseSet<VPRecipeBase *> EphemeralRecipes; 4444 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes); 4445 // Set of already visited types. 4446 DenseSet<Type *> Visited; 4447 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( 4448 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { 4449 for (VPRecipeBase &R : *VPBB) { 4450 if (EphemeralRecipes.contains(&R)) 4451 continue; 4452 // Continue early if the recipe is considered to not produce a vector 4453 // result. Note that this includes VPInstruction where some opcodes may 4454 // produce a vector, to preserve existing behavior as VPInstructions model 4455 // aspects not directly mapped to existing IR instructions. 
4456       switch (R.getVPDefID()) {
4457       case VPDef::VPDerivedIVSC:
4458       case VPDef::VPScalarIVStepsSC:
4459       case VPDef::VPScalarCastSC:
4460       case VPDef::VPReplicateSC:
4461       case VPDef::VPInstructionSC:
4462       case VPDef::VPCanonicalIVPHISC:
4463       case VPDef::VPVectorPointerSC:
4464       case VPDef::VPReverseVectorPointerSC:
4465       case VPDef::VPExpandSCEVSC:
4466       case VPDef::VPEVLBasedIVPHISC:
4467       case VPDef::VPPredInstPHISC:
4468       case VPDef::VPBranchOnMaskSC:
4469         continue;
4470       case VPDef::VPReductionSC:
4471       case VPDef::VPActiveLaneMaskPHISC:
4472       case VPDef::VPWidenCallSC:
4473       case VPDef::VPWidenCanonicalIVSC:
4474       case VPDef::VPWidenCastSC:
4475       case VPDef::VPWidenGEPSC:
4476       case VPDef::VPWidenIntrinsicSC:
4477       case VPDef::VPWidenSC:
4478       case VPDef::VPWidenSelectSC:
4479       case VPDef::VPBlendSC:
4480       case VPDef::VPFirstOrderRecurrencePHISC:
4481       case VPDef::VPWidenPHISC:
4482       case VPDef::VPWidenIntOrFpInductionSC:
4483       case VPDef::VPWidenPointerInductionSC:
4484       case VPDef::VPReductionPHISC:
4485       case VPDef::VPInterleaveSC:
4486       case VPDef::VPWidenLoadEVLSC:
4487       case VPDef::VPWidenLoadSC:
4488       case VPDef::VPWidenStoreEVLSC:
4489       case VPDef::VPWidenStoreSC:
4490         break;
4491       default:
4492         llvm_unreachable("unhandled recipe");
4493       }
4494
4495       auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4496         Type *VectorTy = toVectorTy(ScalarTy, VF);
4497         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4498         if (!NumLegalParts)
4499           return false;
4500         if (VF.isScalable()) {
4501           // <vscale x 1 x iN> is assumed to be profitable over iN because
4502           // scalable registers are a distinct register class from scalar
4503           // ones. If we ever find a target which wants to lower scalable
4504           // vectors back to scalars, we'll need to update this code to
4505           // explicitly ask TTI about the register class uses for each part.
4506           return NumLegalParts <= VF.getKnownMinValue();
4507         }
4508         // Two or more parts that share a register are vectorized.
4509         return NumLegalParts < VF.getKnownMinValue();
4510       };
4511
4512       // If the recipe has no defs and is not a store (e.g., a branch), continue - no value to check.
4513       if (R.getNumDefinedValues() == 0 &&
4514           !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4515               &R))
4516         continue;
4517       // For multi-def recipes (currently only interleaved loads), it suffices
4518       // to check only the first def.
4519       // For stores, check their stored value; for interleaved stores it
4520       // suffices to check only the first stored value. In all cases this is
4521       // the second operand.
4522       VPValue *ToCheck =
4523           R.getNumDefinedValues() >= 1 ?
R.getVPValue(0) : R.getOperand(1); 4524 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck); 4525 if (!Visited.insert({ScalarTy}).second) 4526 continue; 4527 if (WillWiden(ScalarTy)) 4528 return true; 4529 } 4530 } 4531 4532 return false; 4533 } 4534 4535 #ifndef NDEBUG 4536 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { 4537 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); 4538 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 4539 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 4540 assert(any_of(VPlans, 4541 [](std::unique_ptr<VPlan> &P) { 4542 return P->hasVF(ElementCount::getFixed(1)); 4543 }) && 4544 "Expected Scalar VF to be a candidate"); 4545 4546 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 4547 ExpectedCost); 4548 VectorizationFactor ChosenFactor = ScalarCost; 4549 4550 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 4551 if (ForceVectorization && 4552 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) { 4553 // Ignore scalar width, because the user explicitly wants vectorization. 4554 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 4555 // evaluation. 4556 ChosenFactor.Cost = InstructionCost::getMax(); 4557 } 4558 4559 for (auto &P : VPlans) { 4560 for (ElementCount VF : P->vectorFactors()) { 4561 // The cost for scalar VF=1 is already calculated, so ignore it. 4562 if (VF.isScalar()) 4563 continue; 4564 4565 InstructionCost C = CM.expectedCost(VF); 4566 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); 4567 4568 unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width); 4569 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF 4570 << " costs: " << (Candidate.Cost / Width)); 4571 if (VF.isScalable()) 4572 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 4573 << getVScaleForTuning(OrigLoop, TTI).value_or(1) 4574 << ")"); 4575 LLVM_DEBUG(dbgs() << ".\n"); 4576 4577 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { 4578 LLVM_DEBUG( 4579 dbgs() 4580 << "LV: Not considering vector loop of width " << VF 4581 << " because it will not generate any vector instructions.\n"); 4582 continue; 4583 } 4584 4585 if (isMoreProfitable(Candidate, ChosenFactor)) 4586 ChosenFactor = Candidate; 4587 } 4588 } 4589 4590 if (!EnableCondStoresVectorization && CM.hasPredStores()) { 4591 reportVectorizationFailure( 4592 "There are conditional stores.", 4593 "store that is conditionally executed prevents vectorization", 4594 "ConditionalStore", ORE, OrigLoop); 4595 ChosenFactor = ScalarCost; 4596 } 4597 4598 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 4599 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() 4600 << "LV: Vectorization seems to be not beneficial, " 4601 << "but was forced by a user.\n"); 4602 return ChosenFactor; 4603 } 4604 #endif 4605 4606 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( 4607 ElementCount VF) const { 4608 // Cross iteration phis such as reductions need special handling and are 4609 // currently unsupported. 4610 if (any_of(OrigLoop->getHeader()->phis(), 4611 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) 4612 return false; 4613 4614 // Phis with uses outside of the loop require special handling and are 4615 // currently unsupported. 4616 for (const auto &Entry : Legal->getInductionVars()) { 4617 // Look for uses of the value of the induction at the last iteration. 
4618     Value *PostInc =
4619         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4620     for (User *U : PostInc->users())
4621       if (!OrigLoop->contains(cast<Instruction>(U)))
4622         return false;
4623     // Look for uses of the penultimate value of the induction.
4624     for (User *U : Entry.first->users())
4625       if (!OrigLoop->contains(cast<Instruction>(U)))
4626         return false;
4627   }
4628
4629   // Epilogue vectorization code has not been audited to ensure it handles
4630   // non-latch exits properly. It may be fine, but it needs to be audited and
4631   // tested.
4632   // TODO: Add support for loops with an early exit.
4633   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4634     return false;
4635
4636   return true;
4637 }
4638
4639 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4640     const ElementCount VF, const unsigned IC) const {
4641   // FIXME: We need a much better cost-model to take different parameters such
4642   // as register pressure, code size increase and cost of extra branches into
4643   // account. For now we apply a very crude heuristic and only consider loops
4644   // with vectorization factors larger than a certain value.
4645
4646   // Allow the target to opt out entirely.
4647   if (!TTI.preferEpilogueVectorization())
4648     return false;
4649
4650   // We also consider epilogue vectorization unprofitable for targets that don't
4651   // consider interleaving beneficial (e.g. MVE).
4652   if (TTI.getMaxInterleaveFactor(VF) <= 1)
4653     return false;
4654
4655   // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4656   // VFs when deciding profitability.
4657   // See related "TODO: extend to support scalable VFs." in
4658   // selectEpilogueVectorizationFactor.
4659   unsigned Multiplier = VF.isFixed() ? IC : 1;
4660   unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4661                                 ? EpilogueVectorizationMinVF
4662                                 : TTI.getEpilogueVectorizationMinVF();
4663   return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4664 }
4665
4666 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4667     const ElementCount MainLoopVF, unsigned IC) {
4668   VectorizationFactor Result = VectorizationFactor::Disabled();
4669   if (!EnableEpilogueVectorization) {
4670     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4671     return Result;
4672   }
4673
4674   if (!CM.isScalarEpilogueAllowed()) {
4675     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4676                          "epilogue is allowed.\n");
4677     return Result;
4678   }
4679
4680   // Not really a cost consideration, but check for unsupported cases here to
4681   // simplify the logic.
4682 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 4683 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 4684 "is not a supported candidate.\n"); 4685 return Result; 4686 } 4687 4688 if (EpilogueVectorizationForceVF > 1) { 4689 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 4690 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 4691 if (hasPlanWithVF(ForcedEC)) 4692 return {ForcedEC, 0, 0}; 4693 4694 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 4695 "viable.\n"); 4696 return Result; 4697 } 4698 4699 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 4700 OrigLoop->getHeader()->getParent()->hasMinSize()) { 4701 LLVM_DEBUG( 4702 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 4703 return Result; 4704 } 4705 4706 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) { 4707 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 4708 "this loop\n"); 4709 return Result; 4710 } 4711 4712 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 4713 // the main loop handles 8 lanes per iteration. We could still benefit from 4714 // vectorizing the epilogue loop with VF=4. 4715 ElementCount EstimatedRuntimeVF = 4716 ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF)); 4717 4718 ScalarEvolution &SE = *PSE.getSE(); 4719 Type *TCType = Legal->getWidestInductionType(); 4720 const SCEV *RemainingIterations = nullptr; 4721 unsigned MaxTripCount = 0; 4722 for (auto &NextVF : ProfitableVFs) { 4723 // Skip candidate VFs without a corresponding VPlan. 4724 if (!hasPlanWithVF(NextVF.Width)) 4725 continue; 4726 4727 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable 4728 // vectors) or > the VF of the main loop (fixed vectors). 4729 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 4730 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 4731 (NextVF.Width.isScalable() && 4732 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) || 4733 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && 4734 ElementCount::isKnownGT(NextVF.Width, MainLoopVF))) 4735 continue; 4736 4737 // If NextVF is greater than the number of remaining iterations, the 4738 // epilogue loop would be dead. Skip such factors. 4739 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 4740 // TODO: extend to support scalable VFs. 
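      // Illustrative example (fixed VFs only): with a trip count of 100,
      // MainLoopVF = 8 and IC = 2, the main vector loop covers iterations in
      // multiples of 16, leaving 100 % 16 = 4 remaining iterations, so
      // epilogue VFs wider than 4 are skipped below.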
4741 if (!RemainingIterations) { 4742 const SCEV *TC = vputils::getSCEVExprForVPValue( 4743 getPlanFor(NextVF.Width).getTripCount(), SE); 4744 assert(!isa<SCEVCouldNotCompute>(TC) && 4745 "Trip count SCEV must be computable"); 4746 RemainingIterations = SE.getURemExpr( 4747 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 4748 MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1; 4749 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, 4750 SE.getConstant(TCType, MaxTripCount))) { 4751 MaxTripCount = 4752 SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); 4753 } 4754 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " 4755 << MaxTripCount << "\n"); 4756 } 4757 if (SE.isKnownPredicate( 4758 CmpInst::ICMP_UGT, 4759 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 4760 RemainingIterations)) 4761 continue; 4762 } 4763 4764 if (Result.Width.isScalar() || 4765 isMoreProfitable(NextVF, Result, MaxTripCount)) 4766 Result = NextVF; 4767 } 4768 4769 if (Result != VectorizationFactor::Disabled()) 4770 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 4771 << Result.Width << "\n"); 4772 return Result; 4773 } 4774 4775 std::pair<unsigned, unsigned> 4776 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 4777 unsigned MinWidth = -1U; 4778 unsigned MaxWidth = 8; 4779 const DataLayout &DL = TheFunction->getDataLayout(); 4780 // For in-loop reductions, no element types are added to ElementTypesInLoop 4781 // if there are no loads/stores in the loop. In this case, check through the 4782 // reduction variables to determine the maximum width. 4783 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 4784 // Reset MaxWidth so that we can find the smallest type used by recurrences 4785 // in the loop. 4786 MaxWidth = -1U; 4787 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 4788 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 4789 // When finding the min width used by the recurrence we need to account 4790 // for casts on the input operands of the recurrence. 4791 MaxWidth = std::min<unsigned>( 4792 MaxWidth, std::min<unsigned>( 4793 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 4794 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 4795 } 4796 } else { 4797 for (Type *T : ElementTypesInLoop) { 4798 MinWidth = std::min<unsigned>( 4799 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 4800 MaxWidth = std::max<unsigned>( 4801 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 4802 } 4803 } 4804 return {MinWidth, MaxWidth}; 4805 } 4806 4807 void LoopVectorizationCostModel::collectElementTypesForWidening() { 4808 ElementTypesInLoop.clear(); 4809 // For each block. 4810 for (BasicBlock *BB : TheLoop->blocks()) { 4811 // For each instruction in the loop. 4812 for (Instruction &I : BB->instructionsWithoutDebug()) { 4813 Type *T = I.getType(); 4814 4815 // Skip ignored values. 4816 if (ValuesToIgnore.count(&I)) 4817 continue; 4818 4819 // Only examine Loads, Stores and PHINodes. 4820 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 4821 continue; 4822 4823 // Examine PHI nodes that are reduction variables. Update the type to 4824 // account for the recurrence type. 
4825 if (auto *PN = dyn_cast<PHINode>(&I)) { 4826 if (!Legal->isReductionVariable(PN)) 4827 continue; 4828 const RecurrenceDescriptor &RdxDesc = 4829 Legal->getReductionVars().find(PN)->second; 4830 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 4831 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 4832 RdxDesc.getRecurrenceType(), 4833 TargetTransformInfo::ReductionFlags())) 4834 continue; 4835 T = RdxDesc.getRecurrenceType(); 4836 } 4837 4838 // Examine the stored values. 4839 if (auto *ST = dyn_cast<StoreInst>(&I)) 4840 T = ST->getValueOperand()->getType(); 4841 4842 assert(T->isSized() && 4843 "Expected the load/store/recurrence type to be sized"); 4844 4845 ElementTypesInLoop.insert(T); 4846 } 4847 } 4848 } 4849 4850 unsigned 4851 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 4852 InstructionCost LoopCost) { 4853 // -- The interleave heuristics -- 4854 // We interleave the loop in order to expose ILP and reduce the loop overhead. 4855 // There are many micro-architectural considerations that we can't predict 4856 // at this level. For example, frontend pressure (on decode or fetch) due to 4857 // code size, or the number and capabilities of the execution ports. 4858 // 4859 // We use the following heuristics to select the interleave count: 4860 // 1. If the code has reductions, then we interleave to break the cross 4861 // iteration dependency. 4862 // 2. If the loop is really small, then we interleave to reduce the loop 4863 // overhead. 4864 // 3. We don't interleave if we think that we will spill registers to memory 4865 // due to the increased register pressure. 4866 4867 if (!isScalarEpilogueAllowed()) 4868 return 1; 4869 4870 // Do not interleave if EVL is preferred and no User IC is specified. 4871 if (foldTailWithEVL()) { 4872 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " 4873 "Unroll factor forced to be 1.\n"); 4874 return 1; 4875 } 4876 4877 // We used the distance for the interleave count. 4878 if (!Legal->isSafeForAnyVectorWidth()) 4879 return 1; 4880 4881 // We don't attempt to perform interleaving for loops with uncountable early 4882 // exits because the VPInstruction::AnyOf code cannot currently handle 4883 // multiple parts. 4884 if (Legal->hasUncountableEarlyExit()) 4885 return 1; 4886 4887 auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); 4888 const bool HasReductions = !Legal->getReductionVars().empty(); 4889 4890 // If we did not calculate the cost for VF (because the user selected the VF) 4891 // then we calculate the cost of VF here. 4892 if (LoopCost == 0) { 4893 LoopCost = expectedCost(VF); 4894 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 4895 4896 // Loop body is free and there is no need for interleaving. 4897 if (LoopCost == 0) 4898 return 1; 4899 } 4900 4901 RegisterUsage R = calculateRegisterUsage({VF})[0]; 4902 // We divide by these constants so assume that we have at least one 4903 // instruction that uses at least one register. 4904 for (auto &Pair : R.MaxLocalUsers) { 4905 Pair.second = std::max(Pair.second, 1U); 4906 } 4907 4908 // We calculate the interleave count using the following formula. 4909 // Subtract the number of loop invariants from the number of available 4910 // registers. These registers are used by all of the interleaved instances. 4911 // Next, divide the remaining registers by the number of registers that is 4912 // required by the loop, in order to estimate how many parallel instances 4913 // fit without causing spills. 
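  // As an illustrative example, with 32 available registers of a class, 4 of
  // them holding loop-invariant values and a maximum of 7 registers live per
  // instance, roughly (32 - 4) / 7 = 4 interleaved instances fit.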
All of this is rounded down if necessary to be 4914 // a power of two. We want power of two interleave count to simplify any 4915 // addressing operations or alignment considerations. 4916 // We also want power of two interleave counts to ensure that the induction 4917 // variable of the vector loop wraps to zero, when tail is folded by masking; 4918 // this currently happens when OptForSize, in which case IC is set to 1 above. 4919 unsigned IC = UINT_MAX; 4920 4921 for (const auto &Pair : R.MaxLocalUsers) { 4922 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first); 4923 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 4924 << " registers of " 4925 << TTI.getRegisterClassName(Pair.first) 4926 << " register class\n"); 4927 if (VF.isScalar()) { 4928 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 4929 TargetNumRegisters = ForceTargetNumScalarRegs; 4930 } else { 4931 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 4932 TargetNumRegisters = ForceTargetNumVectorRegs; 4933 } 4934 unsigned MaxLocalUsers = Pair.second; 4935 unsigned LoopInvariantRegs = 0; 4936 if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end()) 4937 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first]; 4938 4939 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 4940 MaxLocalUsers); 4941 // Don't count the induction variable as interleaved. 4942 if (EnableIndVarRegisterHeur) { 4943 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 4944 std::max(1U, (MaxLocalUsers - 1))); 4945 } 4946 4947 IC = std::min(IC, TmpIC); 4948 } 4949 4950 // Clamp the interleave ranges to reasonable counts. 4951 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 4952 4953 // Check if the user has overridden the max. 4954 if (VF.isScalar()) { 4955 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 4956 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 4957 } else { 4958 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 4959 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 4960 } 4961 4962 unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF); 4963 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4964 if (KnownTC > 0) { 4965 // At least one iteration must be scalar when this constraint holds. So the 4966 // maximum available iterations for interleaving is one less. 4967 unsigned AvailableTC = 4968 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC; 4969 4970 // If trip count is known we select between two prospective ICs, where 4971 // 1) the aggressive IC is capped by the trip count divided by VF 4972 // 2) the conservative IC is capped by the trip count divided by (VF * 2) 4973 // The final IC is selected in a way that the epilogue loop trip count is 4974 // minimized while maximizing the IC itself, so that we either run the 4975 // vector loop at least once if it generates a small epilogue loop, or else 4976 // we run the vector loop at least twice. 
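    // Illustrative example: with AvailableTC = 64, EstimatedVF = 8 and a
    // target maximum of 8, the bounds below are 8 and 4; both leave a
    // zero-iteration scalar tail, so the larger count is kept.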
4977 4978 unsigned InterleaveCountUB = bit_floor( 4979 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount))); 4980 unsigned InterleaveCountLB = bit_floor(std::max( 4981 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); 4982 MaxInterleaveCount = InterleaveCountLB; 4983 4984 if (InterleaveCountUB != InterleaveCountLB) { 4985 unsigned TailTripCountUB = 4986 (AvailableTC % (EstimatedVF * InterleaveCountUB)); 4987 unsigned TailTripCountLB = 4988 (AvailableTC % (EstimatedVF * InterleaveCountLB)); 4989 // If both produce same scalar tail, maximize the IC to do the same work 4990 // in fewer vector loop iterations 4991 if (TailTripCountUB == TailTripCountLB) 4992 MaxInterleaveCount = InterleaveCountUB; 4993 } 4994 } else if (BestKnownTC && *BestKnownTC > 0) { 4995 // At least one iteration must be scalar when this constraint holds. So the 4996 // maximum available iterations for interleaving is one less. 4997 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) 4998 ? (*BestKnownTC) - 1 4999 : *BestKnownTC; 5000 5001 // If trip count is an estimated compile time constant, limit the 5002 // IC to be capped by the trip count divided by VF * 2, such that the vector 5003 // loop runs at least twice to make interleaving seem profitable when there 5004 // is an epilogue loop present. Since exact Trip count is not known we 5005 // choose to be conservative in our IC estimate. 5006 MaxInterleaveCount = bit_floor(std::max( 5007 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); 5008 } 5009 5010 assert(MaxInterleaveCount > 0 && 5011 "Maximum interleave count must be greater than 0"); 5012 5013 // Clamp the calculated IC to be between the 1 and the max interleave count 5014 // that the target and trip count allows. 5015 if (IC > MaxInterleaveCount) 5016 IC = MaxInterleaveCount; 5017 else 5018 // Make sure IC is greater than 0. 5019 IC = std::max(1u, IC); 5020 5021 assert(IC > 0 && "Interleave count must be greater than 0."); 5022 5023 // Interleave if we vectorized this loop and there is a reduction that could 5024 // benefit from interleaving. 5025 if (VF.isVector() && HasReductions) { 5026 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5027 return IC; 5028 } 5029 5030 // For any scalar loop that either requires runtime checks or predication we 5031 // are better off leaving this to the unroller. Note that if we've already 5032 // vectorized the loop we will have done the runtime check and so interleaving 5033 // won't require further checks. 5034 bool ScalarInterleavingRequiresPredication = 5035 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5036 return Legal->blockNeedsPredication(BB); 5037 })); 5038 bool ScalarInterleavingRequiresRuntimePointerCheck = 5039 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5040 5041 // We want to interleave small loops in order to reduce the loop overhead and 5042 // potentially expose ILP opportunities. 
5043 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5044 << "LV: IC is " << IC << '\n' 5045 << "LV: VF is " << VF << '\n'); 5046 const bool AggressivelyInterleaveReductions = 5047 TTI.enableAggressiveInterleaving(HasReductions); 5048 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5049 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5050 // We assume that the cost overhead is 1 and we use the cost model 5051 // to estimate the cost of the loop and interleave until the cost of the 5052 // loop overhead is about 5% of the cost of the loop. 5053 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5054 SmallLoopCost / *LoopCost.getValue())); 5055 5056 // Interleave until store/load ports (estimated by max interleave count) are 5057 // saturated. 5058 unsigned NumStores = Legal->getNumStores(); 5059 unsigned NumLoads = Legal->getNumLoads(); 5060 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5061 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5062 5063 // There is little point in interleaving for reductions containing selects 5064 // and compares when VF=1 since it may just create more overhead than it's 5065 // worth for loops with small trip counts. This is because we still have to 5066 // do the final reduction after the loop. 5067 bool HasSelectCmpReductions = 5068 HasReductions && 5069 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5070 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5071 RecurKind RK = RdxDesc.getRecurrenceKind(); 5072 return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || 5073 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK); 5074 }); 5075 if (HasSelectCmpReductions) { 5076 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5077 return 1; 5078 } 5079 5080 // If we have a scalar reduction (vector reductions are already dealt with 5081 // by this point), we can increase the critical path length if the loop 5082 // we're interleaving is inside another loop. For tree-wise reductions 5083 // set the limit to 2, and for ordered reductions it's best to disable 5084 // interleaving entirely. 5085 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5086 bool HasOrderedReductions = 5087 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5088 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5089 return RdxDesc.isOrdered(); 5090 }); 5091 if (HasOrderedReductions) { 5092 LLVM_DEBUG( 5093 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5094 return 1; 5095 } 5096 5097 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5098 SmallIC = std::min(SmallIC, F); 5099 StoresIC = std::min(StoresIC, F); 5100 LoadsIC = std::min(LoadsIC, F); 5101 } 5102 5103 if (EnableLoadStoreRuntimeInterleave && 5104 std::max(StoresIC, LoadsIC) > SmallIC) { 5105 LLVM_DEBUG( 5106 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5107 return std::max(StoresIC, LoadsIC); 5108 } 5109 5110 // If there are scalar reductions and TTI has enabled aggressive 5111 // interleaving for reductions, we will interleave to expose ILP. 5112 if (VF.isScalar() && AggressivelyInterleaveReductions) { 5113 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5114 // Interleave no less than SmallIC but not as aggressive as the normal IC 5115 // to satisfy the rare situation when resources are too limited. 
5116 return std::max(IC / 2, SmallIC); 5117 } 5118 5119 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5120 return SmallIC; 5121 } 5122 5123 // Interleave if this is a large loop (small loops are already dealt with by 5124 // this point) that could benefit from interleaving. 5125 if (AggressivelyInterleaveReductions) { 5126 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5127 return IC; 5128 } 5129 5130 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5131 return 1; 5132 } 5133 5134 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5135 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5136 // This function calculates the register usage by measuring the highest number 5137 // of values that are alive at a single location. Obviously, this is a very 5138 // rough estimation. We scan the loop in a topological order in order and 5139 // assign a number to each instruction. We use RPO to ensure that defs are 5140 // met before their users. We assume that each instruction that has in-loop 5141 // users starts an interval. We record every time that an in-loop value is 5142 // used, so we have a list of the first and last occurrences of each 5143 // instruction. Next, we transpose this data structure into a multi map that 5144 // holds the list of intervals that *end* at a specific location. This multi 5145 // map allows us to perform a linear search. We scan the instructions linearly 5146 // and record each time that a new interval starts, by placing it in a set. 5147 // If we find this value in the multi-map then we remove it from the set. 5148 // The max register usage is the maximum size of the set. 5149 // We also search for instructions that are defined outside the loop, but are 5150 // used inside the loop. We need this number separately from the max-interval 5151 // usage number because when we unroll, loop-invariant values do not take 5152 // more register. 5153 LoopBlocksDFS DFS(TheLoop); 5154 DFS.perform(LI); 5155 5156 RegisterUsage RU; 5157 5158 // Each 'key' in the map opens a new interval. The values 5159 // of the map are the index of the 'last seen' usage of the 5160 // instruction that is the key. 5161 using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>; 5162 5163 // Maps instruction to its index. 5164 SmallVector<Instruction *, 64> IdxToInstr; 5165 // Marks the end of each interval. 5166 IntervalMap EndPoint; 5167 // Saves the list of instruction indices that are used in the loop. 5168 SmallPtrSet<Instruction *, 8> Ends; 5169 // Saves the list of values that are used in the loop but are defined outside 5170 // the loop (not including non-instruction values such as arguments and 5171 // constants). 5172 SmallSetVector<Instruction *, 8> LoopInvariants; 5173 5174 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5175 for (Instruction &I : BB->instructionsWithoutDebug()) { 5176 IdxToInstr.push_back(&I); 5177 5178 // Save the end location of each USE. 5179 for (Value *U : I.operands()) { 5180 auto *Instr = dyn_cast<Instruction>(U); 5181 5182 // Ignore non-instruction values such as arguments, constants, etc. 5183 // FIXME: Might need some motivation why these values are ignored. If 5184 // for example an argument is used inside the loop it will increase the 5185 // register pressure (so shouldn't we add it to LoopInvariants). 5186 if (!Instr) 5187 continue; 5188 5189 // If this instruction is outside the loop then record it and continue. 
5190 if (!TheLoop->contains(Instr)) { 5191 LoopInvariants.insert(Instr); 5192 continue; 5193 } 5194 5195 // Overwrite previous end points. 5196 EndPoint[Instr] = IdxToInstr.size(); 5197 Ends.insert(Instr); 5198 } 5199 } 5200 } 5201 5202 // Saves the list of intervals that end with the index in 'key'. 5203 using InstrList = SmallVector<Instruction *, 2>; 5204 SmallDenseMap<unsigned, InstrList, 16> TransposeEnds; 5205 5206 // Transpose the EndPoints to a list of values that end at each index. 5207 for (auto &Interval : EndPoint) 5208 TransposeEnds[Interval.second].push_back(Interval.first); 5209 5210 SmallPtrSet<Instruction *, 8> OpenIntervals; 5211 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5212 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5213 5214 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5215 5216 const auto &TTICapture = TTI; 5217 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5218 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) || 5219 (VF.isScalable() && 5220 !TTICapture.isElementTypeLegalForScalableVector(Ty))) 5221 return 0; 5222 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5223 }; 5224 5225 for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) { 5226 Instruction *I = IdxToInstr[Idx]; 5227 5228 // Remove all of the instructions that end at this location. 5229 InstrList &List = TransposeEnds[Idx]; 5230 for (Instruction *ToRemove : List) 5231 OpenIntervals.erase(ToRemove); 5232 5233 // Ignore instructions that are never used within the loop. 5234 if (!Ends.count(I)) 5235 continue; 5236 5237 // Skip ignored values. 5238 if (ValuesToIgnore.count(I)) 5239 continue; 5240 5241 collectInLoopReductions(); 5242 5243 // For each VF find the maximum usage of registers. 5244 for (unsigned J = 0, E = VFs.size(); J < E; ++J) { 5245 // Count the number of registers used, per register class, given all open 5246 // intervals. 5247 // Note that elements in this SmallMapVector will be default constructed 5248 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 5249 // there is no previous entry for ClassID. 5250 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5251 5252 if (VFs[J].isScalar()) { 5253 for (auto *Inst : OpenIntervals) { 5254 unsigned ClassID = 5255 TTI.getRegisterClassForType(false, Inst->getType()); 5256 // FIXME: The target might use more than one register for the type 5257 // even in the scalar case. 5258 RegUsage[ClassID] += 1; 5259 } 5260 } else { 5261 collectUniformsAndScalars(VFs[J]); 5262 for (auto *Inst : OpenIntervals) { 5263 // Skip ignored values for VF > 1. 5264 if (VecValuesToIgnore.count(Inst)) 5265 continue; 5266 if (isScalarAfterVectorization(Inst, VFs[J])) { 5267 unsigned ClassID = 5268 TTI.getRegisterClassForType(false, Inst->getType()); 5269 // FIXME: The target might use more than one register for the type 5270 // even in the scalar case. 5271 RegUsage[ClassID] += 1; 5272 } else { 5273 unsigned ClassID = 5274 TTI.getRegisterClassForType(true, Inst->getType()); 5275 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]); 5276 } 5277 } 5278 } 5279 5280 for (const auto &Pair : RegUsage) { 5281 auto &Entry = MaxUsages[J][Pair.first]; 5282 Entry = std::max(Entry, Pair.second); 5283 } 5284 } 5285 5286 LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " 5287 << OpenIntervals.size() << '\n'); 5288 5289 // Add the current instruction to the list of open intervals. 
5290 OpenIntervals.insert(I); 5291 } 5292 5293 for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) { 5294 // Note that elements in this SmallMapVector will be default constructed 5295 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 5296 // there is no previous entry for ClassID. 5297 SmallMapVector<unsigned, unsigned, 4> Invariant; 5298 5299 for (auto *Inst : LoopInvariants) { 5300 // FIXME: The target might use more than one register for the type 5301 // even in the scalar case. 5302 bool IsScalar = all_of(Inst->users(), [&](User *U) { 5303 auto *I = cast<Instruction>(U); 5304 return TheLoop != LI->getLoopFor(I->getParent()) || 5305 isScalarAfterVectorization(I, VFs[Idx]); 5306 }); 5307 5308 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx]; 5309 unsigned ClassID = 5310 TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); 5311 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); 5312 } 5313 5314 LLVM_DEBUG({ 5315 dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n'; 5316 dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size() 5317 << " item\n"; 5318 for (const auto &pair : MaxUsages[Idx]) { 5319 dbgs() << "LV(REG): RegisterClass: " 5320 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5321 << " registers\n"; 5322 } 5323 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5324 << " item\n"; 5325 for (const auto &pair : Invariant) { 5326 dbgs() << "LV(REG): RegisterClass: " 5327 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5328 << " registers\n"; 5329 } 5330 }); 5331 5332 RU.LoopInvariantRegs = Invariant; 5333 RU.MaxLocalUsers = MaxUsages[Idx]; 5334 RUs[Idx] = RU; 5335 } 5336 5337 return RUs; 5338 } 5339 5340 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 5341 ElementCount VF) { 5342 // TODO: Cost model for emulated masked load/store is completely 5343 // broken. This hack guides the cost model to use an artificially 5344 // high enough value to practically disable vectorization with such 5345 // operations, except where previously deployed legality hack allowed 5346 // using very low cost values. This is to avoid regressions coming simply 5347 // from moving "masked load/store" check from legality to cost model. 5348 // Masked Load/Gather emulation was previously never allowed. 5349 // Limited number of Masked Store/Scatter emulation was allowed. 5350 assert((isPredicatedInst(I)) && 5351 "Expecting a scalar emulated instruction"); 5352 return isa<LoadInst>(I) || 5353 (isa<StoreInst>(I) && 5354 NumPredStores > NumberOfStoresToPredicate); 5355 } 5356 5357 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5358 // If we aren't vectorizing the loop, or if we've already collected the 5359 // instructions to scalarize, there's nothing to do. Collection may already 5360 // have occurred if we have a user-selected VF and are now computing the 5361 // expected cost for interleaving. 5362 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) 5363 return; 5364 5365 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5366 // not profitable to scalarize any instructions, the presence of VF in the 5367 // map will indicate that we've analyzed it already. 
5368 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5369 5370 PredicatedBBsAfterVectorization[VF].clear(); 5371 5372 // Find all the instructions that are scalar with predication in the loop and 5373 // determine if it would be better to not if-convert the blocks they are in. 5374 // If so, we also record the instructions to scalarize. 5375 for (BasicBlock *BB : TheLoop->blocks()) { 5376 if (!blockNeedsPredicationForAnyReason(BB)) 5377 continue; 5378 for (Instruction &I : *BB) 5379 if (isScalarWithPredication(&I, VF)) { 5380 ScalarCostsTy ScalarCosts; 5381 // Do not apply discount logic for: 5382 // 1. Scalars after vectorization, as there will only be a single copy 5383 // of the instruction. 5384 // 2. Scalable VF, as that would lead to invalid scalarization costs. 5385 // 3. Emulated masked memrefs, if a hacked cost is needed. 5386 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() && 5387 !useEmulatedMaskMemRefHack(&I, VF) && 5388 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) { 5389 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5390 // Check if we decided to scalarize a call. If so, update the widening 5391 // decision of the call to CM_Scalarize with the computed scalar cost. 5392 for (const auto &[I, _] : ScalarCosts) { 5393 auto *CI = dyn_cast<CallInst>(I); 5394 if (!CI || !CallWideningDecisions.contains({CI, VF})) 5395 continue; 5396 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize; 5397 CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI]; 5398 } 5399 } 5400 // Remember that BB will remain after vectorization. 5401 PredicatedBBsAfterVectorization[VF].insert(BB); 5402 for (auto *Pred : predecessors(BB)) { 5403 if (Pred->getSingleSuccessor() == BB) 5404 PredicatedBBsAfterVectorization[VF].insert(Pred); 5405 } 5406 } 5407 } 5408 } 5409 5410 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 5411 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 5412 assert(!isUniformAfterVectorization(PredInst, VF) && 5413 "Instruction marked uniform-after-vectorization will be predicated"); 5414 5415 // Initialize the discount to zero, meaning that the scalar version and the 5416 // vector version cost the same. 5417 InstructionCost Discount = 0; 5418 5419 // Holds instructions to analyze. The instructions we visit are mapped in 5420 // ScalarCosts. Those instructions are the ones that would be scalarized if 5421 // we find that the scalar version costs less. 5422 SmallVector<Instruction *, 8> Worklist; 5423 5424 // Returns true if the given instruction can be scalarized. 5425 auto CanBeScalarized = [&](Instruction *I) -> bool { 5426 // We only attempt to scalarize instructions forming a single-use chain 5427 // from the original predicated block that would otherwise be vectorized. 5428 // Although not strictly necessary, we give up on instructions we know will 5429 // already be scalar to avoid traversing chains that are unlikely to be 5430 // beneficial. 5431 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5432 isScalarAfterVectorization(I, VF)) 5433 return false; 5434 5435 // If the instruction is scalar with predication, it will be analyzed 5436 // separately. We ignore it within the context of PredInst. 5437 if (isScalarWithPredication(I, VF)) 5438 return false; 5439 5440 // If any of the instruction's operands are uniform after vectorization, 5441 // the instruction cannot be scalarized. This prevents, for example, a 5442 // masked load from being scalarized. 
5443 // 5444 // We assume we will only emit a value for lane zero of an instruction 5445 // marked uniform after vectorization, rather than VF identical values. 5446 // Thus, if we scalarize an instruction that uses a uniform, we would 5447 // create uses of values corresponding to the lanes we aren't emitting code 5448 // for. This behavior can be changed by allowing getScalarValue to clone 5449 // the lane zero values for uniforms rather than asserting. 5450 for (Use &U : I->operands()) 5451 if (auto *J = dyn_cast<Instruction>(U.get())) 5452 if (isUniformAfterVectorization(J, VF)) 5453 return false; 5454 5455 // Otherwise, we can scalarize the instruction. 5456 return true; 5457 }; 5458 5459 // Compute the expected cost discount from scalarizing the entire expression 5460 // feeding the predicated instruction. We currently only consider expressions 5461 // that are single-use instruction chains. 5462 Worklist.push_back(PredInst); 5463 while (!Worklist.empty()) { 5464 Instruction *I = Worklist.pop_back_val(); 5465 5466 // If we've already analyzed the instruction, there's nothing to do. 5467 if (ScalarCosts.contains(I)) 5468 continue; 5469 5470 // Compute the cost of the vector instruction. Note that this cost already 5471 // includes the scalarization overhead of the predicated instruction. 5472 InstructionCost VectorCost = getInstructionCost(I, VF); 5473 5474 // Compute the cost of the scalarized instruction. This cost is the cost of 5475 // the instruction as if it wasn't if-converted and instead remained in the 5476 // predicated block. We will scale this cost by block probability after 5477 // computing the scalarization overhead. 5478 InstructionCost ScalarCost = 5479 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1)); 5480 5481 // Compute the scalarization overhead of needed insertelement instructions 5482 // and phi nodes. 5483 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 5484 ScalarCost += TTI.getScalarizationOverhead( 5485 cast<VectorType>(toVectorTy(I->getType(), VF)), 5486 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 5487 /*Extract*/ false, CostKind); 5488 ScalarCost += 5489 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 5490 } 5491 5492 // Compute the scalarization overhead of needed extractelement 5493 // instructions. For each of the instruction's operands, if the operand can 5494 // be scalarized, add it to the worklist; otherwise, account for the 5495 // overhead. 5496 for (Use &U : I->operands()) 5497 if (auto *J = dyn_cast<Instruction>(U.get())) { 5498 assert(VectorType::isValidElementType(J->getType()) && 5499 "Instruction has non-scalar type"); 5500 if (CanBeScalarized(J)) 5501 Worklist.push_back(J); 5502 else if (needsExtract(J, VF)) { 5503 ScalarCost += TTI.getScalarizationOverhead( 5504 cast<VectorType>(toVectorTy(J->getType(), VF)), 5505 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 5506 /*Extract*/ true, CostKind); 5507 } 5508 } 5509 5510 // Scale the total scalar cost by block probability. 5511 ScalarCost /= getReciprocalPredBlockProb(); 5512 5513 // Compute the discount. A non-negative discount means the vector version 5514 // of the instruction costs more, and scalarizing would be beneficial. 
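// For example (illustrative numbers only): with a vector cost of 10 and a
// probability-scaled scalar cost of 6, the chain contributes a discount of 4,
// so scalarizing this chain looks profitable.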
5515 Discount += VectorCost - ScalarCost; 5516 ScalarCosts[I] = ScalarCost; 5517 } 5518 5519 return Discount; 5520 } 5521 5522 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { 5523 InstructionCost Cost; 5524 5525 // If the vector loop gets executed exactly once with the given VF, ignore the 5526 // costs of comparison and induction instructions, as they'll get simplified 5527 // away. 5528 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF; 5529 auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5530 if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking()) 5531 addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(), 5532 ValuesToIgnoreForVF); 5533 5534 // For each block. 5535 for (BasicBlock *BB : TheLoop->blocks()) { 5536 InstructionCost BlockCost; 5537 5538 // For each instruction in the old loop. 5539 for (Instruction &I : BB->instructionsWithoutDebug()) { 5540 // Skip ignored values. 5541 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) || 5542 (VF.isVector() && VecValuesToIgnore.count(&I))) 5543 continue; 5544 5545 InstructionCost C = getInstructionCost(&I, VF); 5546 5547 // Check if we should override the cost. 5548 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) 5549 C = InstructionCost(ForceTargetInstructionCost); 5550 5551 BlockCost += C; 5552 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " 5553 << VF << " For instruction: " << I << '\n'); 5554 } 5555 5556 // If we are vectorizing a predicated block, it will have been 5557 // if-converted. This means that the block's instructions (aside from 5558 // stores and instructions that may divide by zero) will now be 5559 // unconditionally executed. For the scalar case, we may not always execute 5560 // the predicated block, if it is an if-else block. Thus, scale the block's 5561 // cost by the probability of executing it. blockNeedsPredication from 5562 // Legal is used so as to not include all blocks in tail folded loops. 5563 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 5564 BlockCost /= getReciprocalPredBlockProb(); 5565 5566 Cost += BlockCost; 5567 } 5568 5569 return Cost; 5570 } 5571 5572 /// Gets Address Access SCEV after verifying that the access pattern 5573 /// is loop invariant except the induction variable dependence. 5574 /// 5575 /// This SCEV can be sent to the Target in order to estimate the address 5576 /// calculation cost. 5577 static const SCEV *getAddressAccessSCEV( 5578 Value *Ptr, 5579 LoopVectorizationLegality *Legal, 5580 PredicatedScalarEvolution &PSE, 5581 const Loop *TheLoop) { 5582 5583 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5584 if (!Gep) 5585 return nullptr; 5586 5587 // We are looking for a gep with all loop invariant indices except for one 5588 // which should be an induction variable. 5589 auto *SE = PSE.getSE(); 5590 unsigned NumOperands = Gep->getNumOperands(); 5591 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) { 5592 Value *Opd = Gep->getOperand(Idx); 5593 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5594 !Legal->isInductionVariable(Opd)) 5595 return nullptr; 5596 } 5597 5598 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 
5599 return PSE.getSCEV(Ptr); 5600 } 5601 5602 InstructionCost 5603 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5604 ElementCount VF) { 5605 assert(VF.isVector() && 5606 "Scalarization cost of instruction implies vectorization."); 5607 if (VF.isScalable()) 5608 return InstructionCost::getInvalid(); 5609 5610 Type *ValTy = getLoadStoreType(I); 5611 auto *SE = PSE.getSE(); 5612 5613 unsigned AS = getLoadStoreAddressSpace(I); 5614 Value *Ptr = getLoadStorePointerOperand(I); 5615 Type *PtrTy = toVectorTy(Ptr->getType(), VF); 5616 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 5617 // that it is being called from this specific place. 5618 5619 // Figure out whether the access is strided and get the stride value 5620 // if it's known in compile time 5621 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5622 5623 // Get the cost of the scalar memory instruction and address computation. 5624 InstructionCost Cost = 5625 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5626 5627 // Don't pass *I here, since it is scalar but will actually be part of a 5628 // vectorized loop where the user of it is a vectorized instruction. 5629 const Align Alignment = getLoadStoreAlignment(I); 5630 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 5631 ValTy->getScalarType(), 5632 Alignment, AS, CostKind); 5633 5634 // Get the overhead of the extractelement and insertelement instructions 5635 // we might create due to scalarization. 5636 Cost += getScalarizationOverhead(I, VF); 5637 5638 // If we have a predicated load/store, it will need extra i1 extracts and 5639 // conditional branches, but may not be executed for each vector lane. Scale 5640 // the cost by the probability of executing the predicated block. 5641 if (isPredicatedInst(I)) { 5642 Cost /= getReciprocalPredBlockProb(); 5643 5644 // Add the cost of an i1 extract and a branch 5645 auto *VecI1Ty = 5646 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 5647 Cost += TTI.getScalarizationOverhead( 5648 VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 5649 /*Insert=*/false, /*Extract=*/true, CostKind); 5650 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 5651 5652 if (useEmulatedMaskMemRefHack(I, VF)) 5653 // Artificially setting to a high enough value to practically disable 5654 // vectorization with such operations. 
5655 Cost = 3000000; 5656 } 5657 5658 return Cost; 5659 } 5660 5661 InstructionCost 5662 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5663 ElementCount VF) { 5664 Type *ValTy = getLoadStoreType(I); 5665 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5666 Value *Ptr = getLoadStorePointerOperand(I); 5667 unsigned AS = getLoadStoreAddressSpace(I); 5668 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 5669 5670 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5671 "Stride should be 1 or -1 for consecutive memory access"); 5672 const Align Alignment = getLoadStoreAlignment(I); 5673 InstructionCost Cost = 0; 5674 if (Legal->isMaskRequired(I)) { 5675 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5676 CostKind); 5677 } else { 5678 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 5679 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5680 CostKind, OpInfo, I); 5681 } 5682 5683 bool Reverse = ConsecutiveStride < 0; 5684 if (Reverse) 5685 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {}, 5686 CostKind, 0); 5687 return Cost; 5688 } 5689 5690 InstructionCost 5691 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5692 ElementCount VF) { 5693 assert(Legal->isUniformMemOp(*I, VF)); 5694 5695 Type *ValTy = getLoadStoreType(I); 5696 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5697 const Align Alignment = getLoadStoreAlignment(I); 5698 unsigned AS = getLoadStoreAddressSpace(I); 5699 if (isa<LoadInst>(I)) { 5700 return TTI.getAddressComputationCost(ValTy) + 5701 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 5702 CostKind) + 5703 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {}, 5704 CostKind); 5705 } 5706 StoreInst *SI = cast<StoreInst>(I); 5707 5708 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 5709 return TTI.getAddressComputationCost(ValTy) + 5710 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 5711 CostKind) + 5712 (IsLoopInvariantStoreValue 5713 ? 0 5714 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5715 CostKind, VF.getKnownMinValue() - 1)); 5716 } 5717 5718 InstructionCost 5719 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5720 ElementCount VF) { 5721 Type *ValTy = getLoadStoreType(I); 5722 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5723 const Align Alignment = getLoadStoreAlignment(I); 5724 const Value *Ptr = getLoadStorePointerOperand(I); 5725 5726 return TTI.getAddressComputationCost(VectorTy) + 5727 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5728 Legal->isMaskRequired(I), Alignment, 5729 CostKind, I); 5730 } 5731 5732 InstructionCost 5733 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5734 ElementCount VF) { 5735 const auto *Group = getInterleavedAccessGroup(I); 5736 assert(Group && "Fail to get an interleaved access group."); 5737 5738 Instruction *InsertPos = Group->getInsertPos(); 5739 Type *ValTy = getLoadStoreType(InsertPos); 5740 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5741 unsigned AS = getLoadStoreAddressSpace(InsertPos); 5742 5743 unsigned InterleaveFactor = Group->getFactor(); 5744 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5745 5746 // Holds the indices of existing members in the interleaved group. 
5747 SmallVector<unsigned, 4> Indices; 5748 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 5749 if (Group->getMember(IF)) 5750 Indices.push_back(IF); 5751 5752 // Calculate the cost of the whole interleaved group. 5753 bool UseMaskForGaps = 5754 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 5755 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 5756 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 5757 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5758 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I), 5759 UseMaskForGaps); 5760 5761 if (Group->isReverse()) { 5762 // TODO: Add support for reversed masked interleaved access. 5763 assert(!Legal->isMaskRequired(I) && 5764 "Reverse masked interleaved access not supported."); 5765 Cost += Group->getNumMembers() * 5766 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {}, 5767 CostKind, 0); 5768 } 5769 return Cost; 5770 } 5771 5772 std::optional<InstructionCost> 5773 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, 5774 ElementCount VF, 5775 Type *Ty) const { 5776 using namespace llvm::PatternMatch; 5777 // Early exit if there are no in-loop reductions. 5778 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 5779 return std::nullopt; 5780 auto *VectorTy = cast<VectorType>(Ty); 5781 5782 // We are looking for one of the following patterns, and the minimal acceptable cost for it: 5783 // reduce(mul(ext(A), ext(B))) or 5784 // reduce(mul(A, B)) or 5785 // reduce(ext(A)) or 5786 // reduce(A). 5787 // The basic idea is that we walk down the tree, finding the root 5788 // reduction instruction in InLoopReductionImmediateChains. From there we find 5789 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 5790 // of the components. If the reduction cost is lower, we return it for the 5791 // reduction instruction and 0 for the other instructions in the pattern. If 5792 // it is not, we return an invalid cost specifying that the original cost method 5793 // should be used. 5794 Instruction *RetI = I; 5795 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 5796 if (!RetI->hasOneUser()) 5797 return std::nullopt; 5798 RetI = RetI->user_back(); 5799 } 5800 5801 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 5802 RetI->user_back()->getOpcode() == Instruction::Add) { 5803 RetI = RetI->user_back(); 5804 } 5805 5806 // Test if the found instruction is a reduction, and if not return an invalid 5807 // cost specifying the parent to use the original cost modelling. 5808 if (!InLoopReductionImmediateChains.count(RetI)) 5809 return std::nullopt; 5810 5811 // Find the reduction this chain is a part of and calculate the basic cost of 5812 // the reduction on its own.
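// InLoopReductionImmediateChains maps each reduction operation to its
// predecessor in the chain, so repeated lookups walk back to the reduction
// phi at the head (illustratively: add2 -> add1 -> phi).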
5813 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); 5814 Instruction *ReductionPhi = LastChain; 5815 while (!isa<PHINode>(ReductionPhi)) 5816 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); 5817 5818 const RecurrenceDescriptor &RdxDesc = 5819 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 5820 5821 InstructionCost BaseCost; 5822 RecurKind RK = RdxDesc.getRecurrenceKind(); 5823 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 5824 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK); 5825 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy, 5826 RdxDesc.getFastMathFlags(), CostKind); 5827 } else { 5828 BaseCost = TTI.getArithmeticReductionCost( 5829 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 5830 } 5831 5832 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 5833 // normal fmul instruction to the cost of the fadd reduction. 5834 if (RK == RecurKind::FMulAdd) 5835 BaseCost += 5836 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 5837 5838 // If we're using ordered reductions then we can just return the base cost 5839 // here, since getArithmeticReductionCost calculates the full ordered 5840 // reduction cost when FP reassociation is not allowed. 5841 if (useOrderedReductions(RdxDesc)) 5842 return BaseCost; 5843 5844 // Get the operand that was not the reduction chain and match it to one of the 5845 // patterns, returning the better cost if it is found. 5846 Instruction *RedOp = RetI->getOperand(1) == LastChain 5847 ? dyn_cast<Instruction>(RetI->getOperand(0)) 5848 : dyn_cast<Instruction>(RetI->getOperand(1)); 5849 5850 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 5851 5852 Instruction *Op0, *Op1; 5853 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 5854 match(RedOp, 5855 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 5856 match(Op0, m_ZExtOrSExt(m_Value())) && 5857 Op0->getOpcode() == Op1->getOpcode() && 5858 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 5859 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 5860 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 5861 5862 // Matched reduce.add(ext(mul(ext(A), ext(B))) 5863 // Note that the extend opcodes need to all match, or if A==B they will have 5864 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 5865 // which is equally fine. 5866 bool IsUnsigned = isa<ZExtInst>(Op0); 5867 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 5868 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 5869 5870 InstructionCost ExtCost = 5871 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 5872 TTI::CastContextHint::None, CostKind, Op0); 5873 InstructionCost MulCost = 5874 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 5875 InstructionCost Ext2Cost = 5876 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 5877 TTI::CastContextHint::None, CostKind, RedOp); 5878 5879 InstructionCost RedCost = TTI.getMulAccReductionCost( 5880 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 5881 5882 if (RedCost.isValid() && 5883 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 5884 return I == RetI ? 
RedCost : 0; 5885 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 5886 !TheLoop->isLoopInvariant(RedOp)) { 5887 // Matched reduce(ext(A)) 5888 bool IsUnsigned = isa<ZExtInst>(RedOp); 5889 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 5890 InstructionCost RedCost = TTI.getExtendedReductionCost( 5891 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 5892 RdxDesc.getFastMathFlags(), CostKind); 5893 5894 InstructionCost ExtCost = 5895 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 5896 TTI::CastContextHint::None, CostKind, RedOp); 5897 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 5898 return I == RetI ? RedCost : 0; 5899 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 5900 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 5901 if (match(Op0, m_ZExtOrSExt(m_Value())) && 5902 Op0->getOpcode() == Op1->getOpcode() && 5903 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 5904 bool IsUnsigned = isa<ZExtInst>(Op0); 5905 Type *Op0Ty = Op0->getOperand(0)->getType(); 5906 Type *Op1Ty = Op1->getOperand(0)->getType(); 5907 Type *LargestOpTy = 5908 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 5909 : Op0Ty; 5910 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 5911 5912 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 5913 // different sizes. We take the largest type as the ext to reduce, and add 5914 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 5915 InstructionCost ExtCost0 = TTI.getCastInstrCost( 5916 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 5917 TTI::CastContextHint::None, CostKind, Op0); 5918 InstructionCost ExtCost1 = TTI.getCastInstrCost( 5919 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 5920 TTI::CastContextHint::None, CostKind, Op1); 5921 InstructionCost MulCost = 5922 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 5923 5924 InstructionCost RedCost = TTI.getMulAccReductionCost( 5925 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 5926 InstructionCost ExtraExtCost = 0; 5927 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 5928 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 5929 ExtraExtCost = TTI.getCastInstrCost( 5930 ExtraExtOp->getOpcode(), ExtType, 5931 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 5932 TTI::CastContextHint::None, CostKind, ExtraExtOp); 5933 } 5934 5935 if (RedCost.isValid() && 5936 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 5937 return I == RetI ? RedCost : 0; 5938 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 5939 // Matched reduce.add(mul()) 5940 InstructionCost MulCost = 5941 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 5942 5943 InstructionCost RedCost = TTI.getMulAccReductionCost( 5944 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 5945 5946 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 5947 return I == RetI ? RedCost : 0; 5948 } 5949 } 5950 5951 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 5952 } 5953 5954 InstructionCost 5955 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5956 ElementCount VF) { 5957 // Calculate scalar cost only. Vectorization cost should be ready at this 5958 // moment. 
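// Sketch of the dispatch below: scalar VFs get a fresh address-computation
// plus memory-op cost query, while vector VFs return the widening cost
// already recorded when the widening decision was made.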
5959 if (VF.isScalar()) { 5960 Type *ValTy = getLoadStoreType(I); 5961 const Align Alignment = getLoadStoreAlignment(I); 5962 unsigned AS = getLoadStoreAddressSpace(I); 5963 5964 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 5965 return TTI.getAddressComputationCost(ValTy) + 5966 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind, 5967 OpInfo, I); 5968 } 5969 return getWideningCost(I, VF); 5970 } 5971 5972 InstructionCost 5973 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5974 ElementCount VF) const { 5975 5976 // There is no mechanism yet to create a scalable scalarization loop, 5977 // so this is currently Invalid. 5978 if (VF.isScalable()) 5979 return InstructionCost::getInvalid(); 5980 5981 if (VF.isScalar()) 5982 return 0; 5983 5984 InstructionCost Cost = 0; 5985 Type *RetTy = toVectorTy(I->getType(), VF); 5986 if (!RetTy->isVoidTy() && 5987 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5988 Cost += TTI.getScalarizationOverhead( 5989 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), 5990 /*Insert*/ true, 5991 /*Extract*/ false, CostKind); 5992 5993 // Some targets keep addresses scalar. 5994 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5995 return Cost; 5996 5997 // Some targets support efficient element stores. 5998 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5999 return Cost; 6000 6001 // Collect operands to consider. 6002 CallInst *CI = dyn_cast<CallInst>(I); 6003 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6004 6005 // Skip operands that do not require extraction/scalarization and do not incur 6006 // any overhead. 6007 SmallVector<Type *> Tys; 6008 for (auto *V : filterExtractingOperands(Ops, VF)) 6009 Tys.push_back(maybeVectorizeType(V->getType(), VF)); 6010 return Cost + TTI.getOperandsScalarizationOverhead( 6011 filterExtractingOperands(Ops, VF), Tys, CostKind); 6012 } 6013 6014 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6015 if (VF.isScalar()) 6016 return; 6017 NumPredStores = 0; 6018 for (BasicBlock *BB : TheLoop->blocks()) { 6019 // For each instruction in the old loop. 6020 for (Instruction &I : *BB) { 6021 Value *Ptr = getLoadStorePointerOperand(&I); 6022 if (!Ptr) 6023 continue; 6024 6025 // TODO: We should generate better code and update the cost model for 6026 // predicated uniform stores. Today they are treated as any other 6027 // predicated store (see added test cases in 6028 // invariant-store-vectorization.ll). 6029 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6030 NumPredStores++; 6031 6032 if (Legal->isUniformMemOp(I, VF)) { 6033 auto IsLegalToScalarize = [&]() { 6034 if (!VF.isScalable()) 6035 // Scalarization of fixed length vectors "just works". 6036 return true; 6037 6038 // We have dedicated lowering for unpredicated uniform loads and 6039 // stores. Note that even with tail folding we know that at least 6040 // one lane is active (i.e. generalized predication is not possible 6041 // here), and the logic below depends on this fact. 6042 if (!foldTailByMasking()) 6043 return true; 6044 6045 // For scalable vectors, a uniform memop load is always 6046 // uniform-by-parts and we know how to scalarize that. 6047 if (isa<LoadInst>(I)) 6048 return true; 6049 6050 // A uniform store isn't necessarily uniform-by-parts 6051 // and we can't assume scalarization.
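// Illustrative problem case: under tail folding, a store of a loop-varying
// value to an invariant address must store the value from the last *active*
// lane, so lowering it as a single scalar store of lane VF-1 would not be
// faithful unless the stored value is invariant.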
6052 auto &SI = cast<StoreInst>(I); 6053 return TheLoop->isLoopInvariant(SI.getValueOperand()); 6054 }; 6055 6056 const InstructionCost GatherScatterCost = 6057 isLegalGatherOrScatter(&I, VF) ? 6058 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); 6059 6060 // Load: Scalar load + broadcast 6061 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6062 // FIXME: This cost is a significant under-estimate for tail folded 6063 // memory ops. 6064 const InstructionCost ScalarizationCost = 6065 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF) 6066 : InstructionCost::getInvalid(); 6067 6068 // Choose the better solution for the current VF. Note that Invalid 6069 // costs compare as maximally large. If both are invalid, the chosen cost 6070 // is also invalid, which signals a failure and a vectorization abort. 6071 if (GatherScatterCost < ScalarizationCost) 6072 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 6073 else 6074 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 6075 continue; 6076 } 6077 6078 // We assume that widening is the best solution when possible. 6079 if (memoryInstructionCanBeWidened(&I, VF)) { 6080 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6081 int ConsecutiveStride = Legal->isConsecutivePtr( 6082 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6083 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6084 "Expected consecutive stride."); 6085 InstWidening Decision = 6086 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6087 setWideningDecision(&I, VF, Decision, Cost); 6088 continue; 6089 } 6090 6091 // Choose between Interleaving, Gather/Scatter or Scalarization. 6092 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6093 unsigned NumAccesses = 1; 6094 if (isAccessInterleaved(&I)) { 6095 const auto *Group = getInterleavedAccessGroup(&I); 6096 assert(Group && "Fail to get an interleaved access group."); 6097 6098 // Make one decision for the whole group. 6099 if (getWideningDecision(&I, VF) != CM_Unknown) 6100 continue; 6101 6102 NumAccesses = Group->getNumMembers(); 6103 if (interleavedAccessCanBeWidened(&I, VF)) 6104 InterleaveCost = getInterleaveGroupCost(&I, VF); 6105 } 6106 6107 InstructionCost GatherScatterCost = 6108 isLegalGatherOrScatter(&I, VF) 6109 ? getGatherScatterCost(&I, VF) * NumAccesses 6110 : InstructionCost::getInvalid(); 6111 6112 InstructionCost ScalarizationCost = 6113 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6114 6115 // Choose the better solution for the current VF, 6116 // write down this decision and use it during vectorization. 6117 InstructionCost Cost; 6118 InstWidening Decision; 6119 if (InterleaveCost <= GatherScatterCost && 6120 InterleaveCost < ScalarizationCost) { 6121 Decision = CM_Interleave; 6122 Cost = InterleaveCost; 6123 } else if (GatherScatterCost < ScalarizationCost) { 6124 Decision = CM_GatherScatter; 6125 Cost = GatherScatterCost; 6126 } else { 6127 Decision = CM_Scalarize; 6128 Cost = ScalarizationCost; 6129 } 6130 // If the instruction belongs to an interleave group, the whole group 6131 // receives the same decision. The whole group receives the cost, but 6132 // the cost will actually be assigned to one instruction.
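// Worked example with made-up costs: InterleaveCost = 8, GatherScatterCost =
// 12, ScalarizationCost = 20 selects CM_Interleave with cost 8. Invalid costs
// compare as larger than any valid cost, so an invalid option simply loses
// the comparison.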
6133 if (const auto *Group = getInterleavedAccessGroup(&I)) 6134 setWideningDecision(Group, VF, Decision, Cost); 6135 else 6136 setWideningDecision(&I, VF, Decision, Cost); 6137 } 6138 } 6139 6140 // Make sure that any load of address and any other address computation 6141 // remains scalar unless there is gather/scatter support. This avoids 6142 // inevitable extracts into address registers, and also has the benefit of 6143 // activating LSR more, since that pass can't optimize vectorized 6144 // addresses. 6145 if (TTI.prefersVectorizedAddressing()) 6146 return; 6147 6148 // Start with all scalar pointer uses. 6149 SmallPtrSet<Instruction *, 8> AddrDefs; 6150 for (BasicBlock *BB : TheLoop->blocks()) 6151 for (Instruction &I : *BB) { 6152 Instruction *PtrDef = 6153 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6154 if (PtrDef && TheLoop->contains(PtrDef) && 6155 getWideningDecision(&I, VF) != CM_GatherScatter) 6156 AddrDefs.insert(PtrDef); 6157 } 6158 6159 // Add all instructions used to generate the addresses. 6160 SmallVector<Instruction *, 4> Worklist; 6161 append_range(Worklist, AddrDefs); 6162 while (!Worklist.empty()) { 6163 Instruction *I = Worklist.pop_back_val(); 6164 for (auto &Op : I->operands()) 6165 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6166 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6167 AddrDefs.insert(InstOp).second) 6168 Worklist.push_back(InstOp); 6169 } 6170 6171 for (auto *I : AddrDefs) { 6172 if (isa<LoadInst>(I)) { 6173 // Setting the desired widening decision should ideally be handled in 6174 // by cost functions, but since this involves the task of finding out 6175 // if the loaded register is involved in an address computation, it is 6176 // instead changed here when we know this is the case. 6177 InstWidening Decision = getWideningDecision(I, VF); 6178 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6179 // Scalarize a widened load of address. 6180 setWideningDecision( 6181 I, VF, CM_Scalarize, 6182 (VF.getKnownMinValue() * 6183 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6184 else if (const auto *Group = getInterleavedAccessGroup(I)) { 6185 // Scalarize an interleave group of address loads. 6186 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6187 if (Instruction *Member = Group->getMember(I)) 6188 setWideningDecision( 6189 Member, VF, CM_Scalarize, 6190 (VF.getKnownMinValue() * 6191 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6192 } 6193 } 6194 } else 6195 // Make sure I gets scalarized and a cost estimate without 6196 // scalarization overhead. 6197 ForcedScalars[VF].insert(I); 6198 } 6199 } 6200 6201 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { 6202 assert(!VF.isScalar() && 6203 "Trying to set a vectorization decision for a scalar VF"); 6204 6205 auto ForcedScalar = ForcedScalars.find(VF); 6206 for (BasicBlock *BB : TheLoop->blocks()) { 6207 // For each instruction in the old loop. 
6208 for (Instruction &I : *BB) { 6209 CallInst *CI = dyn_cast<CallInst>(&I); 6210 6211 if (!CI) 6212 continue; 6213 6214 InstructionCost ScalarCost = InstructionCost::getInvalid(); 6215 InstructionCost VectorCost = InstructionCost::getInvalid(); 6216 InstructionCost IntrinsicCost = InstructionCost::getInvalid(); 6217 Function *ScalarFunc = CI->getCalledFunction(); 6218 Type *ScalarRetTy = CI->getType(); 6219 SmallVector<Type *, 4> Tys, ScalarTys; 6220 for (auto &ArgOp : CI->args()) 6221 ScalarTys.push_back(ArgOp->getType()); 6222 6223 // Estimate cost of scalarized vector call. The source operands are 6224 // assumed to be vectors, so we need to extract individual elements from 6225 // there, execute VF scalar calls, and then gather the result into the 6226 // vector return value. 6227 InstructionCost ScalarCallCost = 6228 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); 6229 6230 // Compute costs of unpacking argument values for the scalar calls and 6231 // packing the return values to a vector. 6232 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 6233 6234 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 6235 // Honor ForcedScalars and UniformAfterVectorization decisions. 6236 // TODO: For calls, it might still be more profitable to widen. Use 6237 // VPlan-based cost model to compare different options. 6238 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() && 6239 ForcedScalar->second.contains(CI)) || 6240 isUniformAfterVectorization(CI, VF))) { 6241 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr, 6242 Intrinsic::not_intrinsic, std::nullopt, 6243 ScalarCost); 6244 continue; 6245 } 6246 6247 bool MaskRequired = Legal->isMaskRequired(CI); 6248 // Compute corresponding vector type for return value and arguments. 6249 Type *RetTy = toVectorTy(ScalarRetTy, VF); 6250 for (Type *ScalarTy : ScalarTys) 6251 Tys.push_back(toVectorTy(ScalarTy, VF)); 6252 6253 // An in-loop reduction using an fmuladd intrinsic is a special case; 6254 // we don't want the normal cost for that intrinsic. 6255 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 6256 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) { 6257 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, 6258 getVectorIntrinsicIDForCall(CI, TLI), 6259 std::nullopt, *RedCost); 6260 continue; 6261 } 6262 6263 // Find the cost of vectorizing the call, if we can find a suitable 6264 // vector variant of the function. 6265 bool UsesMask = false; 6266 VFInfo FuncInfo; 6267 Function *VecFunc = nullptr; 6268 // Search through any available variants for one we can use at this VF. 6269 for (VFInfo &Info : VFDatabase::getMappings(*CI)) { 6270 // Must match requested VF. 6271 if (Info.Shape.VF != VF) 6272 continue; 6273 6274 // Must take a mask argument if one is required 6275 if (MaskRequired && !Info.isMasked()) 6276 continue; 6277 6278 // Check that all parameter kinds are supported 6279 bool ParamsOk = true; 6280 for (VFParameter Param : Info.Shape.Parameters) { 6281 switch (Param.ParamKind) { 6282 case VFParamKind::Vector: 6283 break; 6284 case VFParamKind::OMP_Uniform: { 6285 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6286 // Make sure the scalar parameter in the loop is invariant. 
6287 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), 6288 TheLoop)) 6289 ParamsOk = false; 6290 break; 6291 } 6292 case VFParamKind::OMP_Linear: { 6293 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6294 // Find the stride for the scalar parameter in this loop and see if 6295 // it matches the stride for the variant. 6296 // TODO: do we need to figure out the cost of an extract to get the 6297 // first lane? Or do we hope that it will be folded away? 6298 ScalarEvolution *SE = PSE.getSE(); 6299 const auto *SAR = 6300 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); 6301 6302 if (!SAR || SAR->getLoop() != TheLoop) { 6303 ParamsOk = false; 6304 break; 6305 } 6306 6307 const SCEVConstant *Step = 6308 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); 6309 6310 if (!Step || 6311 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) 6312 ParamsOk = false; 6313 6314 break; 6315 } 6316 case VFParamKind::GlobalPredicate: 6317 UsesMask = true; 6318 break; 6319 default: 6320 ParamsOk = false; 6321 break; 6322 } 6323 } 6324 6325 if (!ParamsOk) 6326 continue; 6327 6328 // Found a suitable candidate, stop here. 6329 VecFunc = CI->getModule()->getFunction(Info.VectorName); 6330 FuncInfo = Info; 6331 break; 6332 } 6333 6334 // Add in the cost of synthesizing a mask if one wasn't required. 6335 InstructionCost MaskCost = 0; 6336 if (VecFunc && UsesMask && !MaskRequired) 6337 MaskCost = TTI.getShuffleCost( 6338 TargetTransformInfo::SK_Broadcast, 6339 VectorType::get(IntegerType::getInt1Ty( 6340 VecFunc->getFunctionType()->getContext()), 6341 VF), 6342 {}, CostKind); 6343 6344 if (TLI && VecFunc && !CI->isNoBuiltin()) 6345 VectorCost = 6346 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 6347 6348 // Find the cost of an intrinsic; some targets may have instructions that 6349 // perform the operation without needing an actual call. 6350 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); 6351 if (IID != Intrinsic::not_intrinsic) 6352 IntrinsicCost = getVectorIntrinsicCost(CI, VF); 6353 6354 InstructionCost Cost = ScalarCost; 6355 InstWidening Decision = CM_Scalarize; 6356 6357 if (VectorCost <= Cost) { 6358 Cost = VectorCost; 6359 Decision = CM_VectorCall; 6360 } 6361 6362 if (IntrinsicCost <= Cost) { 6363 Cost = IntrinsicCost; 6364 Decision = CM_IntrinsicCall; 6365 } 6366 6367 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, 6368 FuncInfo.getParamIndexForOptionalMask(), Cost); 6369 } 6370 } 6371 } 6372 6373 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) { 6374 if (!Legal->isInvariant(Op)) 6375 return false; 6376 // Consider Op invariant, if it or its operands aren't predicated 6377 // instruction in the loop. In that case, it is not trivially hoistable. 6378 auto *OpI = dyn_cast<Instruction>(Op); 6379 return !OpI || !TheLoop->contains(OpI) || 6380 (!isPredicatedInst(OpI) && 6381 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) && 6382 all_of(OpI->operands(), 6383 [this](Value *Op) { return shouldConsiderInvariant(Op); })); 6384 } 6385 6386 InstructionCost 6387 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6388 ElementCount VF) { 6389 // If we know that this instruction will remain uniform, check the cost of 6390 // the scalar version. 6391 if (isUniformAfterVectorization(I, VF)) 6392 VF = ElementCount::getFixed(1); 6393 6394 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6395 return InstsToScalarize[VF][I]; 6396 6397 // Forced scalars do not have any scalarization overhead. 
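// Their cost is modelled as the scalar instruction cost multiplied by the
// number of lanes; see the lookup and multiplication just below.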
6398 auto ForcedScalar = ForcedScalars.find(VF); 6399 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6400 auto InstSet = ForcedScalar->second; 6401 if (InstSet.count(I)) 6402 return getInstructionCost(I, ElementCount::getFixed(1)) * 6403 VF.getKnownMinValue(); 6404 } 6405 6406 Type *RetTy = I->getType(); 6407 if (canTruncateToMinimalBitwidth(I, VF)) 6408 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6409 auto *SE = PSE.getSE(); 6410 6411 auto HasSingleCopyAfterVectorization = [this](Instruction *I, 6412 ElementCount VF) -> bool { 6413 if (VF.isScalar()) 6414 return true; 6415 6416 auto Scalarized = InstsToScalarize.find(VF); 6417 assert(Scalarized != InstsToScalarize.end() && 6418 "VF not yet analyzed for scalarization profitability"); 6419 return !Scalarized->second.count(I) && 6420 llvm::all_of(I->users(), [&](User *U) { 6421 auto *UI = cast<Instruction>(U); 6422 return !Scalarized->second.count(UI); 6423 }); 6424 }; 6425 (void)HasSingleCopyAfterVectorization; 6426 6427 Type *VectorTy; 6428 if (isScalarAfterVectorization(I, VF)) { 6429 // With the exception of GEPs and PHIs, after scalarization there should 6430 // only be one copy of the instruction generated in the loop. This is 6431 // because the VF is either 1, or any instructions that need scalarizing 6432 // have already been dealt with by the time we get here. As a result, 6433 // it means we don't have to multiply the instruction cost by VF. 6434 assert(I->getOpcode() == Instruction::GetElementPtr || 6435 I->getOpcode() == Instruction::PHI || 6436 (I->getOpcode() == Instruction::BitCast && 6437 I->getType()->isPointerTy()) || 6438 HasSingleCopyAfterVectorization(I, VF)); 6439 VectorTy = RetTy; 6440 } else 6441 VectorTy = toVectorTy(RetTy, VF); 6442 6443 if (VF.isVector() && VectorTy->isVectorTy() && 6444 !TTI.getNumberOfParts(VectorTy)) 6445 return InstructionCost::getInvalid(); 6446 6447 // TODO: We need to estimate the cost of intrinsic calls. 6448 switch (I->getOpcode()) { 6449 case Instruction::GetElementPtr: 6450 // We mark this instruction as zero-cost because the cost of GEPs in 6451 // vectorized code depends on whether the corresponding memory instruction 6452 // is scalarized or not. Therefore, we handle GEPs with the memory 6453 // instruction cost. 6454 return 0; 6455 case Instruction::Br: { 6456 // In cases of scalarized and predicated instructions, there will be VF 6457 // predicated blocks in the vectorized loop. Each branch around these 6458 // blocks requires also an extract of its vector compare i1 element. 6459 // Note that the conditional branch from the loop latch will be replaced by 6460 // a single branch controlling the loop, so there is no extra overhead from 6461 // scalarization. 6462 bool ScalarPredicatedBB = false; 6463 BranchInst *BI = cast<BranchInst>(I); 6464 if (VF.isVector() && BI->isConditional() && 6465 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 6466 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) && 6467 BI->getParent() != TheLoop->getLoopLatch()) 6468 ScalarPredicatedBB = true; 6469 6470 if (ScalarPredicatedBB) { 6471 // Not possible to scalarize scalable vector with predicated instructions. 6472 if (VF.isScalable()) 6473 return InstructionCost::getInvalid(); 6474 // Return cost for branches around scalarized and predicated blocks. 
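// The expression below charges one i1 extract per lane (scalarization of the
// vector compare) plus one branch per lane; e.g. with VF = 4 (illustrative),
// that is 4 extracts and 4 branches.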
6475 auto *VecI1Ty = 6476 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6477 return ( 6478 TTI.getScalarizationOverhead( 6479 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()), 6480 /*Insert*/ false, /*Extract*/ true, CostKind) + 6481 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6482 } 6483 6484 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6485 // The back-edge branch will remain, as will all scalar branches. 6486 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6487 6488 // This branch will be eliminated by if-conversion. 6489 return 0; 6490 // Note: We currently assume zero cost for an unconditional branch inside 6491 // a predicated block since it will become a fall-through, although we 6492 // may decide in the future to call TTI for all branches. 6493 } 6494 case Instruction::Switch: { 6495 if (VF.isScalar()) 6496 return TTI.getCFInstrCost(Instruction::Switch, CostKind); 6497 auto *Switch = cast<SwitchInst>(I); 6498 return Switch->getNumCases() * 6499 TTI.getCmpSelInstrCost( 6500 Instruction::ICmp, 6501 toVectorTy(Switch->getCondition()->getType(), VF), 6502 toVectorTy(Type::getInt1Ty(I->getContext()), VF), 6503 CmpInst::ICMP_EQ, CostKind); 6504 } 6505 case Instruction::PHI: { 6506 auto *Phi = cast<PHINode>(I); 6507 6508 // First-order recurrences are replaced by vector shuffles inside the loop. 6509 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 6510 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the 6511 // penultimate value of the recurrence. 6512 // TODO: Consider vscale_range info. 6513 if (VF.isScalable() && VF.getKnownMinValue() == 1) 6514 return InstructionCost::getInvalid(); 6515 SmallVector<int> Mask(VF.getKnownMinValue()); 6516 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 6517 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 6518 cast<VectorType>(VectorTy), Mask, CostKind, 6519 VF.getKnownMinValue() - 1); 6520 } 6521 6522 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6523 // converted into select instructions. We require N - 1 selects per phi 6524 // node, where N is the number of incoming values. 6525 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) { 6526 Type *ResultTy = Phi->getType(); 6527 6528 // All instructions in an Any-of reduction chain are narrowed to bool. 6529 // Check if that is the case for this phi node. 6530 auto *HeaderUser = cast_if_present<PHINode>( 6531 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * { 6532 auto *Phi = dyn_cast<PHINode>(U); 6533 if (Phi && Phi->getParent() == TheLoop->getHeader()) 6534 return Phi; 6535 return nullptr; 6536 })); 6537 if (HeaderUser) { 6538 auto &ReductionVars = Legal->getReductionVars(); 6539 auto Iter = ReductionVars.find(HeaderUser); 6540 if (Iter != ReductionVars.end() && 6541 RecurrenceDescriptor::isAnyOfRecurrenceKind( 6542 Iter->second.getRecurrenceKind())) 6543 ResultTy = Type::getInt1Ty(Phi->getContext()); 6544 } 6545 return (Phi->getNumIncomingValues() - 1) * 6546 TTI.getCmpSelInstrCost( 6547 Instruction::Select, toVectorTy(ResultTy, VF), 6548 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6549 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6550 } 6551 6552 // When tail folding with EVL, if the phi is part of an out of loop 6553 // reduction then it will be transformed into a wide vp_merge. 
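// The cost is queried as a vp.merge intrinsic on the vectorized phi type
// with an i1 vector among the argument types, mirroring what the EVL
// lowering is expected to emit.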
6554 if (VF.isVector() && foldTailWithEVL() && 6555 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) { 6556 IntrinsicCostAttributes ICA( 6557 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF), 6558 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)}); 6559 return TTI.getIntrinsicInstrCost(ICA, CostKind); 6560 } 6561 6562 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6563 } 6564 case Instruction::UDiv: 6565 case Instruction::SDiv: 6566 case Instruction::URem: 6567 case Instruction::SRem: 6568 if (VF.isVector() && isPredicatedInst(I)) { 6569 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 6570 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 6571 ScalarCost : SafeDivisorCost; 6572 } 6573 // We've proven all lanes safe to speculate, fall through. 6574 [[fallthrough]]; 6575 case Instruction::Add: 6576 case Instruction::Sub: { 6577 auto Info = Legal->getHistogramInfo(I); 6578 if (Info && VF.isVector()) { 6579 const HistogramInfo *HGram = Info.value(); 6580 // Assume that a non-constant update value (or a constant != 1) requires 6581 // a multiply, and add that into the cost. 6582 InstructionCost MulCost = TTI::TCC_Free; 6583 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1)); 6584 if (!RHS || RHS->getZExtValue() != 1) 6585 MulCost = 6586 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6587 6588 // Find the cost of the histogram operation itself. 6589 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF); 6590 Type *ScalarTy = I->getType(); 6591 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF); 6592 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, 6593 Type::getVoidTy(I->getContext()), 6594 {PtrTy, ScalarTy, MaskTy}); 6595 6596 // Add the costs together with the add/sub operation. 6597 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost + 6598 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind); 6599 } 6600 [[fallthrough]]; 6601 } 6602 case Instruction::FAdd: 6603 case Instruction::FSub: 6604 case Instruction::Mul: 6605 case Instruction::FMul: 6606 case Instruction::FDiv: 6607 case Instruction::FRem: 6608 case Instruction::Shl: 6609 case Instruction::LShr: 6610 case Instruction::AShr: 6611 case Instruction::And: 6612 case Instruction::Or: 6613 case Instruction::Xor: { 6614 // If we're speculating on the stride being 1, the multiplication may 6615 // fold away. We can generalize this for all operations using the notion 6616 // of neutral elements. (TODO) 6617 if (I->getOpcode() == Instruction::Mul && 6618 (PSE.getSCEV(I->getOperand(0))->isOne() || 6619 PSE.getSCEV(I->getOperand(1))->isOne())) 6620 return 0; 6621 6622 // Detect reduction patterns 6623 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy)) 6624 return *RedCost; 6625 6626 // Certain instructions can be cheaper to vectorize if they have a constant 6627 // second vector operand. One example of this are shifts on x86. 
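// Illustrative case: for a[i] << 3 the shift amount is a constant splat, and
// for a[i] << s with loop-invariant s it is a uniform splat; the operand-info
// handling below lets TTI price both more cheaply than a general vector
// operand, where the target supports it.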
6628 Value *Op2 = I->getOperand(1); 6629 if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) && 6630 isa<SCEVConstant>(PSE.getSCEV(Op2))) { 6631 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue(); 6632 } 6633 auto Op2Info = TTI.getOperandInfo(Op2); 6634 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 6635 shouldConsiderInvariant(Op2)) 6636 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 6637 6638 SmallVector<const Value *, 4> Operands(I->operand_values()); 6639 return TTI.getArithmeticInstrCost( 6640 I->getOpcode(), VectorTy, CostKind, 6641 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6642 Op2Info, Operands, I, TLI); 6643 } 6644 case Instruction::FNeg: { 6645 return TTI.getArithmeticInstrCost( 6646 I->getOpcode(), VectorTy, CostKind, 6647 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6648 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6649 I->getOperand(0), I); 6650 } 6651 case Instruction::Select: { 6652 SelectInst *SI = cast<SelectInst>(I); 6653 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6654 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6655 6656 const Value *Op0, *Op1; 6657 using namespace llvm::PatternMatch; 6658 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 6659 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 6660 // select x, y, false --> x & y 6661 // select x, true, y --> x | y 6662 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 6663 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 6664 assert(Op0->getType()->getScalarSizeInBits() == 1 && 6665 Op1->getType()->getScalarSizeInBits() == 1); 6666 6667 SmallVector<const Value *, 2> Operands{Op0, Op1}; 6668 return TTI.getArithmeticInstrCost( 6669 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 6670 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 6671 } 6672 6673 Type *CondTy = SI->getCondition()->getType(); 6674 if (!ScalarCond) 6675 CondTy = VectorType::get(CondTy, VF); 6676 6677 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 6678 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 6679 Pred = Cmp->getPredicate(); 6680 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 6681 CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 6682 {TTI::OK_AnyValue, TTI::OP_None}, I); 6683 } 6684 case Instruction::ICmp: 6685 case Instruction::FCmp: { 6686 Type *ValTy = I->getOperand(0)->getType(); 6687 6688 if (canTruncateToMinimalBitwidth(I, VF)) { 6689 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6690 (void)Op0AsInstruction; 6691 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) || 6692 MinBWs[I] == MinBWs[Op0AsInstruction]) && 6693 "if both the operand and the compare are marked for " 6694 "truncation, they must have the same bitwidth"); 6695 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]); 6696 } 6697 6698 VectorTy = toVectorTy(ValTy, VF); 6699 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 6700 cast<CmpInst>(I)->getPredicate(), CostKind, 6701 {TTI::OK_AnyValue, TTI::OP_None}, 6702 {TTI::OK_AnyValue, TTI::OP_None}, I); 6703 } 6704 case Instruction::Store: 6705 case Instruction::Load: { 6706 ElementCount Width = VF; 6707 if (Width.isVector()) { 6708 InstWidening Decision = getWideningDecision(I, Width); 6709 assert(Decision != CM_Unknown && 6710 "CM decision should be taken at this point"); 6711 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 6712 return InstructionCost::getInvalid(); 6713 if (Decision == CM_Scalarize) 6714 Width = ElementCount::getFixed(1); 6715 } 6716 VectorTy = toVectorTy(getLoadStoreType(I), Width); 6717 return getMemoryInstructionCost(I, VF); 6718 } 6719 case Instruction::BitCast: 6720 if (I->getType()->isPointerTy()) 6721 return 0; 6722 [[fallthrough]]; 6723 case Instruction::ZExt: 6724 case Instruction::SExt: 6725 case Instruction::FPToUI: 6726 case Instruction::FPToSI: 6727 case Instruction::FPExt: 6728 case Instruction::PtrToInt: 6729 case Instruction::IntToPtr: 6730 case Instruction::SIToFP: 6731 case Instruction::UIToFP: 6732 case Instruction::Trunc: 6733 case Instruction::FPTrunc: { 6734 // Computes the CastContextHint from a Load/Store instruction. 6735 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6736 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6737 "Expected a load or a store!"); 6738 6739 if (VF.isScalar() || !TheLoop->contains(I)) 6740 return TTI::CastContextHint::Normal; 6741 6742 switch (getWideningDecision(I, VF)) { 6743 case LoopVectorizationCostModel::CM_GatherScatter: 6744 return TTI::CastContextHint::GatherScatter; 6745 case LoopVectorizationCostModel::CM_Interleave: 6746 return TTI::CastContextHint::Interleave; 6747 case LoopVectorizationCostModel::CM_Scalarize: 6748 case LoopVectorizationCostModel::CM_Widen: 6749 return Legal->isMaskRequired(I) ? 
TTI::CastContextHint::Masked 6750 : TTI::CastContextHint::Normal; 6751 case LoopVectorizationCostModel::CM_Widen_Reverse: 6752 return TTI::CastContextHint::Reversed; 6753 case LoopVectorizationCostModel::CM_Unknown: 6754 llvm_unreachable("Instr did not go through cost modelling?"); 6755 case LoopVectorizationCostModel::CM_VectorCall: 6756 case LoopVectorizationCostModel::CM_IntrinsicCall: 6757 llvm_unreachable_internal("Instr has invalid widening decision"); 6758 } 6759 6760 llvm_unreachable("Unhandled case!"); 6761 }; 6762 6763 unsigned Opcode = I->getOpcode(); 6764 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6765 // For Trunc, the context is the only user, which must be a StoreInst. 6766 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6767 if (I->hasOneUse()) 6768 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6769 CCH = ComputeCCH(Store); 6770 } 6771 // For Z/Sext, the context is the operand, which must be a LoadInst. 6772 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6773 Opcode == Instruction::FPExt) { 6774 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6775 CCH = ComputeCCH(Load); 6776 } 6777 6778 // We optimize the truncation of induction variables having constant 6779 // integer steps. The cost of these truncations is the same as the scalar 6780 // operation. 6781 if (isOptimizableIVTruncate(I, VF)) { 6782 auto *Trunc = cast<TruncInst>(I); 6783 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6784 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6785 } 6786 6787 // Detect reduction patterns 6788 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy)) 6789 return *RedCost; 6790 6791 Type *SrcScalarTy = I->getOperand(0)->getType(); 6792 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6793 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6794 SrcScalarTy = 6795 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]); 6796 Type *SrcVecTy = 6797 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6798 6799 if (canTruncateToMinimalBitwidth(I, VF)) { 6800 // If the result type is <= the source type, there will be no extend 6801 // after truncating the users to the minimal required bitwidth. 6802 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() && 6803 (I->getOpcode() == Instruction::ZExt || 6804 I->getOpcode() == Instruction::SExt)) 6805 return 0; 6806 } 6807 6808 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6809 } 6810 case Instruction::Call: 6811 return getVectorCallCost(cast<CallInst>(I), VF); 6812 case Instruction::ExtractValue: 6813 return TTI.getInstructionCost(I, CostKind); 6814 case Instruction::Alloca: 6815 // We cannot easily widen alloca to a scalable alloca, as 6816 // the result would need to be a vector of pointers. 6817 if (VF.isScalable()) 6818 return InstructionCost::getInvalid(); 6819 [[fallthrough]]; 6820 default: 6821 // This opcode is unknown. Assume that it is the same as 'mul'. 6822 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6823 } // end of switch. 6824 } 6825 6826 void LoopVectorizationCostModel::collectValuesToIgnore() { 6827 // Ignore ephemeral values. 
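// Ephemeral values are those used only to feed llvm.assume-style intrinsics;
// they do not contribute to the real computation, so they are excluded from
// costing.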
6828 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6829 6830 SmallVector<Value *, 4> DeadInterleavePointerOps; 6831 SmallVector<Value *, 4> DeadOps; 6832 6833 // If a scalar epilogue is required, users outside the loop won't use 6834 // live-outs from the vector loop but from the scalar epilogue. Ignore them if 6835 // that is the case. 6836 bool RequiresScalarEpilogue = requiresScalarEpilogue(true); 6837 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) { 6838 return RequiresScalarEpilogue && 6839 !TheLoop->contains(cast<Instruction>(U)->getParent()); 6840 }; 6841 6842 LoopBlocksDFS DFS(TheLoop); 6843 DFS.perform(LI); 6844 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps; 6845 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO()))) 6846 for (Instruction &I : reverse(*BB)) { 6847 // Find all stores to invariant variables. Since they are going to sink 6848 // outside the loop we do not need calculate cost for them. 6849 StoreInst *SI; 6850 if ((SI = dyn_cast<StoreInst>(&I)) && 6851 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { 6852 ValuesToIgnore.insert(&I); 6853 DeadInvariantStoreOps[SI->getPointerOperand()].push_back( 6854 SI->getValueOperand()); 6855 } 6856 6857 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I)) 6858 continue; 6859 6860 // Add instructions that would be trivially dead and are only used by 6861 // values already ignored to DeadOps to seed worklist. 6862 if (wouldInstructionBeTriviallyDead(&I, TLI) && 6863 all_of(I.users(), [this, IsLiveOutDead](User *U) { 6864 return VecValuesToIgnore.contains(U) || 6865 ValuesToIgnore.contains(U) || IsLiveOutDead(U); 6866 })) 6867 DeadOps.push_back(&I); 6868 6869 // For interleave groups, we only create a pointer for the start of the 6870 // interleave group. Queue up addresses of group members except the insert 6871 // position for further processing. 6872 if (isAccessInterleaved(&I)) { 6873 auto *Group = getInterleavedAccessGroup(&I); 6874 if (Group->getInsertPos() == &I) 6875 continue; 6876 Value *PointerOp = getLoadStorePointerOperand(&I); 6877 DeadInterleavePointerOps.push_back(PointerOp); 6878 } 6879 6880 // Queue branches for analysis. They are dead, if their successors only 6881 // contain dead instructions. 6882 if (auto *Br = dyn_cast<BranchInst>(&I)) { 6883 if (Br->isConditional()) 6884 DeadOps.push_back(&I); 6885 } 6886 } 6887 6888 // Mark ops feeding interleave group members as free, if they are only used 6889 // by other dead computations. 6890 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { 6891 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]); 6892 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) { 6893 Instruction *UI = cast<Instruction>(U); 6894 return !VecValuesToIgnore.contains(U) && 6895 (!isAccessInterleaved(UI) || 6896 getInterleavedAccessGroup(UI)->getInsertPos() == UI); 6897 })) 6898 continue; 6899 VecValuesToIgnore.insert(Op); 6900 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); 6901 } 6902 6903 for (const auto &[_, Ops] : DeadInvariantStoreOps) { 6904 for (Value *Op : ArrayRef(Ops).drop_back()) 6905 DeadOps.push_back(Op); 6906 } 6907 // Mark ops that would be trivially dead and are only used by ignored 6908 // instructions as free. 6909 BasicBlock *Header = TheLoop->getHeader(); 6910 6911 // Returns true if the block contains only dead instructions. 
Such blocks will 6912 // be removed by VPlan-to-VPlan transforms and won't be considered by the 6913 // VPlan-based cost model, so skip them in the legacy cost-model as well. 6914 auto IsEmptyBlock = [this](BasicBlock *BB) { 6915 return all_of(*BB, [this](Instruction &I) { 6916 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) || 6917 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional()); 6918 }); 6919 }; 6920 for (unsigned I = 0; I != DeadOps.size(); ++I) { 6921 auto *Op = dyn_cast<Instruction>(DeadOps[I]); 6922 6923 // Check if the branch should be considered dead. 6924 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) { 6925 BasicBlock *ThenBB = Br->getSuccessor(0); 6926 BasicBlock *ElseBB = Br->getSuccessor(1); 6927 // Don't considers branches leaving the loop for simplification. 6928 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB)) 6929 continue; 6930 bool ThenEmpty = IsEmptyBlock(ThenBB); 6931 bool ElseEmpty = IsEmptyBlock(ElseBB); 6932 if ((ThenEmpty && ElseEmpty) || 6933 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB && 6934 ElseBB->phis().empty()) || 6935 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB && 6936 ThenBB->phis().empty())) { 6937 VecValuesToIgnore.insert(Br); 6938 DeadOps.push_back(Br->getCondition()); 6939 } 6940 continue; 6941 } 6942 6943 // Skip any op that shouldn't be considered dead. 6944 if (!Op || !TheLoop->contains(Op) || 6945 (isa<PHINode>(Op) && Op->getParent() == Header) || 6946 !wouldInstructionBeTriviallyDead(Op, TLI) || 6947 any_of(Op->users(), [this, IsLiveOutDead](User *U) { 6948 return !VecValuesToIgnore.contains(U) && 6949 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U); 6950 })) 6951 continue; 6952 6953 if (!TheLoop->contains(Op->getParent())) 6954 continue; 6955 6956 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore 6957 // which applies for both scalar and vector versions. Otherwise it is only 6958 // dead in vector versions, so only add it to VecValuesToIgnore. 6959 if (all_of(Op->users(), 6960 [this](User *U) { return ValuesToIgnore.contains(U); })) 6961 ValuesToIgnore.insert(Op); 6962 6963 VecValuesToIgnore.insert(Op); 6964 DeadOps.append(Op->op_begin(), Op->op_end()); 6965 } 6966 6967 // Ignore type-promoting instructions we identified during reduction 6968 // detection. 6969 for (const auto &Reduction : Legal->getReductionVars()) { 6970 const RecurrenceDescriptor &RedDes = Reduction.second; 6971 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6972 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6973 } 6974 // Ignore type-casting instructions we identified during induction 6975 // detection. 6976 for (const auto &Induction : Legal->getInductionVars()) { 6977 const InductionDescriptor &IndDes = Induction.second; 6978 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6979 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6980 } 6981 } 6982 6983 void LoopVectorizationCostModel::collectInLoopReductions() { 6984 for (const auto &Reduction : Legal->getReductionVars()) { 6985 PHINode *Phi = Reduction.first; 6986 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6987 6988 // We don't collect reductions that are type promoted (yet). 6989 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6990 continue; 6991 6992 // If the target would prefer this reduction to happen "in-loop", then we 6993 // want to record it as such. 
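    // For illustration (added note): an in-loop reduction keeps the
    // accumulator scalar and reduces each vector of partial values inside the
    // loop, e.g.
    //   acc += llvm.vector.reduce.add(<4 x i32> %vec.chunk)
    // whereas an out-of-loop reduction keeps a vector accumulator and emits a
    // single horizontal reduction after the loop.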
6994 unsigned Opcode = RdxDesc.getOpcode(); 6995 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 6996 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 6997 TargetTransformInfo::ReductionFlags())) 6998 continue; 6999 7000 // Check that we can correctly put the reductions into the loop, by 7001 // finding the chain of operations that leads from the phi to the loop 7002 // exit value. 7003 SmallVector<Instruction *, 4> ReductionOperations = 7004 RdxDesc.getReductionOpChain(Phi, TheLoop); 7005 bool InLoop = !ReductionOperations.empty(); 7006 7007 if (InLoop) { 7008 InLoopReductions.insert(Phi); 7009 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7010 Instruction *LastChain = Phi; 7011 for (auto *I : ReductionOperations) { 7012 InLoopReductionImmediateChains[I] = LastChain; 7013 LastChain = I; 7014 } 7015 } 7016 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7017 << " reduction for phi: " << *Phi << "\n"); 7018 } 7019 } 7020 7021 // This function will select a scalable VF if the target supports scalable 7022 // vectors and a fixed one otherwise. 7023 // TODO: we could return a pair of values that specify the max VF and 7024 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7025 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7026 // doesn't have a cost model that can choose which plan to execute if 7027 // more than one is generated. 7028 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, 7029 LoopVectorizationCostModel &CM) { 7030 unsigned WidestType; 7031 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7032 7033 TargetTransformInfo::RegisterKind RegKind = 7034 TTI.enableScalableVectorization() 7035 ? TargetTransformInfo::RGK_ScalableVector 7036 : TargetTransformInfo::RGK_FixedWidthVector; 7037 7038 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); 7039 unsigned N = RegSize.getKnownMinValue() / WidestType; 7040 return ElementCount::get(N, RegSize.isScalable()); 7041 } 7042 7043 VectorizationFactor 7044 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7045 ElementCount VF = UserVF; 7046 // Outer loop handling: They may require CFG and instruction level 7047 // transformations before even evaluating whether vectorization is profitable. 7048 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7049 // the vectorization pipeline. 7050 if (!OrigLoop->isInnermost()) { 7051 // If the user doesn't provide a vectorization factor, determine a 7052 // reasonable one. 7053 if (UserVF.isZero()) { 7054 VF = determineVPlanVF(TTI, CM); 7055 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7056 7057 // Make sure we have a VF > 1 for stress testing. 7058 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7059 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7060 << "overriding computed VF.\n"); 7061 VF = ElementCount::getFixed(4); 7062 } 7063 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && 7064 !ForceTargetSupportsScalableVectors) { 7065 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. 
Scalable VF requested, but " 7066 << "not supported by the target.\n"); 7067 reportVectorizationFailure( 7068 "Scalable vectorization requested but not supported by the target", 7069 "the scalable user-specified vectorization width for outer-loop " 7070 "vectorization cannot be used because the target does not support " 7071 "scalable vectors.", 7072 "ScalableVFUnfeasible", ORE, OrigLoop); 7073 return VectorizationFactor::Disabled(); 7074 } 7075 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7076 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7077 "VF needs to be a power of two"); 7078 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7079 << "VF " << VF << " to build VPlans.\n"); 7080 buildVPlans(VF, VF); 7081 7082 // For VPlan build stress testing, we bail out after VPlan construction. 7083 if (VPlanBuildStressTest) 7084 return VectorizationFactor::Disabled(); 7085 7086 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7087 } 7088 7089 LLVM_DEBUG( 7090 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7091 "VPlan-native path.\n"); 7092 return VectorizationFactor::Disabled(); 7093 } 7094 7095 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7096 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7097 CM.collectValuesToIgnore(); 7098 CM.collectElementTypesForWidening(); 7099 7100 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7101 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7102 return; 7103 7104 // Invalidate interleave groups if all blocks of loop will be predicated. 7105 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7106 !useMaskedInterleavedAccesses(TTI)) { 7107 LLVM_DEBUG( 7108 dbgs() 7109 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7110 "which requires masked-interleaved support.\n"); 7111 if (CM.InterleaveInfo.invalidateGroups()) 7112 // Invalidating interleave groups also requires invalidating all decisions 7113 // based on them, which includes widening decisions and uniform and scalar 7114 // values. 7115 CM.invalidateCostModelingDecisions(); 7116 } 7117 7118 if (CM.foldTailByMasking()) 7119 Legal->prepareToFoldTailByMasking(); 7120 7121 ElementCount MaxUserVF = 7122 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7123 if (UserVF) { 7124 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) { 7125 reportVectorizationInfo( 7126 "UserVF ignored because it may be larger than the maximal safe VF", 7127 "InvalidUserVF", ORE, OrigLoop); 7128 } else { 7129 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7130 "VF needs to be a power of two"); 7131 // Collect the instructions (and their associated costs) that will be more 7132 // profitable to scalarize. 7133 CM.collectInLoopReductions(); 7134 if (CM.selectUserVectorizationFactor(UserVF)) { 7135 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7136 buildVPlansWithVPRecipes(UserVF, UserVF); 7137 LLVM_DEBUG(printPlans(dbgs())); 7138 return; 7139 } 7140 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7141 "InvalidCost", ORE, OrigLoop); 7142 } 7143 } 7144 7145 // Collect the Vectorization Factor Candidates. 
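  // For illustration (added note): with MaxFactors.FixedVF = 16 and
  // MaxFactors.ScalableVF = vscale x 4, the loops below collect the
  // power-of-two candidates 1, 2, 4, 8, 16 and vscale x 1, vscale x 2,
  // vscale x 4.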
7146 SmallVector<ElementCount> VFCandidates; 7147 for (auto VF = ElementCount::getFixed(1); 7148 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7149 VFCandidates.push_back(VF); 7150 for (auto VF = ElementCount::getScalable(1); 7151 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7152 VFCandidates.push_back(VF); 7153 7154 CM.collectInLoopReductions(); 7155 for (const auto &VF : VFCandidates) { 7156 // Collect Uniform and Scalar instructions after vectorization with VF. 7157 CM.collectUniformsAndScalars(VF); 7158 7159 // Collect the instructions (and their associated costs) that will be more 7160 // profitable to scalarize. 7161 if (VF.isVector()) 7162 CM.collectInstsToScalarize(VF); 7163 } 7164 7165 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7166 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7167 7168 LLVM_DEBUG(printPlans(dbgs())); 7169 } 7170 7171 InstructionCost VPCostContext::getLegacyCost(Instruction *UI, 7172 ElementCount VF) const { 7173 if (ForceTargetInstructionCost.getNumOccurrences()) 7174 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences()); 7175 return CM.getInstructionCost(UI, VF); 7176 } 7177 7178 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { 7179 return CM.ValuesToIgnore.contains(UI) || 7180 (IsVector && CM.VecValuesToIgnore.contains(UI)) || 7181 SkipCostComputation.contains(UI); 7182 } 7183 7184 InstructionCost 7185 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, 7186 VPCostContext &CostCtx) const { 7187 InstructionCost Cost; 7188 // Cost modeling for inductions is inaccurate in the legacy cost model 7189 // compared to the recipes that are generated. To match here initially during 7190 // VPlan cost model bring up directly use the induction costs from the legacy 7191 // cost model. Note that we do this as pre-processing; the VPlan may not have 7192 // any recipes associated with the original induction increment instruction 7193 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute 7194 // the cost of induction phis and increments (both that are represented by 7195 // recipes and those that are not), to avoid distinguishing between them here, 7196 // and skip all recipes that represent induction phis and increments (the 7197 // former case) later on, if they exist, to avoid counting them twice. 7198 // Similarly we pre-compute the cost of any optimized truncates. 7199 // TODO: Switch to more accurate costing based on VPlan. 7200 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) { 7201 Instruction *IVInc = cast<Instruction>( 7202 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 7203 SmallVector<Instruction *> IVInsts = {IVInc}; 7204 for (unsigned I = 0; I != IVInsts.size(); I++) { 7205 for (Value *Op : IVInsts[I]->operands()) { 7206 auto *OpI = dyn_cast<Instruction>(Op); 7207 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse()) 7208 continue; 7209 IVInsts.push_back(OpI); 7210 } 7211 } 7212 IVInsts.push_back(IV); 7213 for (User *U : IV->users()) { 7214 auto *CI = cast<Instruction>(U); 7215 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF)) 7216 continue; 7217 IVInsts.push_back(CI); 7218 } 7219 7220 // If the vector loop gets executed exactly once with the given VF, ignore 7221 // the costs of comparison and induction instructions, as they'll get 7222 // simplified away. 
7223 // TODO: Remove this code after stepping away from the legacy cost model and 7224 // adding code to simplify VPlans before calculating their costs. 7225 auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop); 7226 if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking()) 7227 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(), 7228 CostCtx.SkipCostComputation); 7229 7230 for (Instruction *IVInst : IVInsts) { 7231 if (CostCtx.skipCostComputation(IVInst, VF.isVector())) 7232 continue; 7233 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF); 7234 LLVM_DEBUG({ 7235 dbgs() << "Cost of " << InductionCost << " for VF " << VF 7236 << ": induction instruction " << *IVInst << "\n"; 7237 }); 7238 Cost += InductionCost; 7239 CostCtx.SkipCostComputation.insert(IVInst); 7240 } 7241 } 7242 7243 /// Compute the cost of all exiting conditions of the loop using the legacy 7244 /// cost model. This is to match the legacy behavior, which adds the cost of 7245 /// all exit conditions. Note that this over-estimates the cost, as there will 7246 /// be a single condition to control the vector loop. 7247 SmallVector<BasicBlock *> Exiting; 7248 CM.TheLoop->getExitingBlocks(Exiting); 7249 SetVector<Instruction *> ExitInstrs; 7250 // Collect all exit conditions. 7251 for (BasicBlock *EB : Exiting) { 7252 auto *Term = dyn_cast<BranchInst>(EB->getTerminator()); 7253 if (!Term) 7254 continue; 7255 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) { 7256 ExitInstrs.insert(CondI); 7257 } 7258 } 7259 // Compute the cost of all instructions only feeding the exit conditions. 7260 for (unsigned I = 0; I != ExitInstrs.size(); ++I) { 7261 Instruction *CondI = ExitInstrs[I]; 7262 if (!OrigLoop->contains(CondI) || 7263 !CostCtx.SkipCostComputation.insert(CondI).second) 7264 continue; 7265 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF); 7266 LLVM_DEBUG({ 7267 dbgs() << "Cost of " << CondICost << " for VF " << VF 7268 << ": exit condition instruction " << *CondI << "\n"; 7269 }); 7270 Cost += CondICost; 7271 for (Value *Op : CondI->operands()) { 7272 auto *OpI = dyn_cast<Instruction>(Op); 7273 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) { 7274 return OrigLoop->contains(cast<Instruction>(U)->getParent()) && 7275 !ExitInstrs.contains(cast<Instruction>(U)); 7276 })) 7277 continue; 7278 ExitInstrs.insert(OpI); 7279 } 7280 } 7281 7282 // The legacy cost model has special logic to compute the cost of in-loop 7283 // reductions, which may be smaller than the sum of all instructions involved 7284 // in the reduction. 7285 // TODO: Switch to costing based on VPlan once the logic has been ported. 7286 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) { 7287 if (ForceTargetInstructionCost.getNumOccurrences()) 7288 continue; 7289 7290 if (!CM.isInLoopReduction(RedPhi)) 7291 continue; 7292 7293 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop); 7294 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(), 7295 ChainOps.end()); 7296 auto IsZExtOrSExt = [](const unsigned Opcode) -> bool { 7297 return Opcode == Instruction::ZExt || Opcode == Instruction::SExt; 7298 }; 7299 // Also include the operands of instructions in the chain, as the cost-model 7300 // may mark extends as free. 7301 // 7302 // For ARM, some of the instruction can folded into the reducion 7303 // instruction. So we need to mark all folded instructions free. 
7304 // For example: We can fold reduce(mul(ext(A), ext(B))) into one 7305 // instruction. 7306 for (auto *ChainOp : ChainOps) { 7307 for (Value *Op : ChainOp->operands()) { 7308 if (auto *I = dyn_cast<Instruction>(Op)) { 7309 ChainOpsAndOperands.insert(I); 7310 if (I->getOpcode() == Instruction::Mul) { 7311 auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0)); 7312 auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1)); 7313 if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 && 7314 Ext0->getOpcode() == Ext1->getOpcode()) { 7315 ChainOpsAndOperands.insert(Ext0); 7316 ChainOpsAndOperands.insert(Ext1); 7317 } 7318 } 7319 } 7320 } 7321 } 7322 7323 // Pre-compute the cost for I, if it has a reduction pattern cost. 7324 for (Instruction *I : ChainOpsAndOperands) { 7325 auto ReductionCost = 7326 CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF)); 7327 if (!ReductionCost) 7328 continue; 7329 7330 assert(!CostCtx.SkipCostComputation.contains(I) && 7331 "reduction op visited multiple times"); 7332 CostCtx.SkipCostComputation.insert(I); 7333 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF 7334 << ":\n in-loop reduction " << *I << "\n"); 7335 Cost += *ReductionCost; 7336 } 7337 } 7338 7339 // Pre-compute the costs for branches except for the backedge, as the number 7340 // of replicate regions in a VPlan may not directly match the number of 7341 // branches, which would lead to different decisions. 7342 // TODO: Compute cost of branches for each replicate region in the VPlan, 7343 // which is more accurate than the legacy cost model. 7344 for (BasicBlock *BB : OrigLoop->blocks()) { 7345 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector())) 7346 continue; 7347 CostCtx.SkipCostComputation.insert(BB->getTerminator()); 7348 if (BB == OrigLoop->getLoopLatch()) 7349 continue; 7350 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF); 7351 Cost += BranchCost; 7352 } 7353 7354 // Pre-compute costs for instructions that are forced-scalar or profitable to 7355 // scalarize. Their costs will be computed separately in the legacy cost 7356 // model. 7357 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) { 7358 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector())) 7359 continue; 7360 CostCtx.SkipCostComputation.insert(ForcedScalar); 7361 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF); 7362 LLVM_DEBUG({ 7363 dbgs() << "Cost of " << ForcedCost << " for VF " << VF 7364 << ": forced scalar " << *ForcedScalar << "\n"; 7365 }); 7366 Cost += ForcedCost; 7367 } 7368 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) { 7369 if (CostCtx.skipCostComputation(Scalarized, VF.isVector())) 7370 continue; 7371 CostCtx.SkipCostComputation.insert(Scalarized); 7372 LLVM_DEBUG({ 7373 dbgs() << "Cost of " << ScalarCost << " for VF " << VF 7374 << ": profitable to scalarize " << *Scalarized << "\n"; 7375 }); 7376 Cost += ScalarCost; 7377 } 7378 7379 return Cost; 7380 } 7381 7382 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, 7383 ElementCount VF) const { 7384 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, 7385 CM.CostKind); 7386 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); 7387 7388 // Now compute and add the VPlan-based cost. 
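  // For illustration (added note): the total cost is the legacy-model portion
  // precomputed above plus the sum of the VPlan recipe costs added below; the
  // debug output then divides by the estimated runtime VF, e.g. a cost of 20
  // at an estimated width of 4 prints as 5.0 per lane.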
7389 Cost += Plan.cost(VF, CostCtx); 7390 #ifndef NDEBUG 7391 unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF); 7392 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost 7393 << " (Estimated cost per lane: "); 7394 if (Cost.isValid()) { 7395 double CostPerLane = double(*Cost.getValue()) / EstimatedWidth; 7396 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane)); 7397 } else /* No point dividing an invalid cost - it will still be invalid */ 7398 LLVM_DEBUG(dbgs() << "Invalid"); 7399 LLVM_DEBUG(dbgs() << ")\n"); 7400 #endif 7401 return Cost; 7402 } 7403 7404 #ifndef NDEBUG 7405 /// Return true if the original loop \ TheLoop contains any instructions that do 7406 /// not have corresponding recipes in \p Plan and are not marked to be ignored 7407 /// in \p CostCtx. This means the VPlan contains simplification that the legacy 7408 /// cost-model did not account for. 7409 static bool planContainsAdditionalSimplifications(VPlan &Plan, 7410 VPCostContext &CostCtx, 7411 Loop *TheLoop) { 7412 // First collect all instructions for the recipes in Plan. 7413 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * { 7414 if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) 7415 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 7416 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) 7417 return &WidenMem->getIngredient(); 7418 return nullptr; 7419 }; 7420 7421 DenseSet<Instruction *> SeenInstrs; 7422 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()); 7423 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 7424 for (VPRecipeBase &R : *VPBB) { 7425 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) { 7426 auto *IG = IR->getInterleaveGroup(); 7427 unsigned NumMembers = IG->getNumMembers(); 7428 for (unsigned I = 0; I != NumMembers; ++I) { 7429 if (Instruction *M = IG->getMember(I)) 7430 SeenInstrs.insert(M); 7431 } 7432 continue; 7433 } 7434 // The VPlan-based cost model is more accurate for partial reduction and 7435 // comparing against the legacy cost isn't desirable. 7436 if (isa<VPPartialReductionRecipe>(&R)) 7437 return true; 7438 if (Instruction *UI = GetInstructionForCost(&R)) 7439 SeenInstrs.insert(UI); 7440 } 7441 } 7442 7443 // Return true if the loop contains any instructions that are not also part of 7444 // the VPlan or are skipped for VPlan-based cost computations. This indicates 7445 // that the VPlan contains extra simplifications. 7446 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx, 7447 TheLoop](BasicBlock *BB) { 7448 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) { 7449 if (isa<PHINode>(&I) && BB == TheLoop->getHeader()) 7450 return false; 7451 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true); 7452 }); 7453 }); 7454 } 7455 #endif 7456 7457 VectorizationFactor LoopVectorizationPlanner::computeBestVF() { 7458 if (VPlans.empty()) 7459 return VectorizationFactor::Disabled(); 7460 // If there is a single VPlan with a single VF, return it directly. 7461 VPlan &FirstPlan = *VPlans[0]; 7462 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1) 7463 return {*FirstPlan.vectorFactors().begin(), 0, 0}; 7464 7465 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: " 7466 << (CM.CostKind == TTI::TCK_RecipThroughput 7467 ? "Reciprocal Throughput\n" 7468 : CM.CostKind == TTI::TCK_Latency 7469 ? "Instruction Latency\n" 7470 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n" 7471 : CM.CostKind == TTI::TCK_SizeAndLatency 7472 ? 
"Code Size and Latency\n" 7473 : "Unknown\n")); 7474 7475 ElementCount ScalarVF = ElementCount::getFixed(1); 7476 assert(hasPlanWithVF(ScalarVF) && 7477 "More than a single plan/VF w/o any plan having scalar VF"); 7478 7479 // TODO: Compute scalar cost using VPlan-based cost model. 7480 InstructionCost ScalarCost = CM.expectedCost(ScalarVF); 7481 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n"); 7482 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost); 7483 VectorizationFactor BestFactor = ScalarFactor; 7484 7485 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 7486 if (ForceVectorization) { 7487 // Ignore scalar width, because the user explicitly wants vectorization. 7488 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 7489 // evaluation. 7490 BestFactor.Cost = InstructionCost::getMax(); 7491 } 7492 7493 for (auto &P : VPlans) { 7494 for (ElementCount VF : P->vectorFactors()) { 7495 if (VF.isScalar()) 7496 continue; 7497 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { 7498 LLVM_DEBUG( 7499 dbgs() 7500 << "LV: Not considering vector loop of width " << VF 7501 << " because it will not generate any vector instructions.\n"); 7502 continue; 7503 } 7504 7505 InstructionCost Cost = cost(*P, VF); 7506 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); 7507 if (isMoreProfitable(CurrentFactor, BestFactor)) 7508 BestFactor = CurrentFactor; 7509 7510 // If profitable add it to ProfitableVF list. 7511 if (isMoreProfitable(CurrentFactor, ScalarFactor)) 7512 ProfitableVFs.push_back(CurrentFactor); 7513 } 7514 } 7515 7516 #ifndef NDEBUG 7517 // Select the optimal vectorization factor according to the legacy cost-model. 7518 // This is now only used to verify the decisions by the new VPlan-based 7519 // cost-model and will be retired once the VPlan-based cost-model is 7520 // stabilized. 7521 VectorizationFactor LegacyVF = selectVectorizationFactor(); 7522 VPlan &BestPlan = getPlanFor(BestFactor.Width); 7523 7524 // Pre-compute the cost and use it to check if BestPlan contains any 7525 // simplifications not accounted for in the legacy cost model. If that's the 7526 // case, don't trigger the assertion, as the extra simplifications may cause a 7527 // different VF to be picked by the VPlan-based cost model. 7528 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, 7529 CM.CostKind); 7530 precomputeCosts(BestPlan, BestFactor.Width, CostCtx); 7531 assert((BestFactor.Width == LegacyVF.Width || 7532 planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), 7533 CostCtx, OrigLoop) || 7534 planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width), 7535 CostCtx, OrigLoop)) && 7536 " VPlan cost model and legacy cost model disagreed"); 7537 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && 7538 "when vectorizing, the scalar cost must be computed."); 7539 #endif 7540 7541 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n"); 7542 return BestFactor; 7543 } 7544 7545 static void addRuntimeUnrollDisableMetaData(Loop *L) { 7546 SmallVector<Metadata *, 4> MDs; 7547 // Reserve first location for self reference to the LoopID metadata node. 7548 MDs.push_back(nullptr); 7549 bool IsUnrollMetadata = false; 7550 MDNode *LoopID = L->getLoopID(); 7551 if (LoopID) { 7552 // First find existing loop unrolling disable metadata. 
7553 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { 7554 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I)); 7555 if (MD) { 7556 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7557 IsUnrollMetadata = 7558 S && S->getString().starts_with("llvm.loop.unroll.disable"); 7559 } 7560 MDs.push_back(LoopID->getOperand(I)); 7561 } 7562 } 7563 7564 if (!IsUnrollMetadata) { 7565 // Add runtime unroll disable metadata. 7566 LLVMContext &Context = L->getHeader()->getContext(); 7567 SmallVector<Metadata *, 1> DisableOperands; 7568 DisableOperands.push_back( 7569 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7570 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7571 MDs.push_back(DisableNode); 7572 MDNode *NewLoopID = MDNode::get(Context, MDs); 7573 // Set operand 0 to refer to the loop id itself. 7574 NewLoopID->replaceOperandWith(0, NewLoopID); 7575 L->setLoopID(NewLoopID); 7576 } 7577 } 7578 7579 // If \p R is a ComputeReductionResult when vectorizing the epilog loop, 7580 // fix the reduction's scalar PHI node by adding the incoming value from the 7581 // main vector loop. 7582 static void fixReductionScalarResumeWhenVectorizingEpilog( 7583 VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, 7584 BasicBlock *BypassBlock) { 7585 auto *EpiRedResult = dyn_cast<VPInstruction>(R); 7586 if (!EpiRedResult || 7587 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult) 7588 return; 7589 7590 auto *EpiRedHeaderPhi = 7591 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0)); 7592 const RecurrenceDescriptor &RdxDesc = 7593 EpiRedHeaderPhi->getRecurrenceDescriptor(); 7594 Value *MainResumeValue = 7595 EpiRedHeaderPhi->getStartValue()->getUnderlyingValue(); 7596 if (RecurrenceDescriptor::isAnyOfRecurrenceKind( 7597 RdxDesc.getRecurrenceKind())) { 7598 auto *Cmp = cast<ICmpInst>(MainResumeValue); 7599 assert(Cmp->getPredicate() == CmpInst::ICMP_NE && 7600 "AnyOf expected to start with ICMP_NE"); 7601 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() && 7602 "AnyOf expected to start by comparing main resume value to original " 7603 "start value"); 7604 MainResumeValue = Cmp->getOperand(0); 7605 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( 7606 RdxDesc.getRecurrenceKind())) { 7607 using namespace llvm::PatternMatch; 7608 Value *Cmp, *OrigResumeV; 7609 bool IsExpectedPattern = 7610 match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), 7611 m_Specific(RdxDesc.getSentinelValue()), 7612 m_Value(OrigResumeV))) && 7613 match(Cmp, 7614 m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), 7615 m_Specific(RdxDesc.getRecurrenceStartValue()))); 7616 assert(IsExpectedPattern && "Unexpected reduction resume pattern"); 7617 (void)IsExpectedPattern; 7618 MainResumeValue = OrigResumeV; 7619 } 7620 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue); 7621 7622 // When fixing reductions in the epilogue loop we should already have 7623 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry 7624 // over the incoming values correctly. 
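  // For illustration (added note, assuming the usual shape of the merge phi):
  //   %bc.merge.rdx = phi [ %rdx.result, %middle.block ], [ %start, %bypass ]
  // The epilogue's resume phi located below must receive the same incoming
  // value for the bypass block.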
7625 using namespace VPlanPatternMatch; 7626 auto IsResumePhi = [](VPUser *U) { 7627 return match( 7628 U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue())); 7629 }; 7630 assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 && 7631 "ResumePhi must have a single user"); 7632 auto *EpiResumePhiVPI = 7633 cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi)); 7634 auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true)); 7635 EpiResumePhi->setIncomingValueForBlock( 7636 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock)); 7637 } 7638 7639 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( 7640 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7641 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue, 7642 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7643 assert(BestVPlan.hasVF(BestVF) && 7644 "Trying to execute plan with unsupported VF"); 7645 assert(BestVPlan.hasUF(BestUF) && 7646 "Trying to execute plan with unsupported UF"); 7647 assert( 7648 ((VectorizingEpilogue && ExpandedSCEVs) || 7649 (!VectorizingEpilogue && !ExpandedSCEVs)) && 7650 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7651 7652 // TODO: Move to VPlan transform stage once the transition to the VPlan-based 7653 // cost model is complete for better cost estimates. 7654 VPlanTransforms::unrollByUF(BestVPlan, BestUF, 7655 OrigLoop->getHeader()->getContext()); 7656 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7657 VPlanTransforms::convertToConcreteRecipes(BestVPlan); 7658 7659 // Perform the actual loop transformation. 7660 VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV, 7661 &BestVPlan, OrigLoop->getParentLoop(), 7662 Legal->getWidestInductionType()); 7663 7664 #ifdef EXPENSIVE_CHECKS 7665 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 7666 #endif 7667 7668 // 0. Generate SCEV-dependent code in the entry, including TripCount, before 7669 // making any changes to the CFG. 7670 if (!BestVPlan.getEntry()->empty()) 7671 BestVPlan.getEntry()->execute(&State); 7672 7673 if (!ILV.getTripCount()) 7674 ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0))); 7675 else 7676 assert(VectorizingEpilogue && "should only re-use the existing trip " 7677 "count during epilogue vectorization"); 7678 7679 // 1. Set up the skeleton for vectorization, including vector pre-header and 7680 // middle block. The vector loop is created during VPlan execution. 7681 VPBasicBlock *VectorPH = 7682 cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor()); 7683 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton( 7684 ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs); 7685 if (VectorizingEpilogue) 7686 VPlanTransforms::removeDeadRecipes(BestVPlan); 7687 7688 // Only use noalias metadata when using memory checks guaranteeing no overlap 7689 // across all iterations. 7690 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7691 std::unique_ptr<LoopVersioning> LVer = nullptr; 7692 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7693 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7694 7695 // We currently don't use LoopVersioning for the actual loop cloning but we 7696 // still use it to add the noalias metadata. 7697 // TODO: Find a better way to re-use LoopVersioning functionality to add 7698 // metadata. 
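    // For illustration (added note): the scopes prepared here allow the
    // vectorized memory accesses to be annotated with !alias.scope / !noalias
    // metadata, encoding that the pointer groups covered by the runtime checks
    // do not overlap.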
7699 LVer = std::make_unique<LoopVersioning>( 7700 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7701 PSE.getSE()); 7702 State.LVer = &*LVer; 7703 State.LVer->prepareNoAliasMetadata(); 7704 } 7705 7706 ILV.printDebugTracesAtStart(); 7707 7708 //===------------------------------------------------===// 7709 // 7710 // Notice: any optimization or new instruction that go 7711 // into the code below should also be implemented in 7712 // the cost-model. 7713 // 7714 //===------------------------------------------------===// 7715 7716 // 2. Copy and widen instructions from the old loop into the new loop. 7717 BestVPlan.prepareToExecute( 7718 ILV.getTripCount(), 7719 ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); 7720 replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); 7721 7722 BestVPlan.execute(&State); 7723 7724 auto *MiddleVPBB = BestVPlan.getMiddleBlock(); 7725 // 2.5 When vectorizing the epilogue, fix reduction and induction resume 7726 // values from the additional bypass block. 7727 if (VectorizingEpilogue) { 7728 assert(!ILV.Legal->hasUncountableEarlyExit() && 7729 "Epilogue vectorisation not yet supported with early exits"); 7730 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); 7731 for (VPRecipeBase &R : *MiddleVPBB) { 7732 fixReductionScalarResumeWhenVectorizingEpilog( 7733 &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock); 7734 } 7735 BasicBlock *PH = OrigLoop->getLoopPreheader(); 7736 for (const auto &[IVPhi, _] : Legal->getInductionVars()) { 7737 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); 7738 Value *V = ILV.getInductionAdditionalBypassValue(IVPhi); 7739 Inc->setIncomingValueForBlock(BypassBlock, V); 7740 } 7741 } 7742 7743 // 2.6. Maintain Loop Hints 7744 // Keep all loop hints from the original loop on the vector loop (we'll 7745 // replace the vectorizer-specific hints below). 7746 if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) { 7747 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7748 7749 std::optional<MDNode *> VectorizedLoopID = 7750 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7751 LLVMLoopVectorizeFollowupVectorized}); 7752 7753 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); 7754 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7755 if (VectorizedLoopID) { 7756 L->setLoopID(*VectorizedLoopID); 7757 } else { 7758 // Keep all loop hints from the original loop on the vector loop (we'll 7759 // replace the vectorizer-specific hints below). 7760 if (MDNode *LID = OrigLoop->getLoopID()) 7761 L->setLoopID(LID); 7762 7763 LoopVectorizeHints Hints(L, true, *ORE); 7764 Hints.setAlreadyVectorized(); 7765 } 7766 TargetTransformInfo::UnrollingPreferences UP; 7767 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7768 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) 7769 addRuntimeUnrollDisableMetaData(L); 7770 } 7771 7772 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7773 // predication, updating analyses. 7774 ILV.fixVectorizedLoop(State); 7775 7776 ILV.printDebugTracesAtEnd(); 7777 7778 // 4. Adjust branch weight of the branch in the middle block. 7779 if (BestVPlan.getVectorLoopRegion()) { 7780 auto *MiddleVPBB = BestVPlan.getMiddleBlock(); 7781 auto *MiddleTerm = 7782 cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); 7783 if (MiddleTerm->isConditional() && 7784 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 7785 // Assume that `Count % VectorTripCount` is equally distributed. 
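      // Worked example (added note): with VF = 4 and UF = 2 the vector step
      // below is 8; assuming the remainder is uniform in [0, 8), it is zero
      // with probability 1/8, giving branch weights {1, 7}.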
7786 unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); 7787 assert(TripCount > 0 && "trip count should not be zero"); 7788 const uint32_t Weights[] = {1, TripCount - 1}; 7789 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); 7790 } 7791 } 7792 7793 return State.ExpandedSCEVs; 7794 } 7795 7796 //===--------------------------------------------------------------------===// 7797 // EpilogueVectorizerMainLoop 7798 //===--------------------------------------------------------------------===// 7799 7800 /// This function is partially responsible for generating the control flow 7801 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7802 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7803 const SCEV2ValueTy &ExpandedSCEVs) { 7804 createVectorLoopSkeleton(""); 7805 7806 // Generate the code to check the minimum iteration count of the vector 7807 // epilogue (see below). 7808 EPI.EpilogueIterationCountCheck = 7809 emitIterationCountCheck(LoopScalarPreHeader, true); 7810 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7811 7812 // Generate the code to check any assumptions that we've made for SCEV 7813 // expressions. 7814 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7815 7816 // Generate the code that checks at runtime if arrays overlap. We put the 7817 // checks into a separate block to make the more common case of few elements 7818 // faster. 7819 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7820 7821 // Generate the iteration count check for the main loop, *after* the check 7822 // for the epilogue loop, so that the path-length is shorter for the case 7823 // that goes directly through the vector epilogue. The longer-path length for 7824 // the main loop is compensated for, by the gain from vectorizing the larger 7825 // trip count. Note: the branch will get updated later on when we vectorize 7826 // the epilogue. 7827 EPI.MainLoopIterationCountCheck = 7828 emitIterationCountCheck(LoopScalarPreHeader, false); 7829 7830 // Generate the induction variable. 7831 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7832 7833 return LoopVectorPreHeader; 7834 } 7835 7836 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7837 LLVM_DEBUG({ 7838 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7839 << "Main Loop VF:" << EPI.MainLoopVF 7840 << ", Main Loop UF:" << EPI.MainLoopUF 7841 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7842 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7843 }); 7844 } 7845 7846 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7847 DEBUG_WITH_TYPE(VerboseDebug, { 7848 dbgs() << "intermediate fn:\n" 7849 << *OrigLoop->getHeader()->getParent() << "\n"; 7850 }); 7851 } 7852 7853 BasicBlock * 7854 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7855 bool ForEpilogue) { 7856 assert(Bypass && "Expected valid bypass basic block."); 7857 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7858 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7859 Value *Count = getTripCount(); 7860 // Reuse existing vector loop preheader for TC checks. 7861 // Note that new preheader block is generated for vector loop. 7862 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7863 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7864 7865 // Generate code to check if the loop's trip count is less than VF * UF of the 7866 // main vector loop. 
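  // For illustration (added note): when a scalar epilogue is required, at
  // least one iteration must remain for it, so the bypass below compares with
  // ULE (Count <= VF * UF) rather than ULT (Count < VF * UF).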
7867 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7868 : VF.isVector()) 7869 ? ICmpInst::ICMP_ULE 7870 : ICmpInst::ICMP_ULT; 7871 7872 Value *CheckMinIters = Builder.CreateICmp( 7873 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7874 "min.iters.check"); 7875 7876 if (!ForEpilogue) 7877 TCCheckBlock->setName("vector.main.loop.iter.check"); 7878 7879 // Create new preheader for vector loop. 7880 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7881 DT, LI, nullptr, "vector.ph"); 7882 7883 if (ForEpilogue) { 7884 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7885 DT->getNode(Bypass)->getIDom()) && 7886 "TC check is expected to dominate Bypass"); 7887 7888 LoopBypassBlocks.push_back(TCCheckBlock); 7889 7890 // Save the trip count so we don't have to regenerate it in the 7891 // vec.epilog.iter.check. This is safe to do because the trip count 7892 // generated here dominates the vector epilog iter check. 7893 EPI.TripCount = Count; 7894 } 7895 7896 BranchInst &BI = 7897 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7898 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 7899 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); 7900 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 7901 7902 introduceCheckBlockInVPlan(TCCheckBlock); 7903 return TCCheckBlock; 7904 } 7905 7906 //===--------------------------------------------------------------------===// 7907 // EpilogueVectorizerEpilogueLoop 7908 //===--------------------------------------------------------------------===// 7909 7910 /// This function is partially responsible for generating the control flow 7911 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7912 BasicBlock * 7913 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( 7914 const SCEV2ValueTy &ExpandedSCEVs) { 7915 createVectorLoopSkeleton("vec.epilog."); 7916 7917 // Now, compare the remaining count and if there aren't enough iterations to 7918 // execute the vectorized epilogue skip to the scalar part. 7919 LoopVectorPreHeader->setName("vec.epilog.ph"); 7920 BasicBlock *VecEpilogueIterationCountCheck = 7921 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, 7922 nullptr, "vec.epilog.iter.check", true); 7923 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7924 VecEpilogueIterationCountCheck); 7925 AdditionalBypassBlock = VecEpilogueIterationCountCheck; 7926 7927 // Adjust the control flow taking the state info from the main loop 7928 // vectorization into account. 
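  // For illustration (added note): the rewiring below sends the main loop's
  // iteration-count check directly to the new epilogue vector preheader, while
  // the original epilogue iteration-count check and the SCEV/memory check
  // blocks are redirected to the scalar preheader.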
7929 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7930 "expected this to be saved from the previous pass."); 7931 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7932 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7933 7934 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7935 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7936 7937 if (EPI.SCEVSafetyCheck) 7938 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7939 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7940 if (EPI.MemSafetyCheck) 7941 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7942 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7943 7944 DT->changeImmediateDominator(LoopScalarPreHeader, 7945 EPI.EpilogueIterationCountCheck); 7946 // Keep track of bypass blocks, as they feed start values to the induction and 7947 // reduction phis in the scalar loop preheader. 7948 if (EPI.SCEVSafetyCheck) 7949 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7950 if (EPI.MemSafetyCheck) 7951 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7952 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7953 7954 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 7955 // reductions which merge control-flow from the latch block and the middle 7956 // block. Update the incoming values here and move the Phi into the preheader. 7957 SmallVector<PHINode *, 4> PhisInBlock; 7958 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7959 PhisInBlock.push_back(&Phi); 7960 7961 for (PHINode *Phi : PhisInBlock) { 7962 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt()); 7963 Phi->replaceIncomingBlockWith( 7964 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7965 VecEpilogueIterationCountCheck); 7966 7967 // If the phi doesn't have an incoming value from the 7968 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 7969 // value and also those from other check blocks. This is needed for 7970 // reduction phis only. 7971 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { 7972 return EPI.EpilogueIterationCountCheck == IncB; 7973 })) 7974 continue; 7975 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7976 if (EPI.SCEVSafetyCheck) 7977 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7978 if (EPI.MemSafetyCheck) 7979 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7980 } 7981 7982 // Generate bypass values from the additional bypass block. Note that when the 7983 // vectorized epilogue is skipped due to iteration count check, then the 7984 // resume value for the induction variable comes from the trip count of the 7985 // main vector loop, passed as the second argument. 
7986 createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount); 7987 return LoopVectorPreHeader; 7988 } 7989 7990 BasicBlock * 7991 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7992 BasicBlock *Bypass, BasicBlock *Insert) { 7993 7994 assert(EPI.TripCount && 7995 "Expected trip count to have been saved in the first pass."); 7996 assert( 7997 (!isa<Instruction>(EPI.TripCount) || 7998 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7999 "saved trip count does not dominate insertion point."); 8000 Value *TC = EPI.TripCount; 8001 IRBuilder<> Builder(Insert->getTerminator()); 8002 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8003 8004 // Generate code to check if the loop's trip count is less than VF * UF of the 8005 // vector epilogue loop. 8006 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) 8007 ? ICmpInst::ICMP_ULE 8008 : ICmpInst::ICMP_ULT; 8009 8010 Value *CheckMinIters = 8011 Builder.CreateICmp(P, Count, 8012 createStepForVF(Builder, Count->getType(), 8013 EPI.EpilogueVF, EPI.EpilogueUF), 8014 "min.epilog.iters.check"); 8015 8016 BranchInst &BI = 8017 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 8018 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 8019 unsigned MainLoopStep = UF * VF.getKnownMinValue(); 8020 unsigned EpilogueLoopStep = 8021 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); 8022 // We assume the remaining `Count` is equally distributed in 8023 // [0, MainLoopStep) 8024 // So the probability for `Count < EpilogueLoopStep` should be 8025 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep 8026 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); 8027 const uint32_t Weights[] = {EstimatedSkipCount, 8028 MainLoopStep - EstimatedSkipCount}; 8029 setBranchWeights(BI, Weights, /*IsExpected=*/false); 8030 } 8031 ReplaceInstWithInst(Insert->getTerminator(), &BI); 8032 LoopBypassBlocks.push_back(Insert); 8033 8034 // A new entry block has been created for the epilogue VPlan. Hook it in, as 8035 // otherwise we would try to modify the entry to the main vector loop. 8036 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert); 8037 VPBasicBlock *OldEntry = Plan.getEntry(); 8038 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry); 8039 Plan.setEntry(NewEntry); 8040 // OldEntry is now dead and will be cleaned up when the plan gets destroyed. 
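  // Worked example for the weights computed above (added note): with a
  // main-loop step of 16 (VF = 8, UF = 2) and an epilogue step of 4
  // (VF = 4, UF = 1), EstimatedSkipCount = 4 and the weights are {4, 12},
  // i.e. roughly one in four remainders is expected to skip the vector
  // epilogue.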
8041 8042 introduceCheckBlockInVPlan(Insert); 8043 return Insert; 8044 } 8045 8046 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8047 LLVM_DEBUG({ 8048 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8049 << "Epilogue Loop VF:" << EPI.EpilogueVF 8050 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8051 }); 8052 } 8053 8054 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8055 DEBUG_WITH_TYPE(VerboseDebug, { 8056 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8057 }); 8058 } 8059 8060 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>> 8061 VPRecipeBuilder::mapToVPValues(User::op_range Operands) { 8062 std::function<VPValue *(Value *)> Fn = [this](Value *Op) { 8063 return getVPValueOrAddLiveIn(Op); 8064 }; 8065 return map_range(Operands, Fn); 8066 } 8067 8068 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) { 8069 BasicBlock *Src = SI->getParent(); 8070 assert(!OrigLoop->isLoopExiting(Src) && 8071 all_of(successors(Src), 8072 [this](BasicBlock *Succ) { 8073 return OrigLoop->getHeader() != Succ; 8074 }) && 8075 "unsupported switch either exiting loop or continuing to header"); 8076 // Create masks where the terminator in Src is a switch. We create mask for 8077 // all edges at the same time. This is more efficient, as we can create and 8078 // collect compares for all cases once. 8079 VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition()); 8080 BasicBlock *DefaultDst = SI->getDefaultDest(); 8081 MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares; 8082 for (auto &C : SI->cases()) { 8083 BasicBlock *Dst = C.getCaseSuccessor(); 8084 assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created"); 8085 // Cases whose destination is the same as default are redundant and can be 8086 // ignored - they will get there anyhow. 8087 if (Dst == DefaultDst) 8088 continue; 8089 auto &Compares = Dst2Compares[Dst]; 8090 VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue()); 8091 Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V)); 8092 } 8093 8094 // We need to handle 2 separate cases below for all entries in Dst2Compares, 8095 // which excludes destinations matching the default destination. 8096 VPValue *SrcMask = getBlockInMask(Src); 8097 VPValue *DefaultMask = nullptr; 8098 for (const auto &[Dst, Conds] : Dst2Compares) { 8099 // 1. Dst is not the default destination. Dst is reached if any of the cases 8100 // with destination == Dst are taken. Join the conditions for each case 8101 // whose destination == Dst using an OR. 8102 VPValue *Mask = Conds[0]; 8103 for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front()) 8104 Mask = Builder.createOr(Mask, V); 8105 if (SrcMask) 8106 Mask = Builder.createLogicalAnd(SrcMask, Mask); 8107 EdgeMaskCache[{Src, Dst}] = Mask; 8108 8109 // 2. Create the mask for the default destination, which is reached if none 8110 // of the cases with destination != default destination are taken. Join the 8111 // conditions for each case where the destination is != Dst using an OR and 8112 // negate it. 8113 DefaultMask = DefaultMask ? 
Builder.createOr(DefaultMask, Mask) : Mask; 8114 } 8115 8116 if (DefaultMask) { 8117 DefaultMask = Builder.createNot(DefaultMask); 8118 if (SrcMask) 8119 DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask); 8120 } 8121 EdgeMaskCache[{Src, DefaultDst}] = DefaultMask; 8122 } 8123 8124 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 8125 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8126 8127 // Look for cached value. 8128 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8129 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8130 if (ECEntryIt != EdgeMaskCache.end()) 8131 return ECEntryIt->second; 8132 8133 if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) { 8134 createSwitchEdgeMasks(SI); 8135 assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?"); 8136 return EdgeMaskCache[Edge]; 8137 } 8138 8139 VPValue *SrcMask = getBlockInMask(Src); 8140 8141 // The terminator has to be a branch inst! 8142 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8143 assert(BI && "Unexpected terminator found"); 8144 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8145 return EdgeMaskCache[Edge] = SrcMask; 8146 8147 // If source is an exiting block, we know the exit edge is dynamically dead 8148 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8149 // adding uses of an otherwise potentially dead instruction unless we are 8150 // vectorizing a loop with uncountable exits. In that case, we always 8151 // materialize the mask. 8152 if (OrigLoop->isLoopExiting(Src) && 8153 Src != Legal->getUncountableEarlyExitingBlock()) 8154 return EdgeMaskCache[Edge] = SrcMask; 8155 8156 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); 8157 assert(EdgeMask && "No Edge Mask found for condition"); 8158 8159 if (BI->getSuccessor(0) != Dst) 8160 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8161 8162 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8163 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask 8164 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' 8165 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8166 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc()); 8167 } 8168 8169 return EdgeMaskCache[Edge] = EdgeMask; 8170 } 8171 8172 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { 8173 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8174 8175 // Look for cached value. 8176 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8177 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge); 8178 assert(ECEntryIt != EdgeMaskCache.end() && 8179 "looking up mask for edge which has not been created"); 8180 return ECEntryIt->second; 8181 } 8182 8183 void VPRecipeBuilder::createHeaderMask() { 8184 BasicBlock *Header = OrigLoop->getHeader(); 8185 8186 // When not folding the tail, use nullptr to model all-true mask. 8187 if (!CM.foldTailByMasking()) { 8188 BlockMaskCache[Header] = nullptr; 8189 return; 8190 } 8191 8192 // Introduce the early-exit compare IV <= BTC to form header block mask. 8193 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8194 // constructing the desired canonical IV in the header block as its first 8195 // non-phi instructions. 
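  // Worked example (added note): for a trip count of 10 and VF = 4, BTC = 9;
  // in the final vector iteration the widened IV holds <8, 9, 10, 11>, so the
  // ICMP_ULE below produces the mask <1, 1, 0, 0> and the two excess lanes are
  // disabled.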
8196 8197 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); 8198 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8199 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); 8200 HeaderVPBB->insert(IV, NewInsertionPoint); 8201 8202 VPBuilder::InsertPointGuard Guard(Builder); 8203 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8204 VPValue *BlockMask = nullptr; 8205 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 8206 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); 8207 BlockMaskCache[Header] = BlockMask; 8208 } 8209 8210 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { 8211 // Return the cached value. 8212 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); 8213 assert(BCEntryIt != BlockMaskCache.end() && 8214 "Trying to access mask for block without one."); 8215 return BCEntryIt->second; 8216 } 8217 8218 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { 8219 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8220 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); 8221 assert(OrigLoop->getHeader() != BB && 8222 "Loop header must have cached block mask"); 8223 8224 // All-one mask is modelled as no-mask following the convention for masked 8225 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8226 VPValue *BlockMask = nullptr; 8227 // This is the block mask. We OR all unique incoming edges. 8228 for (auto *Predecessor : 8229 SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) { 8230 VPValue *EdgeMask = createEdgeMask(Predecessor, BB); 8231 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. 8232 BlockMaskCache[BB] = EdgeMask; 8233 return; 8234 } 8235 8236 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8237 BlockMask = EdgeMask; 8238 continue; 8239 } 8240 8241 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8242 } 8243 8244 BlockMaskCache[BB] = BlockMask; 8245 } 8246 8247 VPWidenMemoryRecipe * 8248 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, 8249 VFRange &Range) { 8250 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8251 "Must be called with either a load or store"); 8252 8253 auto WillWiden = [&](ElementCount VF) -> bool { 8254 LoopVectorizationCostModel::InstWidening Decision = 8255 CM.getWideningDecision(I, VF); 8256 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8257 "CM decision should be taken at this point."); 8258 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8259 return true; 8260 if (CM.isScalarAfterVectorization(I, VF) || 8261 CM.isProfitableToScalarize(I, VF)) 8262 return false; 8263 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8264 }; 8265 8266 if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range)) 8267 return nullptr; 8268 8269 VPValue *Mask = nullptr; 8270 if (Legal->isMaskRequired(I)) 8271 Mask = getBlockInMask(I->getParent()); 8272 8273 // Determine if the pointer operand of the access is either consecutive or 8274 // reverse consecutive. 8275 LoopVectorizationCostModel::InstWidening Decision = 8276 CM.getWideningDecision(I, Range.Start); 8277 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8278 bool Consecutive = 8279 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8280 8281 VPValue *Ptr = isa<LoadInst>(I) ? 
Operands[0] : Operands[1]; 8282 if (Consecutive) { 8283 auto *GEP = dyn_cast<GetElementPtrInst>( 8284 Ptr->getUnderlyingValue()->stripPointerCasts()); 8285 VPSingleDefRecipe *VectorPtr; 8286 if (Reverse) { 8287 // When folding the tail, we may compute an address that we would not compute 8288 // in the original scalar loop, and it may not be inbounds. Drop the inbounds 8289 // flag in that case. 8290 GEPNoWrapFlags Flags = 8291 (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) 8292 ? GEPNoWrapFlags::none() 8293 : GEPNoWrapFlags::inBounds(); 8294 VectorPtr = new VPReverseVectorPointerRecipe( 8295 Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc()); 8296 } else { 8297 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), 8298 GEP ? GEP->getNoWrapFlags() 8299 : GEPNoWrapFlags::none(), 8300 I->getDebugLoc()); 8301 } 8302 Builder.getInsertBlock()->appendRecipe(VectorPtr); 8303 Ptr = VectorPtr; 8304 } 8305 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8306 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, 8307 I->getDebugLoc()); 8308 8309 StoreInst *Store = cast<StoreInst>(I); 8310 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, 8311 Reverse, I->getDebugLoc()); 8312 } 8313
8314 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also 8315 /// insert a recipe to expand the step for the induction recipe. 8316 static VPWidenIntOrFpInductionRecipe * 8317 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, 8318 VPValue *Start, const InductionDescriptor &IndDesc, 8319 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) { 8320 assert(IndDesc.getStartValue() == 8321 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8322 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8323 "step must be loop invariant"); 8324 8325 VPValue *Step = 8326 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8327 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8328 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), 8329 IndDesc, TruncI, 8330 TruncI->getDebugLoc()); 8331 } 8332 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8333 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), 8334 IndDesc, Phi->getDebugLoc()); 8335 } 8336
8337 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( 8338 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) { 8339 8340 // Check if this is an integer or fp induction. If so, build the recipe that 8341 // produces its scalar and vector values. 8342 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8343 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, 8344 *PSE.getSE(), *OrigLoop); 8345 8346 // Check if this is a pointer induction. If so, build the recipe for it.
8347 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8348 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8349 *PSE.getSE()); 8350 return new VPWidenPointerInductionRecipe( 8351 Phi, Operands[0], Step, *II, 8352 LoopVectorizationPlanner::getDecisionAndClampRange( 8353 [&](ElementCount VF) { 8354 return CM.isScalarAfterVectorization(Phi, VF); 8355 }, 8356 Range), 8357 Phi->getDebugLoc()); 8358 } 8359 return nullptr; 8360 } 8361 8362 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8363 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { 8364 // Optimize the special case where the source is a constant integer 8365 // induction variable. Notice that we can only optimize the 'trunc' case 8366 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8367 // (c) other casts depend on pointer size. 8368 8369 // Determine whether \p K is a truncation based on an induction variable that 8370 // can be optimized. 8371 auto IsOptimizableIVTruncate = 8372 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8373 return [=](ElementCount VF) -> bool { 8374 return CM.isOptimizableIVTruncate(K, VF); 8375 }; 8376 }; 8377 8378 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8379 IsOptimizableIVTruncate(I), Range)) { 8380 8381 auto *Phi = cast<PHINode>(I->getOperand(0)); 8382 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8383 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue()); 8384 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), 8385 *OrigLoop); 8386 } 8387 return nullptr; 8388 } 8389 8390 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, 8391 ArrayRef<VPValue *> Operands) { 8392 unsigned NumIncoming = Phi->getNumIncomingValues(); 8393 8394 // We know that all PHIs in non-header blocks are converted into selects, so 8395 // we don't have to worry about the insertion order and we can just use the 8396 // builder. At this point we generate the predication tree. There may be 8397 // duplications since this is a simple recursive scan, but future 8398 // optimizations will clean it up. 
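// For example (illustrative only), a phi in a non-header block such as
//   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
// becomes a VPBlendRecipe over (%a, edge-mask(bb1 -> parent), %b,
// edge-mask(bb2 -> parent)), which is later lowered to a select chain,
// roughly: select(%edge.mask.bb2, %b.vec, %a.vec).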
8399 SmallVector<VPValue *, 2> OperandsWithMask; 8400 8401 for (unsigned In = 0; In < NumIncoming; In++) { 8402 OperandsWithMask.push_back(Operands[In]); 8403 VPValue *EdgeMask = 8404 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent()); 8405 if (!EdgeMask) { 8406 assert(In == 0 && "Both null and non-null edge masks found"); 8407 assert(all_equal(Operands) && 8408 "Distinct incoming values with one having a full mask"); 8409 break; 8410 } 8411 OperandsWithMask.push_back(EdgeMask); 8412 } 8413 return new VPBlendRecipe(Phi, OperandsWithMask); 8414 } 8415
8416 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8417 ArrayRef<VPValue *> Operands, 8418 VFRange &Range) { 8419 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8420 [this, CI](ElementCount VF) { 8421 return CM.isScalarWithPredication(CI, VF); 8422 }, 8423 Range); 8424 8425 if (IsPredicated) 8426 return nullptr; 8427 8428 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8429 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8430 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8431 ID == Intrinsic::pseudoprobe || 8432 ID == Intrinsic::experimental_noalias_scope_decl)) 8433 return nullptr; 8434 8435 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); 8436
8437 // Is it beneficial to perform an intrinsic call compared to a lib call? 8438 bool ShouldUseVectorIntrinsic = 8439 ID && LoopVectorizationPlanner::getDecisionAndClampRange( 8440 [&](ElementCount VF) -> bool { 8441 return CM.getCallWideningDecision(CI, VF).Kind == 8442 LoopVectorizationCostModel::CM_IntrinsicCall; 8443 }, 8444 Range); 8445 if (ShouldUseVectorIntrinsic) 8446 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), 8447 CI->getDebugLoc()); 8448 8449 Function *Variant = nullptr; 8450 std::optional<unsigned> MaskPos; 8451 // Is it better to call a vectorized version of the function than to 8452 // scalarize the call? 8453 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( 8454 [&](ElementCount VF) -> bool { 8455 // The following case may be scalarized depending on the VF. 8456 // The flag shows whether we can use a regular call for the vectorized 8457 // version of the instruction. 8458 8459 // If we've found a variant at a previous VF, then stop looking. A 8460 // vectorized variant of a function expects input in a certain shape 8461 // -- basically the number of input registers, the number of lanes 8462 // per register, and whether there's a mask required. 8463 // We store a pointer to the variant in the VPWidenCallRecipe, so 8464 // once we have an appropriate variant it's only valid for that VF. 8465 // This will force a different vplan to be generated for each VF that 8466 // finds a valid variant. 8467 if (Variant) 8468 return false; 8469 LoopVectorizationCostModel::CallWideningDecision Decision = 8470 CM.getCallWideningDecision(CI, VF); 8471 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) { 8472 Variant = Decision.Variant; 8473 MaskPos = Decision.MaskPos; 8474 return true; 8475 } 8476 8477 return false; 8478 }, 8479 Range); 8480 if (ShouldUseVectorCall) { 8481 if (MaskPos.has_value()) { 8482 // We have 2 cases that would require a mask: 8483 // 1) The block needs to be predicated, either due to a conditional 8484 // in the scalar loop or use of an active lane mask with 8485 // tail-folding, and we use the appropriate mask for the block.
8486 // 2) No mask is required for the block, but the only available 8487 // vector variant at this VF requires a mask, so we synthesize an 8488 // all-true mask. 8489 VPValue *Mask = nullptr; 8490 if (Legal->isMaskRequired(CI)) 8491 Mask = getBlockInMask(CI->getParent()); 8492 else 8493 Mask = Plan.getOrAddLiveIn( 8494 ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext()))); 8495 8496 Ops.insert(Ops.begin() + *MaskPos, Mask); 8497 } 8498 8499 Ops.push_back(Operands.back()); 8500 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc()); 8501 } 8502 8503 return nullptr; 8504 } 8505 8506 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8507 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8508 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8509 // Instruction should be widened, unless it is scalar after vectorization, 8510 // scalarization is profitable or it is predicated. 8511 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8512 return CM.isScalarAfterVectorization(I, VF) || 8513 CM.isProfitableToScalarize(I, VF) || 8514 CM.isScalarWithPredication(I, VF); 8515 }; 8516 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8517 Range); 8518 } 8519 8520 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8521 ArrayRef<VPValue *> Operands, 8522 VPBasicBlock *VPBB) { 8523 switch (I->getOpcode()) { 8524 default: 8525 return nullptr; 8526 case Instruction::SDiv: 8527 case Instruction::UDiv: 8528 case Instruction::SRem: 8529 case Instruction::URem: { 8530 // If not provably safe, use a select to form a safe divisor before widening the 8531 // div/rem operation itself. Otherwise fall through to general handling below. 8532 if (CM.isPredicatedInst(I)) { 8533 SmallVector<VPValue *> Ops(Operands); 8534 VPValue *Mask = getBlockInMask(I->getParent()); 8535 VPValue *One = 8536 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); 8537 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); 8538 Ops[1] = SafeRHS; 8539 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); 8540 } 8541 [[fallthrough]]; 8542 } 8543 case Instruction::Add: 8544 case Instruction::And: 8545 case Instruction::AShr: 8546 case Instruction::FAdd: 8547 case Instruction::FCmp: 8548 case Instruction::FDiv: 8549 case Instruction::FMul: 8550 case Instruction::FNeg: 8551 case Instruction::FRem: 8552 case Instruction::FSub: 8553 case Instruction::ICmp: 8554 case Instruction::LShr: 8555 case Instruction::Mul: 8556 case Instruction::Or: 8557 case Instruction::Select: 8558 case Instruction::Shl: 8559 case Instruction::Sub: 8560 case Instruction::Xor: 8561 case Instruction::Freeze: 8562 SmallVector<VPValue *> NewOps(Operands); 8563 if (Instruction::isBinaryOp(I->getOpcode())) { 8564 // The legacy cost model uses SCEV to check if some of the operands are 8565 // constants. To match the legacy cost model's behavior, use SCEV to try 8566 // to replace operands with constants. 8567 ScalarEvolution &SE = *PSE.getSE(); 8568 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) { 8569 Value *V = Op->getUnderlyingValue(); 8570 if (isa<Constant>(V) || !SE.isSCEVable(V->getType())) 8571 return Op; 8572 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V)); 8573 if (!C) 8574 return Op; 8575 return Plan.getOrAddLiveIn(C->getValue()); 8576 }; 8577 // For Mul, the legacy cost model checks both operands. 
8578 if (I->getOpcode() == Instruction::Mul) 8579 NewOps[0] = GetConstantViaSCEV(NewOps[0]); 8580 // For other binops, the legacy cost model only checks the second operand. 8581 NewOps[1] = GetConstantViaSCEV(NewOps[1]); 8582 } 8583 return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); 8584 }; 8585 } 8586 8587 VPHistogramRecipe * 8588 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, 8589 ArrayRef<VPValue *> Operands) { 8590 // FIXME: Support other operations. 8591 unsigned Opcode = HI->Update->getOpcode(); 8592 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) && 8593 "Histogram update operation must be an Add or Sub"); 8594 8595 SmallVector<VPValue *, 3> HGramOps; 8596 // Bucket address. 8597 HGramOps.push_back(Operands[1]); 8598 // Increment value. 8599 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1))); 8600 8601 // In case of predicated execution (due to tail-folding, or conditional 8602 // execution, or both), pass the relevant mask. 8603 if (Legal->isMaskRequired(HI->Store)) 8604 HGramOps.push_back(getBlockInMask(HI->Store->getParent())); 8605 8606 return new VPHistogramRecipe(Opcode, 8607 make_range(HGramOps.begin(), HGramOps.end()), 8608 HI->Store->getDebugLoc()); 8609 } 8610 8611 void VPRecipeBuilder::fixHeaderPhis() { 8612 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8613 for (VPHeaderPHIRecipe *R : PhisToFix) { 8614 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8615 VPRecipeBase *IncR = 8616 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8617 R->addOperand(IncR->getVPSingleValue()); 8618 } 8619 } 8620 8621 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, 8622 VFRange &Range) { 8623 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8624 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8625 Range); 8626 8627 bool IsPredicated = CM.isPredicatedInst(I); 8628 8629 // Even if the instruction is not marked as uniform, there are certain 8630 // intrinsic calls that can be effectively treated as such, so we check for 8631 // them here. Conservatively, we only do this for scalable vectors, since 8632 // for fixed-width VFs we can always fall back on full scalarization. 8633 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8634 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8635 case Intrinsic::assume: 8636 case Intrinsic::lifetime_start: 8637 case Intrinsic::lifetime_end: 8638 // For scalable vectors if one of the operands is variant then we still 8639 // want to mark as uniform, which will generate one instruction for just 8640 // the first lane of the vector. We can't scalarize the call in the same 8641 // way as for fixed-width vectors because we don't know how many lanes 8642 // there are. 8643 // 8644 // The reasons for doing it this way for scalable vectors are: 8645 // 1. For the assume intrinsic generating the instruction for the first 8646 // lane is still be better than not generating any at all. For 8647 // example, the input may be a splat across all lanes. 8648 // 2. For the lifetime start/end intrinsics the pointer operand only 8649 // does anything useful when the input comes from a stack object, 8650 // which suggests it should always be uniform. For non-stack objects 8651 // the effect is to poison the object, which still allows us to 8652 // remove the call. 
8653 IsUniform = true; 8654 break; 8655 default: 8656 break; 8657 } 8658 } 8659 VPValue *BlockInMask = nullptr; 8660 if (!IsPredicated) { 8661 // Finalize the recipe for Instr, first if it is not predicated. 8662 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8663 } else { 8664 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8665 // Instructions marked for predication are replicated and a mask operand is 8666 // added initially. Masked replicate recipes will later be placed under an 8667 // if-then construct to prevent side-effects. Generate recipes to compute 8668 // the block mask for this region. 8669 BlockInMask = getBlockInMask(I->getParent()); 8670 } 8671 8672 // Note that there is some custom logic to mark some intrinsics as uniform 8673 // manually above for scalable vectors, which this assert needs to account for 8674 // as well. 8675 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || 8676 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && 8677 "Should not predicate a uniform recipe"); 8678 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()), 8679 IsUniform, BlockInMask); 8680 return Recipe; 8681 } 8682 8683 /// Find all possible partial reductions in the loop and track all of those that 8684 /// are valid so recipes can be formed later. 8685 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { 8686 // Find all possible partial reductions. 8687 SmallVector<std::pair<PartialReductionChain, unsigned>> 8688 PartialReductionChains; 8689 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { 8690 getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range, 8691 PartialReductionChains); 8692 } 8693 8694 // A partial reduction is invalid if any of its extends are used by 8695 // something that isn't another partial reduction. This is because the 8696 // extends are intended to be lowered along with the reduction itself. 8697 8698 // Build up a set of partial reduction bin ops for efficient use checking. 8699 SmallSet<User *, 4> PartialReductionBinOps; 8700 for (const auto &[PartialRdx, _] : PartialReductionChains) 8701 PartialReductionBinOps.insert(PartialRdx.BinOp); 8702 8703 auto ExtendIsOnlyUsedByPartialReductions = 8704 [&PartialReductionBinOps](Instruction *Extend) { 8705 return all_of(Extend->users(), [&](const User *U) { 8706 return PartialReductionBinOps.contains(U); 8707 }); 8708 }; 8709 8710 // Check if each use of a chain's two extends is a partial reduction 8711 // and only add those that don't have non-partial reduction users. 8712 for (auto Pair : PartialReductionChains) { 8713 PartialReductionChain Chain = Pair.first; 8714 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && 8715 ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) 8716 ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second)); 8717 } 8718 } 8719 8720 bool VPRecipeBuilder::getScaledReductions( 8721 Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range, 8722 SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) { 8723 8724 if (!CM.TheLoop->contains(RdxExitInstr)) 8725 return false; 8726 8727 // TODO: Allow scaling reductions when predicating. The select at 8728 // the end of the loop chooses between the phi value and most recent 8729 // reduction result, both of which have different VFs to the active lane 8730 // mask when scaling. 
8731 if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent())) 8732 return false; 8733 8734 auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr); 8735 if (!Update) 8736 return false; 8737 8738 Value *Op = Update->getOperand(0); 8739 Value *PhiOp = Update->getOperand(1); 8740 if (Op == PHI) 8741 std::swap(Op, PhiOp); 8742 8743 // Try and get a scaled reduction from the first non-phi operand. 8744 // If one is found, we use the discovered reduction instruction in 8745 // place of the accumulator for costing. 8746 if (auto *OpInst = dyn_cast<Instruction>(Op)) { 8747 if (getScaledReductions(PHI, OpInst, Range, Chains)) { 8748 PHI = Chains.rbegin()->first.Reduction; 8749 8750 Op = Update->getOperand(0); 8751 PhiOp = Update->getOperand(1); 8752 if (Op == PHI) 8753 std::swap(Op, PhiOp); 8754 } 8755 } 8756 if (PhiOp != PHI) 8757 return false; 8758 8759 auto *BinOp = dyn_cast<BinaryOperator>(Op); 8760 if (!BinOp || !BinOp->hasOneUse()) 8761 return false; 8762 8763 using namespace llvm::PatternMatch; 8764 Value *A, *B; 8765 if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || 8766 !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) 8767 return false; 8768 8769 Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0)); 8770 Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1)); 8771 8772 TTI::PartialReductionExtendKind OpAExtend = 8773 TargetTransformInfo::getPartialReductionExtendKind(ExtA); 8774 TTI::PartialReductionExtendKind OpBExtend = 8775 TargetTransformInfo::getPartialReductionExtendKind(ExtB); 8776 8777 PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp); 8778 8779 unsigned TargetScaleFactor = 8780 PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( 8781 A->getType()->getPrimitiveSizeInBits()); 8782 8783 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8784 [&](ElementCount VF) { 8785 InstructionCost Cost = TTI->getPartialReductionCost( 8786 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(), 8787 VF, OpAExtend, OpBExtend, 8788 std::make_optional(BinOp->getOpcode())); 8789 return Cost.isValid(); 8790 }, 8791 Range)) { 8792 Chains.push_back(std::make_pair(Chain, TargetScaleFactor)); 8793 return true; 8794 } 8795 8796 return false; 8797 } 8798 8799 VPRecipeBase * 8800 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8801 ArrayRef<VPValue *> Operands, 8802 VFRange &Range, VPBasicBlock *VPBB) { 8803 // First, check for specific widening recipes that deal with inductions, Phi 8804 // nodes, calls and memory operations. 8805 VPRecipeBase *Recipe; 8806 if (auto *Phi = dyn_cast<PHINode>(Instr)) { 8807 if (Phi->getParent() != OrigLoop->getHeader()) 8808 return tryToBlend(Phi, Operands); 8809 8810 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8811 return Recipe; 8812 8813 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8814 assert((Legal->isReductionVariable(Phi) || 8815 Legal->isFixedOrderRecurrence(Phi)) && 8816 "can only widen reductions and fixed-order recurrences here"); 8817 VPValue *StartV = Operands[0]; 8818 if (Legal->isReductionVariable(Phi)) { 8819 const RecurrenceDescriptor &RdxDesc = 8820 Legal->getReductionVars().find(Phi)->second; 8821 assert(RdxDesc.getRecurrenceStartValue() == 8822 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8823 8824 // If the PHI is used by a partial reduction, set the scale factor. 
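// For example (illustrative only), a dot-product style update such as
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul   = mul i32 %a.ext, %b.ext
//   %red   = add i32 %acc.phi, %mul
// has an i32 accumulator fed by i8 inputs, giving a scale factor of
// 32 / 8 = 4, i.e. four partial products per accumulator lane.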
8825 unsigned ScaleFactor = 8826 getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1); 8827 PhiRecipe = new VPReductionPHIRecipe( 8828 Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), 8829 CM.useOrderedReductions(RdxDesc), ScaleFactor); 8830 } else { 8831 // TODO: Currently fixed-order recurrences are modeled as chains of 8832 // first-order recurrences. If there are no users of the intermediate 8833 // recurrences in the chain, the fixed order recurrence should be modeled 8834 // directly, enabling more efficient codegen. 8835 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8836 } 8837 8838 PhisToFix.push_back(PhiRecipe); 8839 return PhiRecipe; 8840 } 8841 8842 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( 8843 cast<TruncInst>(Instr), Operands, Range))) 8844 return Recipe; 8845 8846 // All widen recipes below deal only with VF > 1. 8847 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8848 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8849 return nullptr; 8850 8851 if (auto *CI = dyn_cast<CallInst>(Instr)) 8852 return tryToWidenCall(CI, Operands, Range); 8853 8854 if (StoreInst *SI = dyn_cast<StoreInst>(Instr)) 8855 if (auto HistInfo = Legal->getHistogramInfo(SI)) 8856 return tryToWidenHistogram(*HistInfo, Operands); 8857 8858 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8859 return tryToWidenMemory(Instr, Operands, Range); 8860 8861 if (getScalingForReduction(Instr)) 8862 return tryToCreatePartialReduction(Instr, Operands); 8863 8864 if (!shouldWiden(Instr, Range)) 8865 return nullptr; 8866 8867 if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr)) 8868 return new VPWidenGEPRecipe(GEP, 8869 make_range(Operands.begin(), Operands.end())); 8870 8871 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8872 return new VPWidenSelectRecipe( 8873 *SI, make_range(Operands.begin(), Operands.end())); 8874 } 8875 8876 if (auto *CI = dyn_cast<CastInst>(Instr)) { 8877 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), 8878 *CI); 8879 } 8880 8881 return tryToWiden(Instr, Operands, VPBB); 8882 } 8883 8884 VPRecipeBase * 8885 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, 8886 ArrayRef<VPValue *> Operands) { 8887 assert(Operands.size() == 2 && 8888 "Unexpected number of operands for partial reduction"); 8889 8890 VPValue *BinOp = Operands[0]; 8891 VPValue *Accumulator = Operands[1]; 8892 VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); 8893 if (isa<VPReductionPHIRecipe>(BinOpRecipe) || 8894 isa<VPPartialReductionRecipe>(BinOpRecipe)) 8895 std::swap(BinOp, Accumulator); 8896 8897 return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, 8898 Accumulator, Reduction); 8899 } 8900 8901 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8902 ElementCount MaxVF) { 8903 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8904 8905 auto MaxVFTimes2 = MaxVF * 2; 8906 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8907 VFRange SubRange = {VF, MaxVFTimes2}; 8908 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { 8909 // Now optimize the initial VPlan. 8910 if (!Plan->hasVF(ElementCount::getFixed(1))) 8911 VPlanTransforms::truncateToMinimalBitwidths(*Plan, 8912 CM.getMinimalBitwidths()); 8913 VPlanTransforms::optimize(*Plan); 8914 // TODO: try to put it close to addActiveLaneMask(). 
8915 // Discard the plan if it is not EVL-compatible 8916 if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength( 8917 *Plan, CM.getMaxSafeElements())) 8918 break; 8919 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 8920 VPlans.push_back(std::move(Plan)); 8921 } 8922 VF = SubRange.End; 8923 } 8924 } 8925 8926 // Add the necessary canonical IV and branch recipes required to control the 8927 // loop. 8928 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, 8929 DebugLoc DL) { 8930 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8931 auto *StartV = Plan.getOrAddLiveIn(StartIdx); 8932 8933 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8934 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8935 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8936 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8937 Header->insert(CanonicalIVPHI, Header->begin()); 8938 8939 VPBuilder Builder(TopRegion->getExitingBasicBlock()); 8940 // Add a VPInstruction to increment the scalar canonical IV by VF * UF. 8941 auto *CanonicalIVIncrement = Builder.createOverflowingOp( 8942 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, 8943 "index.next"); 8944 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8945 8946 // Add the BranchOnCount VPInstruction to the latch. 8947 Builder.createNaryOp(VPInstruction::BranchOnCount, 8948 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8949 } 8950 8951 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the 8952 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute 8953 /// the end value of the induction. 8954 static VPInstruction *addResumePhiRecipeForInduction( 8955 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, 8956 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) { 8957 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV); 8958 // Truncated wide inductions resume from the last lane of their vector value 8959 // in the last vector iteration which is handled elsewhere. 8960 if (WideIntOrFp && WideIntOrFp->getTruncInst()) 8961 return nullptr; 8962 8963 VPValue *Start = WideIV->getStartValue(); 8964 VPValue *Step = WideIV->getStepValue(); 8965 const InductionDescriptor &ID = WideIV->getInductionDescriptor(); 8966 VPValue *EndValue = VectorTC; 8967 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) { 8968 EndValue = VectorPHBuilder.createDerivedIV( 8969 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), 8970 Start, VectorTC, Step); 8971 } 8972 8973 // EndValue is derived from the vector trip count (which has the same type as 8974 // the widest induction) and thus may be wider than the induction here. 8975 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV); 8976 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) { 8977 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue, 8978 ScalarTypeOfWideIV, 8979 WideIV->getDebugLoc()); 8980 } 8981 8982 auto *ResumePhiRecipe = 8983 ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start}, 8984 WideIV->getDebugLoc(), "bc.resume.val"); 8985 return ResumePhiRecipe; 8986 } 8987 8988 /// Create resume phis in the scalar preheader for first-order recurrences, 8989 /// reductions and inductions, and update the VPIRInstructions wrapping the 8990 /// original phis in the scalar header. End values for inductions are added to 8991 /// \p IVEndValues. 
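/// For example (illustrative only), a canonical induction starting at 0 with a
/// vector trip count %n.vec gets a resume phi in the scalar preheader roughly
/// like:
///   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %ph ]
/// so the scalar remainder loop resumes where the vector loop stopped.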
8992 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, 8993 DenseMap<VPValue *, VPValue *> &IVEndValues) { 8994 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); 8995 auto *ScalarPH = Plan.getScalarPreheader(); 8996 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor()); 8997 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); 8998 VPBuilder VectorPHBuilder( 8999 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor())); 9000 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9001 VPBuilder ScalarPHBuilder(ScalarPH); 9002 VPValue *OneVPV = Plan.getOrAddLiveIn( 9003 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); 9004 for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) { 9005 auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR); 9006 auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction()); 9007 if (!ScalarPhiI) 9008 break; 9009 9010 // TODO: Extract final value from induction recipe initially, optimize to 9011 // pre-computed end value together in optimizeInductionExitUsers. 9012 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI)); 9013 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) { 9014 if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction( 9015 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, 9016 &Plan.getVectorTripCount())) { 9017 assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi && 9018 "Expected a ResumePhi"); 9019 IVEndValues[WideIVR] = ResumePhi->getOperand(0); 9020 ScalarPhiIRI->addOperand(ResumePhi); 9021 continue; 9022 } 9023 // TODO: Also handle truncated inductions here. Computing end-values 9024 // separately should be done as VPlan-to-VPlan optimization, after 9025 // legalizing all resume values to use the last lane from the loop. 9026 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() && 9027 "should only skip truncated wide inductions"); 9028 continue; 9029 } 9030 9031 // The backedge value provides the value to resume coming out of a loop, 9032 // which for FORs is a vector whose last element needs to be extracted. The 9033 // start value provides the value if the loop is bypassed. 9034 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR); 9035 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue(); 9036 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && 9037 "Cannot handle loops with uncountable early exits"); 9038 if (IsFOR) 9039 ResumeFromVectorLoop = MiddleBuilder.createNaryOp( 9040 VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {}, 9041 "vector.recur.extract"); 9042 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx"; 9043 auto *ResumePhiR = ScalarPHBuilder.createNaryOp( 9044 VPInstruction::ResumePhi, 9045 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name); 9046 ScalarPhiIRI->addOperand(ResumePhiR); 9047 } 9048 } 9049 9050 // Collect VPIRInstructions for phis in the exit blocks that are modeled 9051 // in VPlan and add the exiting VPValue as operand. 
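// For example (illustrative only), for an LCSSA phi in the exit block such as
//   %res = phi i32 [ %v, %loop.latch ]
// the VPIRInstruction wrapping %res receives the VPValue of %v as an operand;
// addUsersInExitBlocks later replaces that operand with an extract of the
// last vector lane computed in the middle block.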
9052 static SetVector<VPIRInstruction *> 9053 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, 9054 VPlan &Plan) { 9055 SetVector<VPIRInstruction *> ExitUsersToFix; 9056 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { 9057 for (VPRecipeBase &R : *ExitVPBB) { 9058 auto *ExitIRI = dyn_cast<VPIRInstruction>(&R); 9059 if (!ExitIRI) 9060 continue; 9061 auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction()); 9062 if (!ExitPhi) 9063 break; 9064 if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock()) { 9065 assert(ExitIRI->getNumOperands() == 9066 ExitVPBB->getPredecessors().size() && 9067 "early-exit must update exit values on construction"); 9068 continue; 9069 } 9070 BasicBlock *ExitingBB = OrigLoop->getLoopLatch(); 9071 Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); 9072 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); 9073 ExitIRI->addOperand(V); 9074 if (V->isLiveIn()) 9075 continue; 9076 assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() && 9077 "Only recipes defined inside a region should need fixing."); 9078 ExitUsersToFix.insert(ExitIRI); 9079 } 9080 } 9081 return ExitUsersToFix; 9082 } 9083
9084 // Add exit values to \p Plan. Extracts are added for each entry in \p 9085 // ExitUsersToFix if needed and their operands are updated. 9086 static void 9087 addUsersInExitBlocks(VPlan &Plan, 9088 const SetVector<VPIRInstruction *> &ExitUsersToFix) { 9089 if (ExitUsersToFix.empty()) 9090 return; 9091 9092 auto *MiddleVPBB = Plan.getMiddleBlock(); 9093 VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9094 9095 // Introduce extracts for exiting values and update the VPIRInstructions 9096 // modeling the corresponding LCSSA phis. 9097 for (VPIRInstruction *ExitIRI : ExitUsersToFix) { 9098 assert(ExitIRI->getNumOperands() == 1 && 9099 ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB && 9100 "exit values from early exits must be fixed when branch to " 9101 "early-exit is added"); 9102 ExitIRI->extractLastLaneOfOperand(B); 9103 } 9104 } 9105
9106 /// Handle users in the original exit block for first-order recurrences. The 9107 /// penultimate value of each recurrence is fed to its LCSSA phi users in the 9108 /// original exit block via the VPIRInstruction wrapping the 9109 /// LCSSA phi. 9110 static void addExitUsersForFirstOrderRecurrences( 9111 VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) { 9112 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); 9113 auto *ScalarPHVPBB = Plan.getScalarPreheader(); 9114 auto *MiddleVPBB = Plan.getMiddleBlock(); 9115 VPBuilder ScalarPHBuilder(ScalarPHVPBB); 9116 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9117 VPValue *TwoVPV = Plan.getOrAddLiveIn( 9118 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2)); 9119 9120 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) { 9121 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi); 9122 if (!FOR) 9123 continue; 9124 9125 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && 9126 "Cannot handle loops with uncountable early exits"); 9127 9128 // This is the second phase of vectorizing first-order recurrences, creating 9129 // extracts for users outside the loop. An overview of the transformation is 9130 // described below.
Suppose we have the following loop with some use after 9131 // the loop of the last a[i-1], 9132 // 9133 // for (int i = 0; i < n; ++i) { 9134 // t = a[i - 1]; 9135 // b[i] = a[i] - t; 9136 // } 9137 // use t; 9138 // 9139 // There is a first-order recurrence on "a". For this loop, the shorthand 9140 // scalar IR looks like: 9141 // 9142 // scalar.ph: 9143 // s.init = a[-1] 9144 // br scalar.body 9145 // 9146 // scalar.body: 9147 // i = phi [0, scalar.ph], [i+1, scalar.body] 9148 // s1 = phi [s.init, scalar.ph], [s2, scalar.body] 9149 // s2 = a[i] 9150 // b[i] = s2 - s1 9151 // br cond, scalar.body, exit.block 9152 // 9153 // exit.block: 9154 // use = lcssa.phi [s1, scalar.body] 9155 // 9156 // In this example, s1 is a recurrence because it's value depends on the 9157 // previous iteration. In the first phase of vectorization, we created a 9158 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts 9159 // for users in the scalar preheader and exit block. 9160 // 9161 // vector.ph: 9162 // v_init = vector(..., ..., ..., a[-1]) 9163 // br vector.body 9164 // 9165 // vector.body 9166 // i = phi [0, vector.ph], [i+4, vector.body] 9167 // v1 = phi [v_init, vector.ph], [v2, vector.body] 9168 // v2 = a[i, i+1, i+2, i+3] 9169 // b[i] = v2 - v1 9170 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) 9171 // b[i, i+1, i+2, i+3] = v2 - v1 9172 // br cond, vector.body, middle.block 9173 // 9174 // middle.block: 9175 // vector.recur.extract.for.phi = v2(2) 9176 // vector.recur.extract = v2(3) 9177 // br cond, scalar.ph, exit.block 9178 // 9179 // scalar.ph: 9180 // scalar.recur.init = phi [vector.recur.extract, middle.block], 9181 // [s.init, otherwise] 9182 // br scalar.body 9183 // 9184 // scalar.body: 9185 // i = phi [0, scalar.ph], [i+1, scalar.body] 9186 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body] 9187 // s2 = a[i] 9188 // b[i] = s2 - s1 9189 // br cond, scalar.body, exit.block 9190 // 9191 // exit.block: 9192 // lo = lcssa.phi [s1, scalar.body], 9193 // [vector.recur.extract.for.phi, middle.block] 9194 // 9195 // Now update VPIRInstructions modeling LCSSA phis in the exit block. 9196 // Extract the penultimate value of the recurrence and use it as operand for 9197 // the VPIRInstruction modeling the phi. 9198 for (VPIRInstruction *ExitIRI : ExitUsersToFix) { 9199 if (ExitIRI->getOperand(0) != FOR) 9200 continue; 9201 VPValue *PenultimateElement = MiddleBuilder.createNaryOp( 9202 VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, 9203 "vector.recur.extract.for.phi"); 9204 ExitIRI->setOperand(0, PenultimateElement); 9205 ExitUsersToFix.remove(ExitIRI); 9206 } 9207 } 9208 } 9209 9210 VPlanPtr 9211 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { 9212 9213 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9214 9215 // --------------------------------------------------------------------------- 9216 // Build initial VPlan: Scan the body of the loop in a topological order to 9217 // visit each basic block after having visited its predecessor basic blocks. 9218 // --------------------------------------------------------------------------- 9219 9220 // Create initial VPlan skeleton, having a basic block for the pre-header 9221 // which contains SCEV expansions that need to happen before the CFG is 9222 // modified; a basic block for the vector pre-header, followed by a region for 9223 // the vector loop, followed by the middle basic block. 
The skeleton vector 9224 // loop region contains a header and latch basic blocks. 9225 9226 bool RequiresScalarEpilogueCheck = 9227 LoopVectorizationPlanner::getDecisionAndClampRange( 9228 [this](ElementCount VF) { 9229 return !CM.requiresScalarEpilogue(VF.isVector()); 9230 }, 9231 Range); 9232 VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), 9233 PSE, RequiresScalarEpilogueCheck, 9234 CM.foldTailByMasking(), OrigLoop); 9235
9236 // Don't use getDecisionAndClampRange here, because we don't know the UF, 9237 // so it is better to be conservative here rather than to split the range 9238 // into different VPlans. 9239 // TODO: Consider using getDecisionAndClampRange here to split up VPlans. 9240 bool IVUpdateMayOverflow = false; 9241 for (ElementCount VF : Range) 9242 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); 9243 9244 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 9245 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); 9246 // Use NUW for the induction increment if we proved that it won't overflow in 9247 // the vector loop or when not folding the tail. In the latter case, we know 9248 // that the canonical induction increment will not overflow as the vector trip 9249 // count is >= increment and a multiple of the increment. 9250 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; 9251 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); 9252 9253 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, 9254 Builder); 9255
9256 // --------------------------------------------------------------------------- 9257 // Pre-construction: record ingredients whose recipes we'll need to further 9258 // process after constructing the initial VPlan. 9259 // --------------------------------------------------------------------------- 9260 9261 // For each interleave group which is relevant for this (possibly trimmed) 9262 // Range, add it to the set of groups to be later applied to the VPlan and add 9263 // placeholders for its members' Recipes which we'll be replacing with a 9264 // single VPInterleaveRecipe. 9265 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9266 auto ApplyIG = [IG, this](ElementCount VF) -> bool { 9267 bool Result = (VF.isVector() && // Query is illegal for VF == 1 9268 CM.getWideningDecision(IG->getInsertPos(), VF) == 9269 LoopVectorizationCostModel::CM_Interleave); 9270 // For scalable vectors, the only interleave factor currently supported 9271 // is 2 since we require the (de)interleave2 intrinsics instead of 9272 // shufflevectors. 9273 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && 9274 "Unsupported interleave factor for scalable vectors"); 9275 return Result; 9276 }; 9277 if (!getDecisionAndClampRange(ApplyIG, Range)) 9278 continue; 9279 InterleaveGroups.insert(IG); 9280 } 9281
9282 // --------------------------------------------------------------------------- 9283 // Construct recipes for the instructions in the loop 9284 // --------------------------------------------------------------------------- 9285 9286 // Scan the body of the loop in a topological order to visit each basic block 9287 // after having visited its predecessor basic blocks.
9288 LoopBlocksDFS DFS(OrigLoop); 9289 DFS.perform(LI); 9290 9291 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); 9292 VPBasicBlock *VPBB = HeaderVPBB; 9293 BasicBlock *HeaderBB = OrigLoop->getHeader(); 9294 bool NeedsMasks = 9295 CM.foldTailByMasking() || 9296 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) { 9297 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); 9298 return Legal->blockNeedsPredication(BB) || NeedsBlends; 9299 }); 9300 9301 RecipeBuilder.collectScaledReductions(Range); 9302 9303 auto *MiddleVPBB = Plan->getMiddleBlock(); 9304 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); 9305 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9306 // Relevant instructions from basic block BB will be grouped into VPRecipe 9307 // ingredients and fill a new VPBasicBlock. 9308 if (VPBB != HeaderVPBB) 9309 VPBB->setName(BB->getName()); 9310 Builder.setInsertPoint(VPBB); 9311 9312 if (VPBB == HeaderVPBB) 9313 RecipeBuilder.createHeaderMask(); 9314 else if (NeedsMasks) 9315 RecipeBuilder.createBlockInMask(BB); 9316 9317 // Introduce each ingredient into VPlan. 9318 // TODO: Model and preserve debug intrinsics in VPlan. 9319 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { 9320 Instruction *Instr = &I; 9321 SmallVector<VPValue *, 4> Operands; 9322 auto *Phi = dyn_cast<PHINode>(Instr); 9323 if (Phi && Phi->getParent() == HeaderBB) { 9324 Operands.push_back(Plan->getOrAddLiveIn( 9325 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9326 } else { 9327 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands()); 9328 Operands = {OpRange.begin(), OpRange.end()}; 9329 } 9330 9331 // The stores with invariant address inside the loop will be deleted, and 9332 // in the exit block, a uniform store recipe will be created for the final 9333 // invariant store of the reduction. 9334 StoreInst *SI; 9335 if ((SI = dyn_cast<StoreInst>(&I)) && 9336 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { 9337 // Only create recipe for the final invariant store of the reduction. 9338 if (!Legal->isInvariantStoreOfReduction(SI)) 9339 continue; 9340 auto *Recipe = new VPReplicateRecipe( 9341 SI, RecipeBuilder.mapToVPValues(Instr->operands()), 9342 true /* IsUniform */); 9343 Recipe->insertBefore(*MiddleVPBB, MBIP); 9344 continue; 9345 } 9346 9347 VPRecipeBase *Recipe = 9348 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB); 9349 if (!Recipe) 9350 Recipe = RecipeBuilder.handleReplication(Instr, Range); 9351 9352 RecipeBuilder.setRecipe(Instr, Recipe); 9353 if (isa<VPHeaderPHIRecipe>(Recipe)) { 9354 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In 9355 // the following cases, VPHeaderPHIRecipes may be created after non-phi 9356 // recipes and need to be moved to the phi section of HeaderVPBB: 9357 // * tail-folding (non-phi recipes computing the header mask are 9358 // introduced earlier than regular header phi recipes, and should appear 9359 // after them) 9360 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 
9361 9362 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || 9363 CM.foldTailByMasking() || isa<TruncInst>(Instr)) && 9364 "unexpected recipe needs moving"); 9365 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9366 } else 9367 VPBB->appendRecipe(Recipe); 9368 } 9369 9370 VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB); 9371 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9372 } 9373 9374 // After here, VPBB should not be used. 9375 VPBB = nullptr; 9376 9377 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 9378 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 9379 "entry block must be set to a VPRegionBlock having a non-empty entry " 9380 "VPBasicBlock"); 9381 RecipeBuilder.fixHeaderPhis(); 9382 9383 // Update wide induction increments to use the same step as the corresponding 9384 // wide induction. This enables detecting induction increments directly in 9385 // VPlan and removes redundant splats. 9386 for (const auto &[Phi, ID] : Legal->getInductionVars()) { 9387 auto *IVInc = cast<Instruction>( 9388 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 9389 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add) 9390 continue; 9391 VPWidenInductionRecipe *WideIV = 9392 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi)); 9393 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc); 9394 R->setOperand(1, WideIV->getStepValue()); 9395 } 9396 9397 if (auto *UncountableExitingBlock = 9398 Legal->getUncountableEarlyExitingBlock()) { 9399 if (!VPlanTransforms::handleUncountableEarlyExit( 9400 *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, 9401 RecipeBuilder)) { 9402 reportVectorizationFailure( 9403 "Some exit values in loop with uncountable exit not supported yet", 9404 "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop); 9405 return nullptr; 9406 } 9407 } 9408 DenseMap<VPValue *, VPValue *> IVEndValues; 9409 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); 9410 SetVector<VPIRInstruction *> ExitUsersToFix = 9411 collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan); 9412 addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); 9413 addUsersInExitBlocks(*Plan, ExitUsersToFix); 9414 9415 // --------------------------------------------------------------------------- 9416 // Transform initial VPlan: Apply previously taken decisions, in order, to 9417 // bring the VPlan to its final state. 9418 // --------------------------------------------------------------------------- 9419 9420 // Adjust the recipes for any inloop reductions. 9421 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); 9422 9423 // Interleave memory: for each Interleave Group we marked earlier as relevant 9424 // for this VPlan, replace the Recipes widening its memory instructions with a 9425 // single VPInterleaveRecipe at its insertion point. 9426 VPlanTransforms::createInterleaveGroups( 9427 *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); 9428 9429 for (ElementCount VF : Range) 9430 Plan->addVF(VF); 9431 Plan->setName("Initial VPlan"); 9432 9433 // Replace VPValues for known constant strides guaranteed by predicate scalar 9434 // evolution. 
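// For example (illustrative only), if the SCEV predicates guarantee that a
// symbolic stride %stride equals 1, the live-in VPValue for %stride (and for
// any sext/zext of it used in the loop) is replaced by the constant 1 so that
// later simplifications can fold the address computations.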
9435 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { 9436 auto *R = cast<VPRecipeBase>(&U); 9437 return R->getParent()->getParent() || 9438 R->getParent() == 9439 Plan->getVectorLoopRegion()->getSinglePredecessor(); 9440 }; 9441 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 9442 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 9443 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 9444 // Only handle constant strides for now. 9445 if (!ScevStride) 9446 continue; 9447 9448 auto *CI = Plan->getOrAddLiveIn( 9449 ConstantInt::get(Stride->getType(), ScevStride->getAPInt())); 9450 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV)) 9451 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); 9452 9453 // The versioned value may not be used in the loop directly but through a 9454 // sext/zext. Add new live-ins in those cases. 9455 for (Value *U : StrideV->users()) { 9456 if (!isa<SExtInst, ZExtInst>(U)) 9457 continue; 9458 VPValue *StrideVPV = Plan->getLiveIn(U); 9459 if (!StrideVPV) 9460 continue; 9461 unsigned BW = U->getType()->getScalarSizeInBits(); 9462 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW) 9463 : ScevStride->getAPInt().zext(BW); 9464 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C)); 9465 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); 9466 } 9467 } 9468 9469 VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) { 9470 return Legal->blockNeedsPredication(BB); 9471 }); 9472 9473 // Sink users of fixed-order recurrence past the recipe defining the previous 9474 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 9475 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) 9476 return nullptr; 9477 9478 if (useActiveLaneMask(Style)) { 9479 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once 9480 // TailFoldingStyle is visible there. 9481 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); 9482 bool WithoutRuntimeCheck = 9483 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 9484 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, 9485 WithoutRuntimeCheck); 9486 } 9487 VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues); 9488 9489 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 9490 return Plan; 9491 } 9492 9493 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9494 // Outer loop handling: They may require CFG and instruction level 9495 // transformations before even evaluating whether vectorization is profitable. 9496 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9497 // the vectorization pipeline. 9498 assert(!OrigLoop->isInnermost()); 9499 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9500 9501 // Create new empty VPlan 9502 auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE, 9503 true, false, OrigLoop); 9504 9505 // Build hierarchical CFG 9506 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9507 HCFGBuilder.buildHierarchicalCFG(); 9508 9509 for (ElementCount VF : Range) 9510 Plan->addVF(VF); 9511 9512 VPlanTransforms::VPInstructionsToVPRecipes( 9513 Plan, 9514 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9515 *PSE.getSE(), *TLI); 9516 9517 // Tail folding is not supported for outer loops, so the induction increment 9518 // is guaranteed to not wrap. 
9519 bool HasNUW = true; 9520 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, 9521 DebugLoc()); 9522 9523 // Collect mapping of IR header phis to header phi recipes, to be used in 9524 // addScalarResumePhis. 9525 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, 9526 Builder); 9527 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9528 if (isa<VPCanonicalIVPHIRecipe>(&R)) 9529 continue; 9530 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R); 9531 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR); 9532 } 9533 DenseMap<VPValue *, VPValue *> IVEndValues; 9534 // TODO: IVEndValues are not used yet in the native path, to optimize exit 9535 // values. 9536 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); 9537 9538 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 9539 return Plan; 9540 } 9541 9542 // Adjust the recipes for reductions. For in-loop reductions the chain of 9543 // instructions leading from the loop exit instr to the phi need to be converted 9544 // to reductions, with one operand being vector and the other being the scalar 9545 // reduction chain. For other reductions, a select is introduced between the phi 9546 // and users outside the vector region when folding the tail. 9547 // 9548 // A ComputeReductionResult recipe is added to the middle block, also for 9549 // in-loop reductions which compute their result in-loop, because generating 9550 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 9551 // 9552 // Adjust AnyOf reductions; replace the reduction phi for the selected value 9553 // with a boolean reduction phi node to check if the condition is true in any 9554 // iteration. The final value is selected by the final ComputeReductionResult. 9555 void LoopVectorizationPlanner::adjustRecipesForReductions( 9556 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { 9557 using namespace VPlanPatternMatch; 9558 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); 9559 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); 9560 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock(); 9561 SmallVector<VPRecipeBase *> ToDelete; 9562 9563 for (VPRecipeBase &R : Header->phis()) { 9564 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9565 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) 9566 continue; 9567 9568 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9569 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9570 assert( 9571 !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 9572 !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) && 9573 "AnyOf and FindLast reductions are not allowed for in-loop reductions"); 9574 9575 // Collect the chain of "link" recipes for the reduction starting at PhiR. 
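// For example (illustrative only), for an in-loop integer reduction
//   %sum.next = add i32 %sum.phi, %val
// the chain is simply { phi, add }; for a min/max reduction it is
// { phi, cmp, select }, with the compare visited but skipped when the
// reduction recipes are formed below.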
9576 SetVector<VPSingleDefRecipe *> Worklist; 9577 Worklist.insert(PhiR); 9578 for (unsigned I = 0; I != Worklist.size(); ++I) { 9579 VPSingleDefRecipe *Cur = Worklist[I]; 9580 for (VPUser *U : Cur->users()) { 9581 auto *UserRecipe = cast<VPSingleDefRecipe>(U); 9582 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { 9583 assert((UserRecipe->getParent() == MiddleVPBB || 9584 UserRecipe->getParent() == Plan->getScalarPreheader()) && 9585 "U must be either in the loop region, the middle block or the " 9586 "scalar preheader."); 9587 continue; 9588 } 9589 Worklist.insert(UserRecipe); 9590 } 9591 } 9592 9593 // Visit operation "Links" along the reduction chain top-down starting from 9594 // the phi until LoopExitValue. We keep track of the previous item 9595 // (PreviousLink) to tell which of the two operands of a Link will remain 9596 // scalar and which will be reduced. For minmax by select(cmp), Link will be 9597 // the select instructions. Blend recipes of in-loop reduction phi's will 9598 // get folded to their non-phi operand, as the reduction recipe handles the 9599 // condition directly. 9600 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. 9601 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { 9602 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); 9603 9604 // Index of the first operand which holds a non-mask vector operand. 9605 unsigned IndexOfFirstOperand; 9606 // Recognize a call to the llvm.fmuladd intrinsic. 9607 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9608 VPValue *VecOp; 9609 VPBasicBlock *LinkVPBB = CurrentLink->getParent(); 9610 if (IsFMulAdd) { 9611 assert( 9612 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && 9613 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9614 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || 9615 isa<VPWidenIntrinsicRecipe>(CurrentLink)) && 9616 CurrentLink->getOperand(2) == PreviousLink && 9617 "expected a call where the previous link is the added operand"); 9618 9619 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9620 // need to create an fmul recipe (multiplying the first two operands of 9621 // the fmuladd together) to use as the vector operand for the fadd 9622 // reduction. 
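// For example (illustrative only), a link such as
//   %sum.next = call float @llvm.fmuladd.f32(float %a, float %b, float %sum)
// is split into %fmul = fmul float %a, %b, and the reduction recipe created
// below then folds %fmul into the chain with an fadd.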
9623 VPInstruction *FMulRecipe = new VPInstruction( 9624 Instruction::FMul, 9625 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, 9626 CurrentLinkI->getFastMathFlags()); 9627 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); 9628 VecOp = FMulRecipe; 9629 } else { 9630 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink); 9631 if (PhiR->isInLoop() && Blend) { 9632 assert(Blend->getNumIncomingValues() == 2 && 9633 "Blend must have 2 incoming values"); 9634 if (Blend->getIncomingValue(0) == PhiR) 9635 Blend->replaceAllUsesWith(Blend->getIncomingValue(1)); 9636 else { 9637 assert(Blend->getIncomingValue(1) == PhiR && 9638 "PhiR must be an operand of the blend"); 9639 Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); 9640 } 9641 continue; 9642 } 9643 9644 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9645 if (isa<VPWidenRecipe>(CurrentLink)) { 9646 assert(isa<CmpInst>(CurrentLinkI) && 9647 "need to have the compare of the select"); 9648 continue; 9649 } 9650 assert(isa<VPWidenSelectRecipe>(CurrentLink) && 9651 "must be a select recipe"); 9652 IndexOfFirstOperand = 1; 9653 } else { 9654 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && 9655 "Expected to replace a VPWidenSC"); 9656 IndexOfFirstOperand = 0; 9657 } 9658 // Note that for non-commutable operands (cmp-selects), the semantics of 9659 // the cmp-select are captured in the recurrence kind. 9660 unsigned VecOpId = 9661 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink 9662 ? IndexOfFirstOperand + 1 9663 : IndexOfFirstOperand; 9664 VecOp = CurrentLink->getOperand(VecOpId); 9665 assert(VecOp != PreviousLink && 9666 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - 9667 (VecOpId - IndexOfFirstOperand)) == 9668 PreviousLink && 9669 "PreviousLink must be the operand other than VecOp"); 9670 } 9671 9672 BasicBlock *BB = CurrentLinkI->getParent(); 9673 VPValue *CondOp = nullptr; 9674 if (CM.blockNeedsPredicationForAnyReason(BB)) 9675 CondOp = RecipeBuilder.getBlockInMask(BB); 9676 9677 auto *RedRecipe = new VPReductionRecipe( 9678 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp, 9679 CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc()); 9680 // Append the recipe to the end of the VPBasicBlock because we need to 9681 // ensure that it comes after all of its inputs, including CondOp. 9682 // Delete CurrentLink as it will be invalid if its operand is replaced 9683 // with a reduction defined at the bottom of the block in the next link. 9684 LinkVPBB->appendRecipe(RedRecipe); 9685 CurrentLink->replaceAllUsesWith(RedRecipe); 9686 ToDelete.push_back(CurrentLink); 9687 PreviousLink = RedRecipe; 9688 } 9689 } 9690 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); 9691 Builder.setInsertPoint(&*LatchVPBB->begin()); 9692 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); 9693 for (VPRecipeBase &R : 9694 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9695 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9696 if (!PhiR) 9697 continue; 9698 9699 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9700 // If tail is folded by masking, introduce selects between the phi 9701 // and the users outside the vector region of each reduction, at the 9702 // beginning of the dedicated latch block.
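// For example, for a tail-folded (not in-loop) sum reduction the backedge value becomes select(header-mask, updated-sum, phi), so lanes disabled by the mask carry the previous partial sum forward unchanged.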
9703 auto *OrigExitingVPV = PhiR->getBackedgeValue(); 9704 auto *NewExitingVPV = PhiR->getBackedgeValue(); 9705 if (!PhiR->isInLoop() && CM.foldTailByMasking()) { 9706 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader()); 9707 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && 9708 "reduction recipe must be defined before latch"); 9709 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); 9710 std::optional<FastMathFlags> FMFs = 9711 PhiTy->isFloatingPointTy() 9712 ? std::make_optional(RdxDesc.getFastMathFlags()) 9713 : std::nullopt; 9714 NewExitingVPV = 9715 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); 9716 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { 9717 return isa<VPInstruction>(&U) && 9718 cast<VPInstruction>(&U)->getOpcode() == 9719 VPInstruction::ComputeReductionResult; 9720 }); 9721 if (CM.usePredicatedReductionSelect( 9722 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy)) 9723 PhiR->setOperand(1, NewExitingVPV); 9724 } 9725 9726 // If the vector reduction can be performed in a smaller type, we truncate 9727 // then extend the loop exit value to enable InstCombine to evaluate the 9728 // entire expression in the smaller type. 9729 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); 9730 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() && 9731 !RecurrenceDescriptor::isAnyOfRecurrenceKind( 9732 RdxDesc.getRecurrenceKind())) { 9733 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 9734 Type *RdxTy = RdxDesc.getRecurrenceType(); 9735 auto *Trunc = 9736 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); 9737 auto *Extnd = 9738 RdxDesc.isSigned() 9739 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) 9740 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); 9741 9742 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); 9743 Extnd->insertAfter(Trunc); 9744 if (PhiR->getOperand(1) == NewExitingVPV) 9745 PhiR->setOperand(1, Extnd->getVPSingleValue()); 9746 NewExitingVPV = Extnd; 9747 } 9748 9749 // We want code in the middle block to appear to execute on the location of 9750 // the scalar loop's latch terminator because: (a) it is all compiler 9751 // generated, (b) these instructions are always executed after evaluating 9752 // the latch conditional branch, and (c) other passes may add new 9753 // predecessors which terminate on this line. This is the easiest way to 9754 // ensure we don't accidentally cause an extra step back into the loop while 9755 // debugging. 9756 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); 9757 9758 // TODO: At the moment ComputeReductionResult also drives creation of the 9759 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here 9760 // even for in-loop reductions, until the reduction resume value handling is 9761 // also modeled in VPlan. 9762 auto *FinalReductionResult = new VPInstruction( 9763 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); 9764 // Update all users outside the vector region. 
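// A user is outside the vector loop region exactly when its parent VPBasicBlock has no enclosing region (Parent->getParent() is null), which is what the predicate below checks.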
9765 OrigExitingVPV->replaceUsesWithIf( 9766 FinalReductionResult, [](VPUser &User, unsigned) { 9767 auto *Parent = cast<VPRecipeBase>(&User)->getParent(); 9768 return Parent && !Parent->getParent(); 9769 }); 9770 FinalReductionResult->insertBefore(*MiddleVPBB, IP); 9771 9772 // Adjust AnyOf reductions; replace the reduction phi for the selected value 9773 // with a boolean reduction phi node to check if the condition is true in 9774 // any iteration. The final value is selected by the final 9775 // ComputeReductionResult. 9776 if (RecurrenceDescriptor::isAnyOfRecurrenceKind( 9777 RdxDesc.getRecurrenceKind())) { 9778 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) { 9779 return isa<VPWidenSelectRecipe>(U) || 9780 (isa<VPReplicateRecipe>(U) && 9781 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() == 9782 Instruction::Select); 9783 })); 9784 VPValue *Cmp = Select->getOperand(0); 9785 // If the compare is checking the reduction PHI node, adjust it to check 9786 // the start value. 9787 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) { 9788 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I) 9789 if (CmpR->getOperand(I) == PhiR) 9790 CmpR->setOperand(I, PhiR->getStartValue()); 9791 } 9792 VPBuilder::InsertPointGuard Guard(Builder); 9793 Builder.setInsertPoint(Select); 9794 9795 // If the true value of the select is the reduction phi, the new value is 9796 // selected if the negated condition is true in any iteration. 9797 if (Select->getOperand(1) == PhiR) 9798 Cmp = Builder.createNot(Cmp); 9799 VPValue *Or = Builder.createOr(PhiR, Cmp); 9800 Select->getVPSingleValue()->replaceAllUsesWith(Or); 9801 // Delete Select now that it has invalid types. 9802 ToDelete.push_back(Select); 9803 9804 // Convert the reduction phi to operate on bools. 9805 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( 9806 OrigLoop->getHeader()->getContext()))); 9807 continue; 9808 } 9809 9810 if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( 9811 RdxDesc.getRecurrenceKind())) { 9812 // Adjust the start value for FindLastIV recurrences to use the sentinel 9813 // value after generating the ResumePhi recipe, which uses the original 9814 // start value. 9815 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue())); 9816 } 9817 } 9818 9819 VPlanTransforms::clearReductionWrapFlags(*Plan); 9820 for (VPRecipeBase *R : ToDelete) 9821 R->eraseFromParent(); 9822 } 9823 9824 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9825 assert(!State.Lane && "VPDerivedIVRecipe being replicated."); 9826 9827 // Fast-math-flags propagate from the original induction instruction. 9828 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9829 if (FPBinOp) 9830 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); 9831 9832 Value *Step = State.get(getStepValue(), VPLane(0)); 9833 Value *Index = State.get(getOperand(1), VPLane(0)); 9834 Value *DerivedIV = emitTransformedIndex( 9835 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind, 9836 cast_if_present<BinaryOperator>(FPBinOp)); 9837 DerivedIV->setName(Name); 9838 // If index is the vector trip count, the concrete value will only be set in 9839 // prepareToExecute, leading to missed simplifications, e.g. if it is 0. 9840 // TODO: Remove the special case for the vector trip count once it is computed 9841 // in VPlan and can be used during VPlan simplification. 
9842 assert((DerivedIV != Index || 9843 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && 9844 "IV didn't need transforming?"); 9845 State.set(this, DerivedIV, VPLane(0)); 9846 } 9847 9848 void VPReplicateRecipe::execute(VPTransformState &State) { 9849 Instruction *UI = getUnderlyingInstr(); 9850 if (State.Lane) { // Generate a single instance. 9851 assert((State.VF.isScalar() || !isUniform()) && 9852 "uniform recipe shouldn't be predicated"); 9853 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9854 State.ILV->scalarizeInstruction(UI, this, *State.Lane, State); 9855 // Insert scalar instance packing it into a vector. 9856 if (State.VF.isVector() && shouldPack()) { 9857 // If we're constructing lane 0, initialize to start from poison. 9858 if (State.Lane->isFirstLane()) { 9859 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9860 Value *Poison = PoisonValue::get( 9861 VectorType::get(UI->getType(), State.VF)); 9862 State.set(this, Poison); 9863 } 9864 State.packScalarIntoVectorValue(this, *State.Lane); 9865 } 9866 return; 9867 } 9868 9869 if (IsUniform) { 9870 // Uniform within VL means we need to generate lane 0. 9871 State.ILV->scalarizeInstruction(UI, this, VPLane(0), State); 9872 return; 9873 } 9874 9875 // A store of a loop varying value to a uniform address only needs the last 9876 // copy of the store. 9877 if (isa<StoreInst>(UI) && 9878 vputils::isUniformAfterVectorization(getOperand(1))) { 9879 auto Lane = VPLane::getLastLaneForVF(State.VF); 9880 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State); 9881 return; 9882 } 9883 9884 // Generate scalar instances for all VF lanes. 9885 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9886 const unsigned EndLane = State.VF.getKnownMinValue(); 9887 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9888 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State); 9889 } 9890 9891 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9892 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9893 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9894 // for predication. 9895 static ScalarEpilogueLowering getScalarEpilogueLowering( 9896 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9897 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9898 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9899 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9900 // don't look at hints or options, and don't request a scalar epilogue. 9901 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9902 // LoopAccessInfo (due to code dependency and not being able to reliably get 9903 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9904 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9905 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9906 // back to the old way and vectorize with versioning when forced. See D81345.) 
9907 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9908 PGSOQueryType::IRPass) && 9909 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9910 return CM_ScalarEpilogueNotAllowedOptSize; 9911 9912 // 2) If set, obey the directives 9913 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9914 switch (PreferPredicateOverEpilogue) { 9915 case PreferPredicateTy::ScalarEpilogue: 9916 return CM_ScalarEpilogueAllowed; 9917 case PreferPredicateTy::PredicateElseScalarEpilogue: 9918 return CM_ScalarEpilogueNotNeededUsePredicate; 9919 case PreferPredicateTy::PredicateOrDontVectorize: 9920 return CM_ScalarEpilogueNotAllowedUsePredicate; 9921 }; 9922 } 9923 9924 // 3) If set, obey the hints 9925 switch (Hints.getPredicate()) { 9926 case LoopVectorizeHints::FK_Enabled: 9927 return CM_ScalarEpilogueNotNeededUsePredicate; 9928 case LoopVectorizeHints::FK_Disabled: 9929 return CM_ScalarEpilogueAllowed; 9930 }; 9931 9932 // 4) if the TTI hook indicates this is profitable, request predication. 9933 TailFoldingInfo TFI(TLI, &LVL, IAI); 9934 if (TTI->preferPredicateOverEpilogue(&TFI)) 9935 return CM_ScalarEpilogueNotNeededUsePredicate; 9936 9937 return CM_ScalarEpilogueAllowed; 9938 } 9939 9940 // Process the loop in the VPlan-native vectorization path. This path builds 9941 // VPlan upfront in the vectorization pipeline, which allows to apply 9942 // VPlan-to-VPlan transformations from the very beginning without modifying the 9943 // input LLVM IR. 9944 static bool processLoopInVPlanNativePath( 9945 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9946 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9947 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9948 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9949 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9950 LoopVectorizationRequirements &Requirements) { 9951 9952 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9953 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9954 return false; 9955 } 9956 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9957 Function *F = L->getHeader()->getParent(); 9958 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9959 9960 ScalarEpilogueLowering SEL = 9961 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 9962 9963 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9964 &Hints, IAI); 9965 // Use the planner for outer loop vectorization. 9966 // TODO: CM is not used at this point inside the planner. Turn CM into an 9967 // optional argument if we don't need it in the future. 9968 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, 9969 ORE); 9970 9971 // Get user vectorization factor. 9972 ElementCount UserVF = Hints.getWidth(); 9973 9974 CM.collectElementTypesForWidening(); 9975 9976 // Plan how to best vectorize, return the best VF and its cost. 9977 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9978 9979 // If we are stress testing VPlan builds, do not attempt to generate vector 9980 // code. Masked vector code generation support will follow soon. 9981 // Also, do not attempt to vectorize if no vector code will be produced. 
9982 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 9983 return false; 9984 9985 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 9986 9987 { 9988 bool AddBranchWeights = 9989 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 9990 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), 9991 AddBranchWeights, CM.CostKind); 9992 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 9993 VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan); 9994 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9995 << L->getHeader()->getParent()->getName() << "\"\n"); 9996 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 9997 } 9998 9999 reportVectorization(ORE, L, VF, 1); 10000 10001 // Mark the loop as already vectorized to avoid vectorizing again. 10002 Hints.setAlreadyVectorized(); 10003 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10004 return true; 10005 } 10006 10007 // Emit a remark if there are stores to floats that required a floating point 10008 // extension. If the vectorized loop was generated with floating point, there 10009 // will be a performance penalty from the conversion overhead and the change in 10010 // the vector width. 10011 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10012 SmallVector<Instruction *, 4> Worklist; 10013 for (BasicBlock *BB : L->getBlocks()) { 10014 for (Instruction &Inst : *BB) { 10015 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10016 if (S->getValueOperand()->getType()->isFloatTy()) 10017 Worklist.push_back(S); 10018 } 10019 } 10020 } 10021 10022 // Traverse the floating point stores upwards, searching for floating point 10023 // conversions. 10024 SmallPtrSet<const Instruction *, 4> Visited; 10025 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10026 while (!Worklist.empty()) { 10027 auto *I = Worklist.pop_back_val(); 10028 if (!L->contains(I)) 10029 continue; 10030 if (!Visited.insert(I).second) 10031 continue; 10032 10033 // Emit a remark if the floating point store required a floating 10034 // point conversion. 10035 // TODO: More work could be done to identify the root cause such as a 10036 // constant or a function return type and point the user to it. 10037 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10038 ORE->emit([&]() { 10039 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10040 I->getDebugLoc(), L->getHeader()) 10041 << "floating point conversion changes vector width. " 10042 << "Mixed floating point precision requires an up/down " 10043 << "cast that will negatively impact performance."; 10044 }); 10045 10046 for (Use &Op : I->operands()) 10047 if (auto *OpI = dyn_cast<Instruction>(Op)) 10048 Worklist.push_back(OpI); 10049 } 10050 } 10051 10052 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 10053 VectorizationFactor &VF, Loop *L, 10054 const TargetTransformInfo &TTI, 10055 PredicatedScalarEvolution &PSE, 10056 ScalarEpilogueLowering SEL) { 10057 InstructionCost CheckCost = Checks.getCost(); 10058 if (!CheckCost.isValid()) 10059 return false; 10060 10061 // When only interleaving, the scalar and vector costs will be equal, which in 10062 // turn would lead to a divide by 0. Fall back to the hard threshold.
10063 if (VF.Width.isScalar()) { 10064 if (CheckCost > VectorizeMemoryCheckThreshold) { 10065 LLVM_DEBUG( 10066 dbgs() 10067 << "LV: Interleaving only is not profitable due to runtime checks\n"); 10068 return false; 10069 } 10070 return true; 10071 } 10072 10073 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 10074 uint64_t ScalarC = *VF.ScalarCost.getValue(); 10075 if (ScalarC == 0) 10076 return true; 10077 10078 // First, compute the minimum iteration count required so that the vector 10079 // loop outperforms the scalar loop. 10080 // The total cost of the scalar loop is 10081 // ScalarC * TC 10082 // where 10083 // * TC is the actual trip count of the loop. 10084 // * ScalarC is the cost of a single scalar iteration. 10085 // 10086 // The total cost of the vector loop is 10087 // RtC + VecC * (TC / VF) + EpiC 10088 // where 10089 // * RtC is the cost of the generated runtime checks 10090 // * VecC is the cost of a single vector iteration. 10091 // * TC is the actual trip count of the loop 10092 // * VF is the vectorization factor 10093 // * EpiC is the cost of the generated epilogue, including the cost 10094 // of the remaining scalar operations. 10095 // 10096 // Vectorization is profitable once the total vector cost is less than the 10097 // total scalar cost: 10098 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 10099 // 10100 // Now we can compute the minimum required trip count TC as 10101 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC 10102 // 10103 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 10104 // the computation below uses integer arithmetic and the result 10105 // is rounded up, hence we get an upper estimate of the TC. 10106 unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width); 10107 uint64_t RtC = *CheckCost.getValue(); 10108 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); 10109 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); 10110 10111 // Second, compute a minimum iteration count so that the cost of the 10112 // runtime checks is only a fraction of the total scalar loop cost. This 10113 // adds a loop-dependent bound on the overhead incurred if the runtime 10114 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 10115 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 10116 // cost, compute 10117 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 10118 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC); 10119 10120 // Now pick the larger minimum. If it is not a multiple of VF and a scalar 10121 // epilogue is allowed, choose the next closest multiple of VF. This should 10122 // partly compensate for ignoring the epilogue cost. 10123 uint64_t MinTC = std::max(MinTC1, MinTC2); 10124 if (SEL == CM_ScalarEpilogueAllowed) 10125 MinTC = alignTo(MinTC, IntVF); 10126 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); 10127 10128 LLVM_DEBUG( 10129 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 10130 << VF.MinProfitableTripCount << "\n"); 10131 10132 // Skip vectorization if the expected trip count is less than the minimum 10133 // required trip count.
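// Illustrative numbers only: with ScalarC = 4, VecC = 8, IntVF = 4 and RtC = 40, MinTC1 = ceil(40 * 4 / (4 * 4 - 8)) = 20 and MinTC2 = ceil(40 * 10 / 4) = 100, so the expected trip count checked below must reach at least 100 for the runtime checks to be considered profitable.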
10134 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) { 10135 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10136 VF.MinProfitableTripCount)) { 10137 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10138 "trip count < minimum profitable VF (" 10139 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10140 << ")\n"); 10141 10142 return false; 10143 } 10144 } 10145 return true; 10146 } 10147 10148 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10149 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10150 !EnableLoopInterleaving), 10151 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10152 !EnableLoopVectorization) {} 10153 10154 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue 10155 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that 10156 /// don't have a corresponding wide induction in \p EpiPlan. 10157 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { 10158 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those 10159 // will need their resume-values computed in the main vector loop. Others 10160 // can be removed from the main VPlan. 10161 SmallPtrSet<PHINode *, 2> EpiWidenedPhis; 10162 for (VPRecipeBase &R : 10163 EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 10164 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10165 continue; 10166 EpiWidenedPhis.insert( 10167 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue())); 10168 } 10169 for (VPRecipeBase &R : make_early_inc_range( 10170 *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) { 10171 auto *VPIRInst = cast<VPIRInstruction>(&R); 10172 auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction()); 10173 if (!IRI) 10174 break; 10175 if (EpiWidenedPhis.contains(IRI)) 10176 continue; 10177 // There is no corresponding wide induction in the epilogue plan that would 10178 // need a resume value. Remove the VPIRInst wrapping the scalar header phi 10179 // together with the corresponding ResumePhi. The resume values for the 10180 // scalar loop will be created during execution of EpiPlan. 10181 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe(); 10182 VPIRInst->eraseFromParent(); 10183 ResumePhi->eraseFromParent(); 10184 } 10185 VPlanTransforms::removeDeadRecipes(MainPlan); 10186 10187 using namespace VPlanPatternMatch; 10188 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader(); 10189 VPValue *VectorTC = &MainPlan.getVectorTripCount(); 10190 // If there is a suitable resume value for the canonical induction in the 10191 // scalar (which will become vector) epilogue loop we are done. Otherwise 10192 // create it below. 10193 if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) { 10194 return match(&R, m_VPInstruction<VPInstruction::ResumePhi>( 10195 m_Specific(VectorTC), m_SpecificInt(0))); 10196 })) 10197 return; 10198 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin()); 10199 ScalarPHBuilder.createNaryOp( 10200 VPInstruction::ResumePhi, 10201 {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {}, 10202 "vec.epilog.resume.val"); 10203 } 10204 10205 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded 10206 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. 
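/// The canonical IV's start value is rewired to the resume value produced by the main vector loop, and reduction/induction header phis pick up their resume values from the incoming values of their original scalar header phis.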
10207 static void 10208 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, 10209 const SCEV2ValueTy &ExpandedSCEVs, 10210 const EpilogueLoopVectorizationInfo &EPI) { 10211 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); 10212 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10213 Header->setName("vec.epilog.vector.body"); 10214 10215 // Re-use the trip count and steps expanded for the main loop, as 10216 // skeleton creation needs it as a value that dominates both the scalar 10217 // and vector epilogue loops 10218 // TODO: This is a workaround needed for epilogue vectorization and it 10219 // should be removed once induction resume value creation is done 10220 // directly in VPlan. 10221 for (auto &R : make_early_inc_range(*Plan.getEntry())) { 10222 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R); 10223 if (!ExpandR) 10224 continue; 10225 auto *ExpandedVal = 10226 Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second); 10227 ExpandR->replaceAllUsesWith(ExpandedVal); 10228 if (Plan.getTripCount() == ExpandR) 10229 Plan.resetTripCount(ExpandedVal); 10230 ExpandR->eraseFromParent(); 10231 } 10232 10233 // Ensure that the start values for all header phi recipes are updated before 10234 // vectorizing the epilogue loop. 10235 for (VPRecipeBase &R : Header->phis()) { 10236 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) { 10237 // When vectorizing the epilogue loop, the canonical induction start 10238 // value needs to be changed from zero to the value after the main 10239 // vector loop. Find the resume value created during execution of the main 10240 // VPlan. 10241 // FIXME: Improve modeling for canonical IV start values in the epilogue 10242 // loop. 10243 BasicBlock *MainMiddle = find_singleton<BasicBlock>( 10244 predecessors(L->getLoopPreheader()), 10245 [&EPI](BasicBlock *BB, bool) -> BasicBlock * { 10246 if (BB != EPI.MainLoopIterationCountCheck && 10247 BB != EPI.EpilogueIterationCountCheck && 10248 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck) 10249 return BB; 10250 return nullptr; 10251 }); 10252 using namespace llvm::PatternMatch; 10253 Type *IdxTy = IV->getScalarType(); 10254 PHINode *EPResumeVal = find_singleton<PHINode>( 10255 L->getLoopPreheader()->phis(), 10256 [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * { 10257 if (P.getType() == IdxTy && 10258 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount && 10259 match( 10260 P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck), 10261 m_SpecificInt(0))) 10262 return &P; 10263 return nullptr; 10264 }); 10265 assert(EPResumeVal && "must have a resume value for the canonical IV"); 10266 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); 10267 assert(all_of(IV->users(), 10268 [](const VPUser *U) { 10269 return isa<VPScalarIVStepsRecipe>(U) || 10270 isa<VPScalarCastRecipe>(U) || 10271 isa<VPDerivedIVRecipe>(U) || 10272 cast<VPInstruction>(U)->getOpcode() == 10273 Instruction::Add; 10274 }) && 10275 "the canonical IV should only be used by its increment or " 10276 "ScalarIVSteps when resetting the start value"); 10277 IV->setOperand(0, VPV); 10278 continue; 10279 } 10280 10281 Value *ResumeV = nullptr; 10282 // TODO: Move setting of resume values to prepareToExecute. 
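// Reduction phis take the value incoming to the original scalar header phi from the loop preheader (the merge of the main loop's result); AnyOf reductions re-encode it as a bool by comparing against the start value, and FindLastIV reductions substitute the sentinel value when it still equals the start value. Wide inductions reuse the resume value already wired into their scalar header phi.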
10283 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10284 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr()) 10285 ->getIncomingValueForBlock(L->getLoopPreheader()); 10286 const RecurrenceDescriptor &RdxDesc = 10287 ReductionPhi->getRecurrenceDescriptor(); 10288 RecurKind RK = RdxDesc.getRecurrenceKind(); 10289 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 10290 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as 10291 // start value; compare the final value from the main vector loop 10292 // to the start value. 10293 BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent(); 10294 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt()); 10295 ResumeV = 10296 Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); 10297 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { 10298 // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment 10299 // to the resume value. The resume value is adjusted to the sentinel 10300 // value when the final value from the main vector loop equals the start 10301 // value. This ensures correctness when the start value might not be 10302 // less than the minimum value of a monotonically increasing induction 10303 // variable. 10304 IRBuilder<> Builder( 10305 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI()); 10306 Value *Cmp = 10307 Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue()); 10308 ResumeV = 10309 Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV); 10310 } 10311 } else { 10312 // Retrieve the induction resume values for wide inductions from 10313 // their original phi nodes in the scalar loop. 10314 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode(); 10315 // Hook up to the PHINode generated by a ResumePhi recipe of main 10316 // loop VPlan, which feeds the scalar loop. 10317 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader()); 10318 } 10319 assert(ResumeV && "Must have a resume value"); 10320 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV); 10321 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10322 } 10323 } 10324 10325 bool LoopVectorizePass::processLoop(Loop *L) { 10326 assert((EnableVPlanNativePath || L->isInnermost()) && 10327 "VPlan-native path is not enabled. Only process inner loops."); 10328 10329 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10330 << L->getHeader()->getParent()->getName() << "' from " 10331 << L->getLocStr() << "\n"); 10332 10333 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10334 10335 LLVM_DEBUG( 10336 dbgs() << "LV: Loop hints:" 10337 << " force=" 10338 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10339 ? "disabled" 10340 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10341 ? "enabled" 10342 : "?")) 10343 << " width=" << Hints.getWidth() 10344 << " interleave=" << Hints.getInterleave() << "\n"); 10345 10346 // Function containing loop 10347 Function *F = L->getHeader()->getParent(); 10348 10349 // Looking at the diagnostic output is the only way to determine if a loop 10350 // was vectorized (other than looking at the IR or machine code), so it 10351 // is important to generate an optimization remark for each loop. Most of 10352 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10353 // generated as OptimizationRemark and OptimizationRemarkMissed are 10354 // less verbose reporting vectorized loops and unvectorized loops that may 10355 // benefit from vectorization, respectively. 
10356 10357 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10358 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10359 return false; 10360 } 10361 10362 PredicatedScalarEvolution PSE(*SE, *L); 10363 10364 // Check if it is legal to vectorize the loop. 10365 LoopVectorizationRequirements Requirements; 10366 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 10367 &Requirements, &Hints, DB, AC, BFI, PSI); 10368 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10369 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10370 Hints.emitRemarkWithHints(); 10371 return false; 10372 } 10373 10374 if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { 10375 reportVectorizationFailure("Auto-vectorization of loops with uncountable " 10376 "early exit is not enabled", 10377 "UncountableEarlyExitLoopsDisabled", ORE, L); 10378 return false; 10379 } 10380 10381 if (LVL.hasStructVectorCall()) { 10382 reportVectorizationFailure("Auto-vectorization of calls that return struct " 10383 "types is not yet supported", 10384 "StructCallVectorizationUnsupported", ORE, L); 10385 return false; 10386 } 10387 10388 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10389 // here. They may require CFG and instruction level transformations before 10390 // even evaluating whether vectorization is profitable. Since we cannot modify 10391 // the incoming IR, we need to build VPlan upfront in the vectorization 10392 // pipeline. 10393 if (!L->isInnermost()) 10394 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10395 ORE, BFI, PSI, Hints, Requirements); 10396 10397 assert(L->isInnermost() && "Inner loop expected."); 10398 10399 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10400 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10401 10402 // If an override option has been passed in for interleaved accesses, use it. 10403 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10404 UseInterleaved = EnableInterleavedMemAccesses; 10405 10406 // Analyze interleaved memory accesses. 10407 if (UseInterleaved) 10408 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10409 10410 if (LVL.hasUncountableEarlyExit()) { 10411 BasicBlock *LoopLatch = L->getLoopLatch(); 10412 if (IAI.requiresScalarEpilogue() || 10413 any_of(LVL.getCountableExitingBlocks(), 10414 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) { 10415 reportVectorizationFailure("Auto-vectorization of early exit loops " 10416 "requiring a scalar epilogue is unsupported", 10417 "UncountableEarlyExitUnsupported", ORE, L); 10418 return false; 10419 } 10420 } 10421 10422 // Check the function attributes and profiles to find out if this function 10423 // should be optimized for size. 10424 ScalarEpilogueLowering SEL = 10425 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); 10426 10427 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10428 // count by optimizing for size, to minimize overheads. 10429 auto ExpectedTC = getSmallBestKnownTC(PSE, L); 10430 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10431 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10432 << "This loop is worth vectorizing only if no scalar " 10433 << "iteration overheads are incurred."); 10434 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10435 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10436 else { 10437 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { 10438 LLVM_DEBUG(dbgs() << "\n"); 10439 // Predicate tail-folded loops are efficient even when the loop 10440 // iteration count is low. However, setting the epilogue policy to 10441 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops 10442 // with runtime checks. It's more effective to let 10443 // `areRuntimeChecksProfitable` determine if vectorization is beneficial 10444 // for the loop. 10445 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) 10446 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10447 } else { 10448 LLVM_DEBUG(dbgs() << " But the target considers the trip count too " 10449 "small to consider vectorizing.\n"); 10450 reportVectorizationFailure( 10451 "The trip count is below the minial threshold value.", 10452 "loop trip count is too low, avoiding vectorization", 10453 "LowTripCount", ORE, L); 10454 Hints.emitRemarkWithHints(); 10455 return false; 10456 } 10457 } 10458 } 10459 10460 // Check the function attributes to see if implicit floats or vectors are 10461 // allowed. 10462 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10463 reportVectorizationFailure( 10464 "Can't vectorize when the NoImplicitFloat attribute is used", 10465 "loop not vectorized due to NoImplicitFloat attribute", 10466 "NoImplicitFloat", ORE, L); 10467 Hints.emitRemarkWithHints(); 10468 return false; 10469 } 10470 10471 // Check if the target supports potentially unsafe FP vectorization. 10472 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10473 // for the target we're vectorizing for, to make sure none of the 10474 // additional fp-math flags can help. 10475 if (Hints.isPotentiallyUnsafe() && 10476 TTI->isFPVectorizationPotentiallyUnsafe()) { 10477 reportVectorizationFailure( 10478 "Potentially unsafe FP op prevents vectorization", 10479 "loop not vectorized due to unsafe FP support.", 10480 "UnsafeFP", ORE, L); 10481 Hints.emitRemarkWithHints(); 10482 return false; 10483 } 10484 10485 bool AllowOrderedReductions; 10486 // If the flag is set, use that instead and override the TTI behaviour. 10487 if (ForceOrderedReductions.getNumOccurrences() > 0) 10488 AllowOrderedReductions = ForceOrderedReductions; 10489 else 10490 AllowOrderedReductions = TTI->enableOrderedReductions(); 10491 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10492 ORE->emit([&]() { 10493 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10494 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10495 ExactFPMathInst->getDebugLoc(), 10496 ExactFPMathInst->getParent()) 10497 << "loop not vectorized: cannot prove it is safe to reorder " 10498 "floating-point operations"; 10499 }); 10500 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10501 "reorder floating-point operations\n"); 10502 Hints.emitRemarkWithHints(); 10503 return false; 10504 } 10505 10506 // Use the cost model. 10507 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10508 F, &Hints, IAI); 10509 // Use the planner for vectorization. 10510 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, 10511 ORE); 10512 10513 // Get user vectorization factor and interleave count. 
10514 ElementCount UserVF = Hints.getWidth(); 10515 unsigned UserIC = Hints.getInterleave(); 10516 10517 // Plan how to best vectorize. 10518 LVP.plan(UserVF, UserIC); 10519 VectorizationFactor VF = LVP.computeBestVF(); 10520 unsigned IC = 1; 10521 10522 if (ORE->allowExtraAnalysis(LV_NAME)) 10523 LVP.emitInvalidCostRemarks(ORE); 10524 10525 bool AddBranchWeights = 10526 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 10527 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), 10528 AddBranchWeights, CM.CostKind); 10529 if (LVP.hasPlanWithVF(VF.Width)) { 10530 // Select the interleave count. 10531 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 10532 10533 unsigned SelectedIC = std::max(IC, UserIC); 10534 // Optimistically generate runtime checks if they are needed. Drop them if 10535 // they turn out to not be profitable. 10536 if (VF.Width.isVector() || SelectedIC > 1) 10537 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10538 10539 // Check if it is profitable to vectorize with runtime checks. 10540 bool ForceVectorization = 10541 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10542 if (!ForceVectorization && 10543 !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) { 10544 ORE->emit([&]() { 10545 return OptimizationRemarkAnalysisAliasing( 10546 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10547 L->getHeader()) 10548 << "loop not vectorized: cannot prove it is safe to reorder " 10549 "memory operations"; 10550 }); 10551 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10552 Hints.emitRemarkWithHints(); 10553 return false; 10554 } 10555 } 10556 10557 // Identify the diagnostic messages that should be produced. 10558 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10559 bool VectorizeLoop = true, InterleaveLoop = true; 10560 if (VF.Width.isScalar()) { 10561 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10562 VecDiagMsg = std::make_pair( 10563 "VectorizationNotBeneficial", 10564 "the cost-model indicates that vectorization is not beneficial"); 10565 VectorizeLoop = false; 10566 } 10567 10568 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { 10569 // Tell the user interleaving was avoided up-front, despite being explicitly 10570 // requested. 10571 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10572 "interleaving should be avoided up front\n"); 10573 IntDiagMsg = std::make_pair( 10574 "InterleavingAvoided", 10575 "Ignoring UserIC, because interleaving was avoided up front"); 10576 InterleaveLoop = false; 10577 } else if (IC == 1 && UserIC <= 1) { 10578 // Tell the user interleaving is not beneficial. 10579 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10580 IntDiagMsg = std::make_pair( 10581 "InterleavingNotBeneficial", 10582 "the cost-model indicates that interleaving is not beneficial"); 10583 InterleaveLoop = false; 10584 if (UserIC == 1) { 10585 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10586 IntDiagMsg.second += 10587 " and is explicitly disabled or interleave count is set to 1"; 10588 } 10589 } else if (IC > 1 && UserIC == 1) { 10590 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10591 LLVM_DEBUG( 10592 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10593 IntDiagMsg = std::make_pair( 10594 "InterleavingBeneficialButDisabled", 10595 "the cost-model indicates that interleaving is beneficial " 10596 "but is explicitly disabled or interleave count is set to 1"); 10597 InterleaveLoop = false; 10598 } 10599 10600 // If there is a histogram in the loop, do not just interleave without 10601 // vectorizing. The order of operations will be incorrect without the 10602 // histogram intrinsics, which are only used for recipes with VF > 1. 10603 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) { 10604 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due " 10605 << "to histogram operations.\n"); 10606 IntDiagMsg = std::make_pair( 10607 "HistogramPreventsScalarInterleaving", 10608 "Unable to interleave without vectorization due to constraints on " 10609 "the order of histogram operations"); 10610 InterleaveLoop = false; 10611 } 10612 10613 // Override IC if user provided an interleave count. 10614 IC = UserIC > 0 ? UserIC : IC; 10615 10616 // Emit diagnostic messages, if any. 10617 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10618 if (!VectorizeLoop && !InterleaveLoop) { 10619 // Do not vectorize or interleave the loop. 10620 ORE->emit([&]() { 10621 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10622 L->getStartLoc(), L->getHeader()) 10623 << VecDiagMsg.second; 10624 }); 10625 ORE->emit([&]() { 10626 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10627 L->getStartLoc(), L->getHeader()) 10628 << IntDiagMsg.second; 10629 }); 10630 return false; 10631 } 10632 10633 if (!VectorizeLoop && InterleaveLoop) { 10634 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10635 ORE->emit([&]() { 10636 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10637 L->getStartLoc(), L->getHeader()) 10638 << VecDiagMsg.second; 10639 }); 10640 } else if (VectorizeLoop && !InterleaveLoop) { 10641 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10642 << ") in " << L->getLocStr() << '\n'); 10643 ORE->emit([&]() { 10644 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10645 L->getStartLoc(), L->getHeader()) 10646 << IntDiagMsg.second; 10647 }); 10648 } else if (VectorizeLoop && InterleaveLoop) { 10649 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10650 << ") in " << L->getLocStr() << '\n'); 10651 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10652 } 10653 10654 bool DisableRuntimeUnroll = false; 10655 MDNode *OrigLoopID = L->getLoopID(); 10656 { 10657 using namespace ore; 10658 if (!VectorizeLoop) { 10659 assert(IC > 1 && "interleave count should not be 1 or 0"); 10660 // If we decided that it is not profitable to vectorize the loop, then 10661 // interleave it. 10662 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 10663 InnerLoopVectorizer Unroller( 10664 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), 10665 ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan); 10666 10667 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10668 10669 ORE->emit([&]() { 10670 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10671 L->getHeader()) 10672 << "interleaved loop (interleaved count: " 10673 << NV("InterleaveCount", IC) << ")"; 10674 }); 10675 } else { 10676 // If we decided that it is profitable to vectorize the loop, then do it.
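// When a vector epilogue VF is selected below, the chosen plan is executed twice: once for the main vector loop and once more, after rewiring resume values, for the vectorized epilogue loop; otherwise a single InnerLoopVectorizer run emits the vector loop.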
10677 10678 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 10679 // Consider vectorizing the epilogue too if it's profitable. 10680 VectorizationFactor EpilogueVF = 10681 LVP.selectEpilogueVectorizationFactor(VF.Width, IC); 10682 if (EpilogueVF.Width.isVector()) { 10683 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate()); 10684 10685 // The first pass vectorizes the main loop and creates a scalar epilogue 10686 // to be vectorized by executing the plan (potentially with a different 10687 // factor) again shortly afterwards. 10688 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); 10689 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); 10690 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, 10691 BestEpiPlan); 10692 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10693 EPI, &LVL, &CM, BFI, PSI, Checks, 10694 *BestMainPlan); 10695 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, 10696 *BestMainPlan, MainILV, DT, false); 10697 ++LoopsVectorized; 10698 10699 // Second pass vectorizes the epilogue and adjusts the control flow 10700 // edges from the first pass. 10701 EPI.MainLoopVF = EPI.EpilogueVF; 10702 EPI.MainLoopUF = EPI.EpilogueUF; 10703 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10704 ORE, EPI, &LVL, &CM, BFI, PSI, 10705 Checks, BestEpiPlan); 10706 EpilogILV.setTripCount(MainILV.getTripCount()); 10707 preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); 10708 10709 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10710 DT, true, &ExpandedSCEVs); 10711 ++LoopsEpilogueVectorized; 10712 10713 if (!MainILV.areSafetyChecksAdded()) 10714 DisableRuntimeUnroll = true; 10715 } else { 10716 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10717 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10718 PSI, Checks, BestPlan); 10719 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10720 ++LoopsVectorized; 10721 10722 // Add metadata to disable runtime unrolling a scalar loop when there 10723 // are no runtime checks about strides and memory. A scalar loop that is 10724 // rarely used is not worth unrolling. 10725 if (!LB.areSafetyChecksAdded()) 10726 DisableRuntimeUnroll = true; 10727 } 10728 // Report the vectorization decision. 10729 reportVectorization(ORE, L, VF, IC); 10730 } 10731 10732 if (ORE->allowExtraAnalysis(LV_NAME)) 10733 checkMixedPrecision(L, ORE); 10734 } 10735 10736 assert(DT->verify(DominatorTree::VerificationLevel::Fast) && 10737 "DT not preserved correctly"); 10738 10739 std::optional<MDNode *> RemainderLoopID = 10740 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10741 LLVMLoopVectorizeFollowupEpilogue}); 10742 if (RemainderLoopID) { 10743 L->setLoopID(*RemainderLoopID); 10744 } else { 10745 if (DisableRuntimeUnroll) 10746 addRuntimeUnrollDisableMetaData(L); 10747 10748 // Mark the loop as already vectorized to avoid vectorizing again. 10749 Hints.setAlreadyVectorized(); 10750 } 10751 10752 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10753 return true; 10754 } 10755 10756 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) { 10757 10758 // Don't attempt if 10759 // 1. the target claims to have no vector registers, and 10760 // 2. interleaving won't help ILP. 10761 // 10762 // The second condition is necessary because, even if the target has no 10763 // vector registers, loop vectorization may still enable scalar 10764 // interleaving. 
10765 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10766 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2) 10767 return LoopVectorizeResult(false, false); 10768 10769 bool Changed = false, CFGChanged = false; 10770 10771 // The vectorizer requires loops to be in simplified form. 10772 // Since simplification may add new inner loops, it has to run before the 10773 // legality and profitability checks. This means running the loop vectorizer 10774 // will simplify all loops, regardless of whether anything ends up being 10775 // vectorized. 10776 for (const auto &L : *LI) 10777 Changed |= CFGChanged |= 10778 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10779 10780 // Build up a worklist of inner-loops to vectorize. This is necessary as 10781 // the act of vectorizing or partially unrolling a loop creates new loops 10782 // and can invalidate iterators across the loops. 10783 SmallVector<Loop *, 8> Worklist; 10784 10785 for (Loop *L : *LI) 10786 collectSupportedLoops(*L, LI, ORE, Worklist); 10787 10788 LoopsAnalyzed += Worklist.size(); 10789 10790 // Now walk the identified inner loops. 10791 while (!Worklist.empty()) { 10792 Loop *L = Worklist.pop_back_val(); 10793 10794 // For the inner loops we actually process, form LCSSA to simplify the 10795 // transform. 10796 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10797 10798 Changed |= CFGChanged |= processLoop(L); 10799 10800 if (Changed) { 10801 LAIs->clear(); 10802 10803 #ifndef NDEBUG 10804 if (VerifySCEV) 10805 SE->verify(); 10806 #endif 10807 } 10808 } 10809 10810 // Process each loop nest in the function. 10811 return LoopVectorizeResult(Changed, CFGChanged); 10812 } 10813 10814 PreservedAnalyses LoopVectorizePass::run(Function &F, 10815 FunctionAnalysisManager &AM) { 10816 LI = &AM.getResult<LoopAnalysis>(F); 10817 // There are no loops in the function. Return before computing other 10818 // expensive analyses. 10819 if (LI->empty()) 10820 return PreservedAnalyses::all(); 10821 SE = &AM.getResult<ScalarEvolutionAnalysis>(F); 10822 TTI = &AM.getResult<TargetIRAnalysis>(F); 10823 DT = &AM.getResult<DominatorTreeAnalysis>(F); 10824 TLI = &AM.getResult<TargetLibraryAnalysis>(F); 10825 AC = &AM.getResult<AssumptionAnalysis>(F); 10826 DB = &AM.getResult<DemandedBitsAnalysis>(F); 10827 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10828 LAIs = &AM.getResult<LoopAccessAnalysis>(F); 10829 10830 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10831 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10832 BFI = nullptr; 10833 if (PSI && PSI->hasProfileSummary()) 10834 BFI = &AM.getResult<BlockFrequencyAnalysis>(F); 10835 LoopVectorizeResult Result = runImpl(F); 10836 if (!Result.MadeAnyChange) 10837 return PreservedAnalyses::all(); 10838 PreservedAnalyses PA; 10839 10840 if (isAssignmentTrackingEnabled(*F.getParent())) { 10841 for (auto &BB : F) 10842 RemoveRedundantDbgInstrs(&BB); 10843 } 10844 10845 PA.preserve<LoopAnalysis>(); 10846 PA.preserve<DominatorTreeAnalysis>(); 10847 PA.preserve<ScalarEvolutionAnalysis>(); 10848 PA.preserve<LoopAccessAnalysis>(); 10849 10850 if (Result.MadeCFGChange) { 10851 // Making CFG changes likely means a loop got vectorized. Indicate that 10852 // extra simplification passes should be run. 10853 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only 10854 // be run if runtime checks have been added.
10855 AM.getResult<ShouldRunExtraVectorPasses>(F); 10856 PA.preserve<ShouldRunExtraVectorPasses>(); 10857 } else { 10858 PA.preserveSet<CFGAnalyses>(); 10859 } 10860 return PA; 10861 } 10862 10863 void LoopVectorizePass::printPipeline( 10864 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10865 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10866 OS, MapClassName2PassName); 10867 10868 OS << '<'; 10869 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10870 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10871 OS << '>'; 10872 } 10873