1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanAnalysis.h" 61 #include "VPlanHCFGBuilder.h" 62 #include "VPlanPatternMatch.h" 63 #include "VPlanTransforms.h" 64 #include "VPlanUtils.h" 65 #include "VPlanVerifier.h" 66 #include "llvm/ADT/APInt.h" 67 #include "llvm/ADT/ArrayRef.h" 68 #include "llvm/ADT/DenseMap.h" 69 #include "llvm/ADT/DenseMapInfo.h" 70 #include "llvm/ADT/Hashing.h" 71 #include "llvm/ADT/MapVector.h" 72 #include "llvm/ADT/STLExtras.h" 73 #include "llvm/ADT/SmallPtrSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/TypeSwitch.h" 79 #include "llvm/ADT/iterator_range.h" 80 #include "llvm/Analysis/AssumptionCache.h" 81 #include "llvm/Analysis/BasicAliasAnalysis.h" 82 #include "llvm/Analysis/BlockFrequencyInfo.h" 83 #include "llvm/Analysis/CFG.h" 84 #include "llvm/Analysis/CodeMetrics.h" 85 #include "llvm/Analysis/DemandedBits.h" 86 #include "llvm/Analysis/GlobalsModRef.h" 87 #include "llvm/Analysis/LoopAccessAnalysis.h" 88 #include "llvm/Analysis/LoopAnalysisManager.h" 89 #include "llvm/Analysis/LoopInfo.h" 90 #include "llvm/Analysis/LoopIterator.h" 91 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 92 #include "llvm/Analysis/ProfileSummaryInfo.h" 93 #include "llvm/Analysis/ScalarEvolution.h" 94 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 95 #include "llvm/Analysis/TargetLibraryInfo.h" 96 #include "llvm/Analysis/TargetTransformInfo.h" 97 #include "llvm/Analysis/ValueTracking.h" 98 #include "llvm/Analysis/VectorUtils.h" 99 #include "llvm/IR/Attributes.h" 100 #include "llvm/IR/BasicBlock.h" 101 #include "llvm/IR/CFG.h" 102 #include "llvm/IR/Constant.h" 103 #include "llvm/IR/Constants.h" 104 #include "llvm/IR/DataLayout.h" 105 #include "llvm/IR/DebugInfo.h" 106 #include "llvm/IR/DebugLoc.h" 107 #include "llvm/IR/DerivedTypes.h" 108 #include "llvm/IR/DiagnosticInfo.h" 109 #include "llvm/IR/Dominators.h" 110 #include "llvm/IR/Function.h" 111 #include "llvm/IR/IRBuilder.h" 112 #include "llvm/IR/InstrTypes.h" 113 #include "llvm/IR/Instruction.h" 114 #include "llvm/IR/Instructions.h" 115 #include "llvm/IR/IntrinsicInst.h" 116 #include "llvm/IR/Intrinsics.h" 117 #include "llvm/IR/MDBuilder.h" 118 #include "llvm/IR/Metadata.h" 119 #include "llvm/IR/Module.h" 120 #include "llvm/IR/Operator.h" 121 #include "llvm/IR/PatternMatch.h" 122 #include "llvm/IR/ProfDataUtils.h" 123 #include "llvm/IR/Type.h" 124 #include "llvm/IR/Use.h" 125 #include "llvm/IR/User.h" 126 #include "llvm/IR/Value.h" 127 #include "llvm/IR/Verifier.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Debug.h" 131 #include "llvm/Support/ErrorHandling.h" 132 #include "llvm/Support/InstructionCost.h" 133 #include "llvm/Support/MathExtras.h" 134 #include "llvm/Support/NativeFormatting.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/Local.h" 139 #include "llvm/Transforms/Utils/LoopSimplify.h" 140 #include "llvm/Transforms/Utils/LoopUtils.h" 141 #include "llvm/Transforms/Utils/LoopVersioning.h" 142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 143 #include "llvm/Transforms/Utils/SizeOpts.h" 144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 145 #include <algorithm> 146 #include <cassert> 147 #include <cstdint> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold( 202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks")); 204 205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 206 // that predication is preferred, and this lists all options. I.e., the 207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 208 // and predicate the instructions accordingly. If tail-folding fails, there are 209 // different fallback strategies depending on these values: 210 namespace PreferPredicateTy { 211 enum Option { 212 ScalarEpilogue = 0, 213 PredicateElseScalarEpilogue, 214 PredicateOrDontVectorize 215 }; 216 } // namespace PreferPredicateTy 217 218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 219 "prefer-predicate-over-epilogue", 220 cl::init(PreferPredicateTy::ScalarEpilogue), 221 cl::Hidden, 222 cl::desc("Tail-folding and predication preferences over creating a scalar " 223 "epilogue loop."), 224 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 225 "scalar-epilogue", 226 "Don't tail-predicate loops, create scalar epilogue"), 227 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 228 "predicate-else-scalar-epilogue", 229 "prefer tail-folding, create scalar epilogue if tail " 230 "folding fails."), 231 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 232 "predicate-dont-vectorize", 233 "prefers tail-folding, don't attempt vectorization if " 234 "tail-folding fails."))); 235 236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( 237 "force-tail-folding-style", cl::desc("Force the tail folding style"), 238 cl::init(TailFoldingStyle::None), 239 cl::values( 240 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), 241 clEnumValN( 242 TailFoldingStyle::Data, "data", 243 "Create lane mask for data only, using active.lane.mask intrinsic"), 244 clEnumValN(TailFoldingStyle::DataWithoutLaneMask, 245 "data-without-lane-mask", 246 "Create lane mask with compare/stepvector"), 247 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", 248 "Create lane mask using active.lane.mask intrinsic, and use " 249 "it for both data and control flow"), 250 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, 251 "data-and-control-without-rt-check", 252 "Similar to data-and-control, but remove the runtime check"), 253 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", 254 "Use predicated EVL instructions for tail folding. If EVL " 255 "is unsupported, fallback to data-without-lane-mask."))); 256 257 static cl::opt<bool> MaximizeBandwidth( 258 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 259 cl::desc("Maximize bandwidth when selecting vectorization factor which " 260 "will be determined by the smallest type in loop.")); 261 262 static cl::opt<bool> EnableInterleavedMemAccesses( 263 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 264 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 265 266 /// An interleave-group may need masking if it resides in a block that needs 267 /// predication, or in order to mask away gaps. 268 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 269 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 270 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 271 272 static cl::opt<unsigned> ForceTargetNumScalarRegs( 273 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 274 cl::desc("A flag that overrides the target's number of scalar registers.")); 275 276 static cl::opt<unsigned> ForceTargetNumVectorRegs( 277 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 278 cl::desc("A flag that overrides the target's number of vector registers.")); 279 280 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 281 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 282 cl::desc("A flag that overrides the target's max interleave factor for " 283 "scalar loops.")); 284 285 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 286 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 287 cl::desc("A flag that overrides the target's max interleave factor for " 288 "vectorized loops.")); 289 290 cl::opt<unsigned> ForceTargetInstructionCost( 291 "force-target-instruction-cost", cl::init(0), cl::Hidden, 292 cl::desc("A flag that overrides the target's expected cost for " 293 "an instruction to a single constant value. Mostly " 294 "useful for getting consistent testing.")); 295 296 static cl::opt<bool> ForceTargetSupportsScalableVectors( 297 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 298 cl::desc( 299 "Pretend that scalable vectors are supported, even if the target does " 300 "not support them. This flag should only be used for testing.")); 301 302 static cl::opt<unsigned> SmallLoopCost( 303 "small-loop-cost", cl::init(20), cl::Hidden, 304 cl::desc( 305 "The cost of a loop that is considered 'small' by the interleaver.")); 306 307 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 308 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 309 cl::desc("Enable the use of the block frequency analysis to access PGO " 310 "heuristics minimizing code growth in cold regions and being more " 311 "aggressive in hot regions.")); 312 313 // Runtime interleave loops for load/store throughput. 314 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 315 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 316 cl::desc( 317 "Enable runtime interleaving until load/store ports are saturated")); 318 319 /// The number of stores in a loop that are allowed to need predication. 320 static cl::opt<unsigned> NumberOfStoresToPredicate( 321 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 322 cl::desc("Max number of stores to be predicated behind an if.")); 323 324 static cl::opt<bool> EnableIndVarRegisterHeur( 325 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 326 cl::desc("Count the induction variable only once when interleaving")); 327 328 static cl::opt<bool> EnableCondStoresVectorization( 329 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 330 cl::desc("Enable if predication of stores during vectorization.")); 331 332 static cl::opt<unsigned> MaxNestedScalarReductionIC( 333 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 334 cl::desc("The maximum interleave count to use when interleaving a scalar " 335 "reduction in a nested loop.")); 336 337 static cl::opt<bool> 338 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 339 cl::Hidden, 340 cl::desc("Prefer in-loop vector reductions, " 341 "overriding the targets preference.")); 342 343 static cl::opt<bool> ForceOrderedReductions( 344 "force-ordered-reductions", cl::init(false), cl::Hidden, 345 cl::desc("Enable the vectorisation of loops with in-order (strict) " 346 "FP reductions")); 347 348 static cl::opt<bool> PreferPredicatedReductionSelect( 349 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 350 cl::desc( 351 "Prefer predicating a reduction operation over an after loop select.")); 352 353 namespace llvm { 354 cl::opt<bool> EnableVPlanNativePath( 355 "enable-vplan-native-path", cl::Hidden, 356 cl::desc("Enable VPlan-native vectorization path with " 357 "support for outer loop vectorization.")); 358 359 cl::opt<bool> 360 VerifyEachVPlan("vplan-verify-each", 361 #ifdef EXPENSIVE_CHECKS 362 cl::init(true), 363 #else 364 cl::init(false), 365 #endif 366 cl::Hidden, 367 cl::desc("Verfiy VPlans after VPlan transforms.")); 368 } // namespace llvm 369 370 // This flag enables the stress testing of the VPlan H-CFG construction in the 371 // VPlan-native vectorization path. It must be used in conjuction with 372 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 373 // verification of the H-CFGs built. 374 static cl::opt<bool> VPlanBuildStressTest( 375 "vplan-build-stress-test", cl::init(false), cl::Hidden, 376 cl::desc( 377 "Build VPlan for every supported loop nest in the function and bail " 378 "out right after the build (stress test the VPlan H-CFG construction " 379 "in the VPlan-native vectorization path).")); 380 381 cl::opt<bool> llvm::EnableLoopInterleaving( 382 "interleave-loops", cl::init(true), cl::Hidden, 383 cl::desc("Enable loop interleaving in Loop vectorization passes")); 384 cl::opt<bool> llvm::EnableLoopVectorization( 385 "vectorize-loops", cl::init(true), cl::Hidden, 386 cl::desc("Run the Loop vectorization passes")); 387 388 static cl::opt<cl::boolOrDefault> ForceSafeDivisor( 389 "force-widen-divrem-via-safe-divisor", cl::Hidden, 390 cl::desc( 391 "Override cost based safe divisor widening for div/rem instructions")); 392 393 static cl::opt<bool> UseWiderVFIfCallVariantsPresent( 394 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), 395 cl::Hidden, 396 cl::desc("Try wider VFs if they enable the use of vector variants")); 397 398 static cl::opt<bool> EnableEarlyExitVectorization( 399 "enable-early-exit-vectorization", cl::init(false), cl::Hidden, 400 cl::desc( 401 "Enable vectorization of early exit loops with uncountable exits.")); 402 403 // Likelyhood of bypassing the vectorized loop because assumptions about SCEV 404 // variables not overflowing do not hold. See `emitSCEVChecks`. 405 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; 406 // Likelyhood of bypassing the vectorized loop because pointers overlap. See 407 // `emitMemRuntimeChecks`. 408 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; 409 // Likelyhood of bypassing the vectorized loop because there are zero trips left 410 // after prolog. See `emitIterationCountCheck`. 411 static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; 412 413 /// A helper function that returns true if the given type is irregular. The 414 /// type is irregular if its allocated size doesn't equal the store size of an 415 /// element of the corresponding vector type. 416 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 417 // Determine if an array of N elements of type Ty is "bitcast compatible" 418 // with a <N x Ty> vector. 419 // This is only true if there is no padding between the array elements. 420 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 421 } 422 423 /// Returns "best known" trip count for the specified loop \p L as defined by 424 /// the following procedure: 425 /// 1) Returns exact trip count if it is known. 426 /// 2) Returns expected trip count according to profile data if any. 427 /// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax. 428 /// 4) Returns std::nullopt if all of the above failed. 429 static std::optional<unsigned> 430 getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, 431 bool CanUseConstantMax = true) { 432 // Check if exact trip count is known. 433 if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L)) 434 return ExpectedTC; 435 436 // Check if there is an expected trip count available from profile data. 437 if (LoopVectorizeWithBlockFrequency) 438 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 439 return *EstimatedTC; 440 441 if (!CanUseConstantMax) 442 return std::nullopt; 443 444 // Check if upper bound estimate is known. 445 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount()) 446 return ExpectedTC; 447 448 return std::nullopt; 449 } 450 451 namespace { 452 // Forward declare GeneratedRTChecks. 453 class GeneratedRTChecks; 454 455 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; 456 } // namespace 457 458 namespace llvm { 459 460 AnalysisKey ShouldRunExtraVectorPasses::Key; 461 462 /// InnerLoopVectorizer vectorizes loops which contain only one basic 463 /// block to a specified vectorization factor (VF). 464 /// This class performs the widening of scalars into vectors, or multiple 465 /// scalars. This class also implements the following features: 466 /// * It inserts an epilogue loop for handling loops that don't have iteration 467 /// counts that are known to be a multiple of the vectorization factor. 468 /// * It handles the code generation for reduction variables. 469 /// * Scalarization (implementation using scalars) of un-vectorizable 470 /// instructions. 471 /// InnerLoopVectorizer does not perform any vectorization-legality 472 /// checks, and relies on the caller to check for the different legality 473 /// aspects. The InnerLoopVectorizer relies on the 474 /// LoopVectorizationLegality class to provide information about the induction 475 /// and reduction variables that were found to a given vectorization factor. 476 class InnerLoopVectorizer { 477 public: 478 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 479 LoopInfo *LI, DominatorTree *DT, 480 const TargetLibraryInfo *TLI, 481 const TargetTransformInfo *TTI, AssumptionCache *AC, 482 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 483 ElementCount MinProfitableTripCount, 484 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 485 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 486 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, 487 VPlan &Plan) 488 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 489 AC(AC), ORE(ORE), VF(VecWidth), 490 MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), 491 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 492 PSI(PSI), RTChecks(RTChecks), Plan(Plan), 493 VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) { 494 // Query this against the original loop and save it here because the profile 495 // of the original loop header may change as the transformation happens. 496 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 497 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 498 } 499 500 virtual ~InnerLoopVectorizer() = default; 501 502 /// Create a new empty loop that will contain vectorized instructions later 503 /// on, while the old loop will be used as the scalar remainder. Control flow 504 /// is generated around the vectorized (and scalar epilogue) loops consisting 505 /// of various checks and bypasses. Return the pre-header block of the new 506 /// loop. In the case of epilogue vectorization, this function is overriden to 507 /// handle the more complex control flow around the loops. \p ExpandedSCEVs is 508 /// used to look up SCEV expansions for expressions needed during skeleton 509 /// creation. 510 virtual BasicBlock * 511 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); 512 513 /// Fix the vectorized code, taking care of header phi's, and more. 514 void fixVectorizedLoop(VPTransformState &State); 515 516 // Return true if any runtime check is added. 517 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 518 519 /// A helper function to scalarize a single Instruction in the innermost loop. 520 /// Generates a sequence of scalar instances for each lane between \p MinLane 521 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 522 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 523 /// Instr's operands. 524 void scalarizeInstruction(const Instruction *Instr, 525 VPReplicateRecipe *RepRecipe, const VPLane &Lane, 526 VPTransformState &State); 527 528 /// Fix the non-induction PHIs in \p Plan. 529 void fixNonInductionPHIs(VPTransformState &State); 530 531 /// Returns the original loop trip count. 532 Value *getTripCount() const { return TripCount; } 533 534 /// Used to set the trip count after ILV's construction and after the 535 /// preheader block has been executed. Note that this always holds the trip 536 /// count of the original loop for both main loop and epilogue vectorization. 537 void setTripCount(Value *TC) { TripCount = TC; } 538 539 // Retrieve the additional bypass value associated with an original 540 /// induction header phi. 541 Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const { 542 return Induction2AdditionalBypassValue.at(OrigPhi); 543 } 544 545 /// Return the additional bypass block which targets the scalar loop by 546 /// skipping the epilogue loop after completing the main loop. 547 BasicBlock *getAdditionalBypassBlock() const { 548 assert(AdditionalBypassBlock && 549 "Trying to access AdditionalBypassBlock but it has not been set"); 550 return AdditionalBypassBlock; 551 } 552 553 protected: 554 friend class LoopVectorizationPlanner; 555 556 /// Iteratively sink the scalarized operands of a predicated instruction into 557 /// the block that was created for it. 558 void sinkScalarOperands(Instruction *PredInst); 559 560 /// Returns (and creates if needed) the trip count of the widened loop. 561 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); 562 563 /// Emit a bypass check to see if the vector trip count is zero, including if 564 /// it overflows. 565 void emitIterationCountCheck(BasicBlock *Bypass); 566 567 /// Emit a bypass check to see if all of the SCEV assumptions we've 568 /// had to make are correct. Returns the block containing the checks or 569 /// nullptr if no checks have been added. 570 BasicBlock *emitSCEVChecks(BasicBlock *Bypass); 571 572 /// Emit bypass checks to check any memory assumptions we may have made. 573 /// Returns the block containing the checks or nullptr if no checks have been 574 /// added. 575 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); 576 577 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 578 /// vector loop preheader, middle block and scalar preheader. 579 void createVectorLoopSkeleton(StringRef Prefix); 580 581 /// Create and record the values for induction variables to resume coming from 582 /// the additional bypass block. 583 void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs, 584 Value *MainVectorTripCount); 585 586 /// Allow subclasses to override and print debug traces before/after vplan 587 /// execution, when trace information is requested. 588 virtual void printDebugTracesAtStart() {} 589 virtual void printDebugTracesAtEnd() {} 590 591 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the 592 /// vector preheader and its predecessor, also connecting the new block to the 593 /// scalar preheader. 594 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); 595 596 /// The original loop. 597 Loop *OrigLoop; 598 599 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 600 /// dynamic knowledge to simplify SCEV expressions and converts them to a 601 /// more usable form. 602 PredicatedScalarEvolution &PSE; 603 604 /// Loop Info. 605 LoopInfo *LI; 606 607 /// Dominator Tree. 608 DominatorTree *DT; 609 610 /// Target Library Info. 611 const TargetLibraryInfo *TLI; 612 613 /// Target Transform Info. 614 const TargetTransformInfo *TTI; 615 616 /// Assumption Cache. 617 AssumptionCache *AC; 618 619 /// Interface to emit optimization remarks. 620 OptimizationRemarkEmitter *ORE; 621 622 /// The vectorization SIMD factor to use. Each vector will have this many 623 /// vector elements. 624 ElementCount VF; 625 626 ElementCount MinProfitableTripCount; 627 628 /// The vectorization unroll factor to use. Each scalar is vectorized to this 629 /// many different vector instructions. 630 unsigned UF; 631 632 /// The builder that we use 633 IRBuilder<> Builder; 634 635 // --- Vectorization state --- 636 637 /// The vector-loop preheader. 638 BasicBlock *LoopVectorPreHeader; 639 640 /// The scalar-loop preheader. 641 BasicBlock *LoopScalarPreHeader; 642 643 /// Middle Block between the vector and the scalar. 644 BasicBlock *LoopMiddleBlock; 645 646 /// A list of all bypass blocks. The first block is the entry of the loop. 647 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 648 649 /// Store instructions that were predicated. 650 SmallVector<Instruction *, 4> PredicatedInstructions; 651 652 /// Trip count of the original loop. 653 Value *TripCount = nullptr; 654 655 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 656 Value *VectorTripCount = nullptr; 657 658 /// The legality analysis. 659 LoopVectorizationLegality *Legal; 660 661 /// The profitablity analysis. 662 LoopVectorizationCostModel *Cost; 663 664 // Record whether runtime checks are added. 665 bool AddedSafetyChecks = false; 666 667 /// BFI and PSI are used to check for profile guided size optimizations. 668 BlockFrequencyInfo *BFI; 669 ProfileSummaryInfo *PSI; 670 671 // Whether this loop should be optimized for size based on profile guided size 672 // optimizatios. 673 bool OptForSizeBasedOnProfile; 674 675 /// Structure to hold information about generated runtime checks, responsible 676 /// for cleaning the checks, if vectorization turns out unprofitable. 677 GeneratedRTChecks &RTChecks; 678 679 /// Mapping of induction phis to their additional bypass values. They 680 /// need to be added as operands to phi nodes in the scalar loop preheader 681 /// after the epilogue skeleton has been created. 682 DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue; 683 684 /// The additional bypass block which conditionally skips over the epilogue 685 /// loop after executing the main loop. Needed to resume inductions and 686 /// reductions during epilogue vectorization. 687 BasicBlock *AdditionalBypassBlock = nullptr; 688 689 VPlan &Plan; 690 691 /// The vector preheader block of \p Plan, used as target for check blocks 692 /// introduced during skeleton creation. 693 VPBlockBase *VectorPHVPB; 694 }; 695 696 /// Encapsulate information regarding vectorization of a loop and its epilogue. 697 /// This information is meant to be updated and used across two stages of 698 /// epilogue vectorization. 699 struct EpilogueLoopVectorizationInfo { 700 ElementCount MainLoopVF = ElementCount::getFixed(0); 701 unsigned MainLoopUF = 0; 702 ElementCount EpilogueVF = ElementCount::getFixed(0); 703 unsigned EpilogueUF = 0; 704 BasicBlock *MainLoopIterationCountCheck = nullptr; 705 BasicBlock *EpilogueIterationCountCheck = nullptr; 706 BasicBlock *SCEVSafetyCheck = nullptr; 707 BasicBlock *MemSafetyCheck = nullptr; 708 Value *TripCount = nullptr; 709 Value *VectorTripCount = nullptr; 710 VPlan &EpiloguePlan; 711 712 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, 713 ElementCount EVF, unsigned EUF, 714 VPlan &EpiloguePlan) 715 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF), 716 EpiloguePlan(EpiloguePlan) { 717 assert(EUF == 1 && 718 "A high UF for the epilogue loop is likely not beneficial."); 719 } 720 }; 721 722 /// An extension of the inner loop vectorizer that creates a skeleton for a 723 /// vectorized loop that has its epilogue (residual) also vectorized. 724 /// The idea is to run the vplan on a given loop twice, firstly to setup the 725 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 726 /// from the first step and vectorize the epilogue. This is achieved by 727 /// deriving two concrete strategy classes from this base class and invoking 728 /// them in succession from the loop vectorizer planner. 729 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 730 public: 731 InnerLoopAndEpilogueVectorizer( 732 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 733 DominatorTree *DT, const TargetLibraryInfo *TLI, 734 const TargetTransformInfo *TTI, AssumptionCache *AC, 735 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 736 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 737 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 738 GeneratedRTChecks &Checks, VPlan &Plan) 739 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 740 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, 741 CM, BFI, PSI, Checks, Plan), 742 EPI(EPI) {} 743 744 // Override this function to handle the more complex control flow around the 745 // three loops. 746 BasicBlock * 747 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final { 748 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); 749 } 750 751 /// The interface for creating a vectorized skeleton using one of two 752 /// different strategies, each corresponding to one execution of the vplan 753 /// as described above. 754 virtual BasicBlock * 755 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0; 756 757 /// Holds and updates state information required to vectorize the main loop 758 /// and its epilogue in two separate passes. This setup helps us avoid 759 /// regenerating and recomputing runtime safety checks. It also helps us to 760 /// shorten the iteration-count-check path length for the cases where the 761 /// iteration count of the loop is so small that the main vector loop is 762 /// completely skipped. 763 EpilogueLoopVectorizationInfo &EPI; 764 }; 765 766 /// A specialized derived class of inner loop vectorizer that performs 767 /// vectorization of *main* loops in the process of vectorizing loops and their 768 /// epilogues. 769 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 770 public: 771 EpilogueVectorizerMainLoop( 772 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 773 DominatorTree *DT, const TargetLibraryInfo *TLI, 774 const TargetTransformInfo *TTI, AssumptionCache *AC, 775 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 776 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 777 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 778 GeneratedRTChecks &Check, VPlan &Plan) 779 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 780 EPI, LVL, CM, BFI, PSI, Check, Plan) {} 781 /// Implements the interface for creating a vectorized skeleton using the 782 /// *main loop* strategy (ie the first pass of vplan execution). 783 BasicBlock * 784 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; 785 786 protected: 787 /// Emits an iteration count bypass check once for the main loop (when \p 788 /// ForEpilogue is false) and once for the epilogue loop (when \p 789 /// ForEpilogue is true). 790 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); 791 void printDebugTracesAtStart() override; 792 void printDebugTracesAtEnd() override; 793 }; 794 795 // A specialized derived class of inner loop vectorizer that performs 796 // vectorization of *epilogue* loops in the process of vectorizing loops and 797 // their epilogues. 798 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 799 public: 800 EpilogueVectorizerEpilogueLoop( 801 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 802 DominatorTree *DT, const TargetLibraryInfo *TLI, 803 const TargetTransformInfo *TTI, AssumptionCache *AC, 804 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 805 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 806 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 807 GeneratedRTChecks &Checks, VPlan &Plan) 808 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 809 EPI, LVL, CM, BFI, PSI, Checks, Plan) { 810 TripCount = EPI.TripCount; 811 } 812 /// Implements the interface for creating a vectorized skeleton using the 813 /// *epilogue loop* strategy (ie the second pass of vplan execution). 814 BasicBlock * 815 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; 816 817 protected: 818 /// Emits an iteration count bypass check after the main vector loop has 819 /// finished to see if there are any iterations left to execute by either 820 /// the vector epilogue or the scalar epilogue. 821 BasicBlock *emitMinimumVectorEpilogueIterCountCheck( 822 BasicBlock *Bypass, 823 BasicBlock *Insert); 824 void printDebugTracesAtStart() override; 825 void printDebugTracesAtEnd() override; 826 }; 827 } // end namespace llvm 828 829 /// Look for a meaningful debug location on the instruction or its operands. 830 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { 831 if (!I) 832 return DebugLoc(); 833 834 DebugLoc Empty; 835 if (I->getDebugLoc() != Empty) 836 return I->getDebugLoc(); 837 838 for (Use &Op : I->operands()) { 839 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 840 if (OpInst->getDebugLoc() != Empty) 841 return OpInst->getDebugLoc(); 842 } 843 844 return I->getDebugLoc(); 845 } 846 847 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 848 /// is passed, the message relates to that particular instruction. 849 #ifndef NDEBUG 850 static void debugVectorizationMessage(const StringRef Prefix, 851 const StringRef DebugMsg, 852 Instruction *I) { 853 dbgs() << "LV: " << Prefix << DebugMsg; 854 if (I != nullptr) 855 dbgs() << " " << *I; 856 else 857 dbgs() << '.'; 858 dbgs() << '\n'; 859 } 860 #endif 861 862 /// Create an analysis remark that explains why vectorization failed 863 /// 864 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 865 /// RemarkName is the identifier for the remark. If \p I is passed it is an 866 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 867 /// the location of the remark. If \p DL is passed, use it as debug location for 868 /// the remark. \return the remark object that can be streamed to. 869 static OptimizationRemarkAnalysis 870 createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, 871 Instruction *I, DebugLoc DL = {}) { 872 Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader(); 873 // If debug location is attached to the instruction, use it. Otherwise if DL 874 // was not provided, use the loop's. 875 if (I && I->getDebugLoc()) 876 DL = I->getDebugLoc(); 877 else if (!DL) 878 DL = TheLoop->getStartLoc(); 879 880 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 881 } 882 883 namespace llvm { 884 885 /// Return a value for Step multiplied by VF. 886 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, 887 int64_t Step) { 888 assert(Ty->isIntegerTy() && "Expected an integer step"); 889 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); 890 } 891 892 /// Return the runtime value for VF. 893 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { 894 return B.CreateElementCount(Ty, VF); 895 } 896 897 void reportVectorizationFailure(const StringRef DebugMsg, 898 const StringRef OREMsg, const StringRef ORETag, 899 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 900 Instruction *I) { 901 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 902 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 903 ORE->emit( 904 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 905 << "loop not vectorized: " << OREMsg); 906 } 907 908 /// Reports an informative message: print \p Msg for debugging purposes as well 909 /// as an optimization remark. Uses either \p I as location of the remark, or 910 /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the 911 /// remark. If \p DL is passed, use it as debug location for the remark. 912 static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 913 OptimizationRemarkEmitter *ORE, 914 Loop *TheLoop, Instruction *I = nullptr, 915 DebugLoc DL = {}) { 916 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 917 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 918 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, 919 I, DL) 920 << Msg); 921 } 922 923 /// Report successful vectorization of the loop. In case an outer loop is 924 /// vectorized, prepend "outer" to the vectorization remark. 925 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, 926 VectorizationFactor VF, unsigned IC) { 927 LLVM_DEBUG(debugVectorizationMessage( 928 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop", 929 nullptr)); 930 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer "; 931 ORE->emit([&]() { 932 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(), 933 TheLoop->getHeader()) 934 << "vectorized " << LoopType << "loop (vectorization width: " 935 << ore::NV("VectorizationFactor", VF.Width) 936 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")"; 937 }); 938 } 939 940 } // end namespace llvm 941 942 namespace llvm { 943 944 // Loop vectorization cost-model hints how the scalar epilogue loop should be 945 // lowered. 946 enum ScalarEpilogueLowering { 947 948 // The default: allowing scalar epilogues. 949 CM_ScalarEpilogueAllowed, 950 951 // Vectorization with OptForSize: don't allow epilogues. 952 CM_ScalarEpilogueNotAllowedOptSize, 953 954 // A special case of vectorisation with OptForSize: loops with a very small 955 // trip count are considered for vectorization under OptForSize, thereby 956 // making sure the cost of their loop body is dominant, free of runtime 957 // guards and scalar iteration overheads. 958 CM_ScalarEpilogueNotAllowedLowTripLoop, 959 960 // Loop hint predicate indicating an epilogue is undesired. 961 CM_ScalarEpilogueNotNeededUsePredicate, 962 963 // Directive indicating we must either tail fold or not vectorize 964 CM_ScalarEpilogueNotAllowedUsePredicate 965 }; 966 967 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 968 969 /// LoopVectorizationCostModel - estimates the expected speedups due to 970 /// vectorization. 971 /// In many cases vectorization is not profitable. This can happen because of 972 /// a number of reasons. In this class we mainly attempt to predict the 973 /// expected speedup/slowdowns due to the supported instruction set. We use the 974 /// TargetTransformInfo to query the different backends for the cost of 975 /// different operations. 976 class LoopVectorizationCostModel { 977 friend class LoopVectorizationPlanner; 978 979 public: 980 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 981 PredicatedScalarEvolution &PSE, LoopInfo *LI, 982 LoopVectorizationLegality *Legal, 983 const TargetTransformInfo &TTI, 984 const TargetLibraryInfo *TLI, DemandedBits *DB, 985 AssumptionCache *AC, 986 OptimizationRemarkEmitter *ORE, const Function *F, 987 const LoopVectorizeHints *Hints, 988 InterleavedAccessInfo &IAI) 989 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 990 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 991 Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {} 992 993 /// \return An upper bound for the vectorization factors (both fixed and 994 /// scalable). If the factors are 0, vectorization and interleaving should be 995 /// avoided up front. 996 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 997 998 /// \return True if runtime checks are required for vectorization, and false 999 /// otherwise. 1000 bool runtimeChecksRequired(); 1001 1002 /// Setup cost-based decisions for user vectorization factor. 1003 /// \return true if the UserVF is a feasible VF to be chosen. 1004 bool selectUserVectorizationFactor(ElementCount UserVF) { 1005 collectUniformsAndScalars(UserVF); 1006 collectInstsToScalarize(UserVF); 1007 return expectedCost(UserVF).isValid(); 1008 } 1009 1010 /// \return The size (in bits) of the smallest and widest types in the code 1011 /// that needs to be vectorized. We ignore values that remain scalar such as 1012 /// 64 bit loop indices. 1013 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1014 1015 /// \return The desired interleave count. 1016 /// If interleave count has been specified by metadata it will be returned. 1017 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1018 /// are the selected vectorization factor and the cost of the selected VF. 1019 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); 1020 1021 /// Memory access instruction may be vectorized in more than one way. 1022 /// Form of instruction after vectorization depends on cost. 1023 /// This function takes cost-based decisions for Load/Store instructions 1024 /// and collects them in a map. This decisions map is used for building 1025 /// the lists of loop-uniform and loop-scalar instructions. 1026 /// The calculated cost is saved with widening decision in order to 1027 /// avoid redundant calculations. 1028 void setCostBasedWideningDecision(ElementCount VF); 1029 1030 /// A call may be vectorized in different ways depending on whether we have 1031 /// vectorized variants available and whether the target supports masking. 1032 /// This function analyzes all calls in the function at the supplied VF, 1033 /// makes a decision based on the costs of available options, and stores that 1034 /// decision in a map for use in planning and plan execution. 1035 void setVectorizedCallDecision(ElementCount VF); 1036 1037 /// A struct that represents some properties of the register usage 1038 /// of a loop. 1039 struct RegisterUsage { 1040 /// Holds the number of loop invariant values that are used in the loop. 1041 /// The key is ClassID of target-provided register class. 1042 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1043 /// Holds the maximum number of concurrent live intervals in the loop. 1044 /// The key is ClassID of target-provided register class. 1045 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1046 }; 1047 1048 /// \return Returns information about the register usages of the loop for the 1049 /// given vectorization factors. 1050 SmallVector<RegisterUsage, 8> 1051 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1052 1053 /// Collect values we want to ignore in the cost model. 1054 void collectValuesToIgnore(); 1055 1056 /// Collect all element types in the loop for which widening is needed. 1057 void collectElementTypesForWidening(); 1058 1059 /// Split reductions into those that happen in the loop, and those that happen 1060 /// outside. In loop reductions are collected into InLoopReductions. 1061 void collectInLoopReductions(); 1062 1063 /// Returns true if we should use strict in-order reductions for the given 1064 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1065 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1066 /// of FP operations. 1067 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { 1068 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1069 } 1070 1071 /// \returns The smallest bitwidth each instruction can be represented with. 1072 /// The vector equivalents of these instructions should be truncated to this 1073 /// type. 1074 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1075 return MinBWs; 1076 } 1077 1078 /// \returns True if it is more profitable to scalarize instruction \p I for 1079 /// vectorization factor \p VF. 1080 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1081 assert(VF.isVector() && 1082 "Profitable to scalarize relevant only for VF > 1."); 1083 assert( 1084 TheLoop->isInnermost() && 1085 "cost-model should not be used for outer loops (in VPlan-native path)"); 1086 1087 auto Scalars = InstsToScalarize.find(VF); 1088 assert(Scalars != InstsToScalarize.end() && 1089 "VF not yet analyzed for scalarization profitability"); 1090 return Scalars->second.contains(I); 1091 } 1092 1093 /// Returns true if \p I is known to be uniform after vectorization. 1094 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1095 assert( 1096 TheLoop->isInnermost() && 1097 "cost-model should not be used for outer loops (in VPlan-native path)"); 1098 // Pseudo probe needs to be duplicated for each unrolled iteration and 1099 // vector lane so that profiled loop trip count can be accurately 1100 // accumulated instead of being under counted. 1101 if (isa<PseudoProbeInst>(I)) 1102 return false; 1103 1104 if (VF.isScalar()) 1105 return true; 1106 1107 auto UniformsPerVF = Uniforms.find(VF); 1108 assert(UniformsPerVF != Uniforms.end() && 1109 "VF not yet analyzed for uniformity"); 1110 return UniformsPerVF->second.count(I); 1111 } 1112 1113 /// Returns true if \p I is known to be scalar after vectorization. 1114 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1115 assert( 1116 TheLoop->isInnermost() && 1117 "cost-model should not be used for outer loops (in VPlan-native path)"); 1118 if (VF.isScalar()) 1119 return true; 1120 1121 auto ScalarsPerVF = Scalars.find(VF); 1122 assert(ScalarsPerVF != Scalars.end() && 1123 "Scalar values are not calculated for VF"); 1124 return ScalarsPerVF->second.count(I); 1125 } 1126 1127 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1128 /// for vectorization factor \p VF. 1129 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1130 return VF.isVector() && MinBWs.contains(I) && 1131 !isProfitableToScalarize(I, VF) && 1132 !isScalarAfterVectorization(I, VF); 1133 } 1134 1135 /// Decision that was taken during cost calculation for memory instruction. 1136 enum InstWidening { 1137 CM_Unknown, 1138 CM_Widen, // For consecutive accesses with stride +1. 1139 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1140 CM_Interleave, 1141 CM_GatherScatter, 1142 CM_Scalarize, 1143 CM_VectorCall, 1144 CM_IntrinsicCall 1145 }; 1146 1147 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1148 /// instruction \p I and vector width \p VF. 1149 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1150 InstructionCost Cost) { 1151 assert(VF.isVector() && "Expected VF >=2"); 1152 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1153 } 1154 1155 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1156 /// interleaving group \p Grp and vector width \p VF. 1157 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1158 ElementCount VF, InstWidening W, 1159 InstructionCost Cost) { 1160 assert(VF.isVector() && "Expected VF >=2"); 1161 /// Broadcast this decicion to all instructions inside the group. 1162 /// When interleaving, the cost will only be assigned one instruction, the 1163 /// insert position. For other cases, add the appropriate fraction of the 1164 /// total cost to each instruction. This ensures accurate costs are used, 1165 /// even if the insert position instruction is not used. 1166 InstructionCost InsertPosCost = Cost; 1167 InstructionCost OtherMemberCost = 0; 1168 if (W != CM_Interleave) 1169 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers(); 1170 ; 1171 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) { 1172 if (auto *I = Grp->getMember(Idx)) { 1173 if (Grp->getInsertPos() == I) 1174 WideningDecisions[std::make_pair(I, VF)] = 1175 std::make_pair(W, InsertPosCost); 1176 else 1177 WideningDecisions[std::make_pair(I, VF)] = 1178 std::make_pair(W, OtherMemberCost); 1179 } 1180 } 1181 } 1182 1183 /// Return the cost model decision for the given instruction \p I and vector 1184 /// width \p VF. Return CM_Unknown if this instruction did not pass 1185 /// through the cost modeling. 1186 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1187 assert(VF.isVector() && "Expected VF to be a vector VF"); 1188 assert( 1189 TheLoop->isInnermost() && 1190 "cost-model should not be used for outer loops (in VPlan-native path)"); 1191 1192 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1193 auto Itr = WideningDecisions.find(InstOnVF); 1194 if (Itr == WideningDecisions.end()) 1195 return CM_Unknown; 1196 return Itr->second.first; 1197 } 1198 1199 /// Return the vectorization cost for the given instruction \p I and vector 1200 /// width \p VF. 1201 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1202 assert(VF.isVector() && "Expected VF >=2"); 1203 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1204 assert(WideningDecisions.contains(InstOnVF) && 1205 "The cost is not calculated"); 1206 return WideningDecisions[InstOnVF].second; 1207 } 1208 1209 struct CallWideningDecision { 1210 InstWidening Kind; 1211 Function *Variant; 1212 Intrinsic::ID IID; 1213 std::optional<unsigned> MaskPos; 1214 InstructionCost Cost; 1215 }; 1216 1217 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, 1218 Function *Variant, Intrinsic::ID IID, 1219 std::optional<unsigned> MaskPos, 1220 InstructionCost Cost) { 1221 assert(!VF.isScalar() && "Expected vector VF"); 1222 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID, 1223 MaskPos, Cost}; 1224 } 1225 1226 CallWideningDecision getCallWideningDecision(CallInst *CI, 1227 ElementCount VF) const { 1228 assert(!VF.isScalar() && "Expected vector VF"); 1229 return CallWideningDecisions.at(std::make_pair(CI, VF)); 1230 } 1231 1232 /// Return True if instruction \p I is an optimizable truncate whose operand 1233 /// is an induction variable. Such a truncate will be removed by adding a new 1234 /// induction variable with the destination type. 1235 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1236 // If the instruction is not a truncate, return false. 1237 auto *Trunc = dyn_cast<TruncInst>(I); 1238 if (!Trunc) 1239 return false; 1240 1241 // Get the source and destination types of the truncate. 1242 Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1243 Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1244 1245 // If the truncate is free for the given types, return false. Replacing a 1246 // free truncate with an induction variable would add an induction variable 1247 // update instruction to each iteration of the loop. We exclude from this 1248 // check the primary induction variable since it will need an update 1249 // instruction regardless. 1250 Value *Op = Trunc->getOperand(0); 1251 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1252 return false; 1253 1254 // If the truncated value is not an induction variable, return false. 1255 return Legal->isInductionPhi(Op); 1256 } 1257 1258 /// Collects the instructions to scalarize for each predicated instruction in 1259 /// the loop. 1260 void collectInstsToScalarize(ElementCount VF); 1261 1262 /// Collect Uniform and Scalar values for the given \p VF. 1263 /// The sets depend on CM decision for Load/Store instructions 1264 /// that may be vectorized as interleave, gather-scatter or scalarized. 1265 /// Also make a decision on what to do about call instructions in the loop 1266 /// at that VF -- scalarize, call a known vector routine, or call a 1267 /// vector intrinsic. 1268 void collectUniformsAndScalars(ElementCount VF) { 1269 // Do the analysis once. 1270 if (VF.isScalar() || Uniforms.contains(VF)) 1271 return; 1272 setCostBasedWideningDecision(VF); 1273 collectLoopUniforms(VF); 1274 setVectorizedCallDecision(VF); 1275 collectLoopScalars(VF); 1276 } 1277 1278 /// Returns true if the target machine supports masked store operation 1279 /// for the given \p DataType and kind of access to \p Ptr. 1280 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1281 return Legal->isConsecutivePtr(DataType, Ptr) && 1282 TTI.isLegalMaskedStore(DataType, Alignment); 1283 } 1284 1285 /// Returns true if the target machine supports masked load operation 1286 /// for the given \p DataType and kind of access to \p Ptr. 1287 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1288 return Legal->isConsecutivePtr(DataType, Ptr) && 1289 TTI.isLegalMaskedLoad(DataType, Alignment); 1290 } 1291 1292 /// Returns true if the target machine can represent \p V as a masked gather 1293 /// or scatter operation. 1294 bool isLegalGatherOrScatter(Value *V, ElementCount VF) { 1295 bool LI = isa<LoadInst>(V); 1296 bool SI = isa<StoreInst>(V); 1297 if (!LI && !SI) 1298 return false; 1299 auto *Ty = getLoadStoreType(V); 1300 Align Align = getLoadStoreAlignment(V); 1301 if (VF.isVector()) 1302 Ty = VectorType::get(Ty, VF); 1303 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1304 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1305 } 1306 1307 /// Returns true if the target machine supports all of the reduction 1308 /// variables found for the given VF. 1309 bool canVectorizeReductions(ElementCount VF) const { 1310 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1311 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1312 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1313 })); 1314 } 1315 1316 /// Given costs for both strategies, return true if the scalar predication 1317 /// lowering should be used for div/rem. This incorporates an override 1318 /// option so it is not simply a cost comparison. 1319 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1320 InstructionCost SafeDivisorCost) const { 1321 switch (ForceSafeDivisor) { 1322 case cl::BOU_UNSET: 1323 return ScalarCost < SafeDivisorCost; 1324 case cl::BOU_TRUE: 1325 return false; 1326 case cl::BOU_FALSE: 1327 return true; 1328 } 1329 llvm_unreachable("impossible case value"); 1330 } 1331 1332 /// Returns true if \p I is an instruction which requires predication and 1333 /// for which our chosen predication strategy is scalarization (i.e. we 1334 /// don't have an alternate strategy such as masking available). 1335 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1336 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1337 1338 /// Returns true if \p I is an instruction that needs to be predicated 1339 /// at runtime. The result is independent of the predication mechanism. 1340 /// Superset of instructions that return true for isScalarWithPredication. 1341 bool isPredicatedInst(Instruction *I) const; 1342 1343 /// Return the costs for our two available strategies for lowering a 1344 /// div/rem operation which requires speculating at least one lane. 1345 /// First result is for scalarization (will be invalid for scalable 1346 /// vectors); second is for the safe-divisor strategy. 1347 std::pair<InstructionCost, InstructionCost> 1348 getDivRemSpeculationCost(Instruction *I, 1349 ElementCount VF) const; 1350 1351 /// Returns true if \p I is a memory instruction with consecutive memory 1352 /// access that can be widened. 1353 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1354 1355 /// Returns true if \p I is a memory instruction in an interleaved-group 1356 /// of memory accesses that can be vectorized with wide vector loads/stores 1357 /// and shuffles. 1358 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const; 1359 1360 /// Check if \p Instr belongs to any interleaved access group. 1361 bool isAccessInterleaved(Instruction *Instr) const { 1362 return InterleaveInfo.isInterleaved(Instr); 1363 } 1364 1365 /// Get the interleaved access group that \p Instr belongs to. 1366 const InterleaveGroup<Instruction> * 1367 getInterleavedAccessGroup(Instruction *Instr) const { 1368 return InterleaveInfo.getInterleaveGroup(Instr); 1369 } 1370 1371 /// Returns true if we're required to use a scalar epilogue for at least 1372 /// the final iteration of the original loop. 1373 bool requiresScalarEpilogue(bool IsVectorizing) const { 1374 if (!isScalarEpilogueAllowed()) { 1375 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); 1376 return false; 1377 } 1378 // If we might exit from anywhere but the latch and early exit vectorization 1379 // is disabled, we must run the exiting iteration in scalar form. 1380 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && 1381 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) { 1382 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting " 1383 "from latch block\n"); 1384 return true; 1385 } 1386 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { 1387 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " 1388 "interleaved group requires scalar epilogue\n"); 1389 return true; 1390 } 1391 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); 1392 return false; 1393 } 1394 1395 /// Returns true if we're required to use a scalar epilogue for at least 1396 /// the final iteration of the original loop for all VFs in \p Range. 1397 /// A scalar epilogue must either be required for all VFs in \p Range or for 1398 /// none. 1399 bool requiresScalarEpilogue(VFRange Range) const { 1400 auto RequiresScalarEpilogue = [this](ElementCount VF) { 1401 return requiresScalarEpilogue(VF.isVector()); 1402 }; 1403 bool IsRequired = all_of(Range, RequiresScalarEpilogue); 1404 assert( 1405 (IsRequired || none_of(Range, RequiresScalarEpilogue)) && 1406 "all VFs in range must agree on whether a scalar epilogue is required"); 1407 return IsRequired; 1408 } 1409 1410 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1411 /// loop hint annotation. 1412 bool isScalarEpilogueAllowed() const { 1413 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1414 } 1415 1416 /// Returns the TailFoldingStyle that is best for the current loop. 1417 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { 1418 if (!ChosenTailFoldingStyle) 1419 return TailFoldingStyle::None; 1420 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first 1421 : ChosenTailFoldingStyle->second; 1422 } 1423 1424 /// Selects and saves TailFoldingStyle for 2 options - if IV update may 1425 /// overflow or not. 1426 /// \param IsScalableVF true if scalable vector factors enabled. 1427 /// \param UserIC User specific interleave count. 1428 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) { 1429 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet."); 1430 if (!Legal->canFoldTailByMasking()) { 1431 ChosenTailFoldingStyle = 1432 std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); 1433 return; 1434 } 1435 1436 if (!ForceTailFoldingStyle.getNumOccurrences()) { 1437 ChosenTailFoldingStyle = std::make_pair( 1438 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true), 1439 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)); 1440 return; 1441 } 1442 1443 // Set styles when forced. 1444 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(), 1445 ForceTailFoldingStyle.getValue()); 1446 if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) 1447 return; 1448 // Override forced styles if needed. 1449 // FIXME: use actual opcode/data type for analysis here. 1450 // FIXME: Investigate opportunity for fixed vector factor. 1451 // FIXME: support fixed-order recurrences by fixing splice of non VFxUF 1452 // penultimate EVL. 1453 bool EVLIsLegal = 1454 UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) && 1455 !EnableVPlanNativePath && Legal->getFixedOrderRecurrences().empty(); 1456 if (!EVLIsLegal) { 1457 // If for some reason EVL mode is unsupported, fallback to 1458 // DataWithoutLaneMask to try to vectorize the loop with folded tail 1459 // in a generic way. 1460 ChosenTailFoldingStyle = 1461 std::make_pair(TailFoldingStyle::DataWithoutLaneMask, 1462 TailFoldingStyle::DataWithoutLaneMask); 1463 LLVM_DEBUG( 1464 dbgs() 1465 << "LV: Preference for VP intrinsics indicated. Will " 1466 "not try to generate VP Intrinsics " 1467 << (UserIC > 1 1468 ? "since interleave count specified is greater than 1.\n" 1469 : "due to non-interleaving reasons.\n")); 1470 } 1471 } 1472 1473 /// Returns true if all loop blocks should be masked to fold tail loop. 1474 bool foldTailByMasking() const { 1475 // TODO: check if it is possible to check for None style independent of 1476 // IVUpdateMayOverflow flag in getTailFoldingStyle. 1477 return getTailFoldingStyle() != TailFoldingStyle::None; 1478 } 1479 1480 /// Return maximum safe number of elements to be processed per vector 1481 /// iteration, which do not prevent store-load forwarding and are safe with 1482 /// regard to the memory dependencies. Required for EVL-based VPlans to 1483 /// correctly calculate AVL (application vector length) as min(remaining AVL, 1484 /// MaxSafeElements). 1485 /// TODO: need to consider adjusting cost model to use this value as a 1486 /// vectorization factor for EVL-based vectorization. 1487 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; } 1488 1489 /// Returns true if the instructions in this block requires predication 1490 /// for any reason, e.g. because tail folding now requires a predicate 1491 /// or because the block in the original loop was predicated. 1492 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1493 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1494 } 1495 1496 /// Returns true if VP intrinsics with explicit vector length support should 1497 /// be generated in the tail folded loop. 1498 bool foldTailWithEVL() const { 1499 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL; 1500 } 1501 1502 /// Returns true if the Phi is part of an inloop reduction. 1503 bool isInLoopReduction(PHINode *Phi) const { 1504 return InLoopReductions.contains(Phi); 1505 } 1506 1507 /// Returns true if the predicated reduction select should be used to set the 1508 /// incoming value for the reduction phi. 1509 bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const { 1510 // Force to use predicated reduction select since the EVL of the 1511 // second-to-last iteration might not be VF*UF. 1512 if (foldTailWithEVL()) 1513 return true; 1514 return PreferPredicatedReductionSelect || 1515 TTI.preferPredicatedReductionSelect( 1516 Opcode, PhiTy, TargetTransformInfo::ReductionFlags()); 1517 } 1518 1519 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1520 /// with factor VF. Return the cost of the instruction, including 1521 /// scalarization overhead if it's needed. 1522 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1523 1524 /// Estimate cost of a call instruction CI if it were vectorized with factor 1525 /// VF. Return the cost of the instruction, including scalarization overhead 1526 /// if it's needed. 1527 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; 1528 1529 /// Invalidates decisions already taken by the cost model. 1530 void invalidateCostModelingDecisions() { 1531 WideningDecisions.clear(); 1532 CallWideningDecisions.clear(); 1533 Uniforms.clear(); 1534 Scalars.clear(); 1535 } 1536 1537 /// Returns the expected execution cost. The unit of the cost does 1538 /// not matter because we use the 'cost' units to compare different 1539 /// vector widths. The cost that is returned is *not* normalized by 1540 /// the factor width. 1541 InstructionCost expectedCost(ElementCount VF); 1542 1543 bool hasPredStores() const { return NumPredStores > 0; } 1544 1545 /// Returns true if epilogue vectorization is considered profitable, and 1546 /// false otherwise. 1547 /// \p VF is the vectorization factor chosen for the original loop. 1548 /// \p Multiplier is an aditional scaling factor applied to VF before 1549 /// comparing to EpilogueVectorizationMinVF. 1550 bool isEpilogueVectorizationProfitable(const ElementCount VF, 1551 const unsigned IC) const; 1552 1553 /// Returns the execution time cost of an instruction for a given vector 1554 /// width. Vector width of one means scalar. 1555 InstructionCost getInstructionCost(Instruction *I, ElementCount VF); 1556 1557 /// Return the cost of instructions in an inloop reduction pattern, if I is 1558 /// part of that pattern. 1559 std::optional<InstructionCost> getReductionPatternCost(Instruction *I, 1560 ElementCount VF, 1561 Type *VectorTy) const; 1562 1563 /// Returns true if \p Op should be considered invariant and if it is 1564 /// trivially hoistable. 1565 bool shouldConsiderInvariant(Value *Op); 1566 1567 private: 1568 unsigned NumPredStores = 0; 1569 1570 /// \return An upper bound for the vectorization factors for both 1571 /// fixed and scalable vectorization, where the minimum-known number of 1572 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1573 /// disabled or unsupported, then the scalable part will be equal to 1574 /// ElementCount::getScalable(0). 1575 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, 1576 ElementCount UserVF, 1577 bool FoldTailByMasking); 1578 1579 /// \return the maximized element count based on the targets vector 1580 /// registers and the loop trip-count, but limited to a maximum safe VF. 1581 /// This is a helper function of computeFeasibleMaxVF. 1582 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, 1583 unsigned SmallestType, 1584 unsigned WidestType, 1585 ElementCount MaxSafeVF, 1586 bool FoldTailByMasking); 1587 1588 /// Checks if scalable vectorization is supported and enabled. Caches the 1589 /// result to avoid repeated debug dumps for repeated queries. 1590 bool isScalableVectorizationAllowed(); 1591 1592 /// \return the maximum legal scalable VF, based on the safe max number 1593 /// of elements. 1594 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1595 1596 /// Calculate vectorization cost of memory instruction \p I. 1597 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1598 1599 /// The cost computation for scalarized memory instruction. 1600 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1601 1602 /// The cost computation for interleaving group of memory instructions. 1603 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1604 1605 /// The cost computation for Gather/Scatter instruction. 1606 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1607 1608 /// The cost computation for widening instruction \p I with consecutive 1609 /// memory access. 1610 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1611 1612 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1613 /// Load: scalar load + broadcast. 1614 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1615 /// element) 1616 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1617 1618 /// Estimate the overhead of scalarizing an instruction. This is a 1619 /// convenience wrapper for the type-based getScalarizationOverhead API. 1620 InstructionCost getScalarizationOverhead(Instruction *I, 1621 ElementCount VF) const; 1622 1623 /// Returns true if an artificially high cost for emulated masked memrefs 1624 /// should be used. 1625 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1626 1627 /// Map of scalar integer values to the smallest bitwidth they can be legally 1628 /// represented as. The vector equivalents of these values should be truncated 1629 /// to this type. 1630 MapVector<Instruction *, uint64_t> MinBWs; 1631 1632 /// A type representing the costs for instructions if they were to be 1633 /// scalarized rather than vectorized. The entries are Instruction-Cost 1634 /// pairs. 1635 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1636 1637 /// A set containing all BasicBlocks that are known to present after 1638 /// vectorization as a predicated block. 1639 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> 1640 PredicatedBBsAfterVectorization; 1641 1642 /// Records whether it is allowed to have the original scalar loop execute at 1643 /// least once. This may be needed as a fallback loop in case runtime 1644 /// aliasing/dependence checks fail, or to handle the tail/remainder 1645 /// iterations when the trip count is unknown or doesn't divide by the VF, 1646 /// or as a peel-loop to handle gaps in interleave-groups. 1647 /// Under optsize and when the trip count is very small we don't allow any 1648 /// iterations to execute in the scalar loop. 1649 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1650 1651 /// Control finally chosen tail folding style. The first element is used if 1652 /// the IV update may overflow, the second element - if it does not. 1653 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>> 1654 ChosenTailFoldingStyle; 1655 1656 /// true if scalable vectorization is supported and enabled. 1657 std::optional<bool> IsScalableVectorizationAllowed; 1658 1659 /// Maximum safe number of elements to be processed per vector iteration, 1660 /// which do not prevent store-load forwarding and are safe with regard to the 1661 /// memory dependencies. Required for EVL-based veectorization, where this 1662 /// value is used as the upper bound of the safe AVL. 1663 std::optional<unsigned> MaxSafeElements; 1664 1665 /// A map holding scalar costs for different vectorization factors. The 1666 /// presence of a cost for an instruction in the mapping indicates that the 1667 /// instruction will be scalarized when vectorizing with the associated 1668 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1669 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1670 1671 /// Holds the instructions known to be uniform after vectorization. 1672 /// The data is collected per VF. 1673 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1674 1675 /// Holds the instructions known to be scalar after vectorization. 1676 /// The data is collected per VF. 1677 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1678 1679 /// Holds the instructions (address computations) that are forced to be 1680 /// scalarized. 1681 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1682 1683 /// PHINodes of the reductions that should be expanded in-loop. 1684 SmallPtrSet<PHINode *, 4> InLoopReductions; 1685 1686 /// A Map of inloop reduction operations and their immediate chain operand. 1687 /// FIXME: This can be removed once reductions can be costed correctly in 1688 /// VPlan. This was added to allow quick lookup of the inloop operations. 1689 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1690 1691 /// Returns the expected difference in cost from scalarizing the expression 1692 /// feeding a predicated instruction \p PredInst. The instructions to 1693 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1694 /// non-negative return value implies the expression will be scalarized. 1695 /// Currently, only single-use chains are considered for scalarization. 1696 InstructionCost computePredInstDiscount(Instruction *PredInst, 1697 ScalarCostsTy &ScalarCosts, 1698 ElementCount VF); 1699 1700 /// Collect the instructions that are uniform after vectorization. An 1701 /// instruction is uniform if we represent it with a single scalar value in 1702 /// the vectorized loop corresponding to each vector iteration. Examples of 1703 /// uniform instructions include pointer operands of consecutive or 1704 /// interleaved memory accesses. Note that although uniformity implies an 1705 /// instruction will be scalar, the reverse is not true. In general, a 1706 /// scalarized instruction will be represented by VF scalar values in the 1707 /// vectorized loop, each corresponding to an iteration of the original 1708 /// scalar loop. 1709 void collectLoopUniforms(ElementCount VF); 1710 1711 /// Collect the instructions that are scalar after vectorization. An 1712 /// instruction is scalar if it is known to be uniform or will be scalarized 1713 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1714 /// to the list if they are used by a load/store instruction that is marked as 1715 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1716 /// VF values in the vectorized loop, each corresponding to an iteration of 1717 /// the original scalar loop. 1718 void collectLoopScalars(ElementCount VF); 1719 1720 /// Keeps cost model vectorization decision and cost for instructions. 1721 /// Right now it is used for memory instructions only. 1722 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1723 std::pair<InstWidening, InstructionCost>>; 1724 1725 DecisionList WideningDecisions; 1726 1727 using CallDecisionList = 1728 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; 1729 1730 CallDecisionList CallWideningDecisions; 1731 1732 /// Returns true if \p V is expected to be vectorized and it needs to be 1733 /// extracted. 1734 bool needsExtract(Value *V, ElementCount VF) const { 1735 Instruction *I = dyn_cast<Instruction>(V); 1736 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1737 TheLoop->isLoopInvariant(I) || 1738 getWideningDecision(I, VF) == CM_Scalarize) 1739 return false; 1740 1741 // Assume we can vectorize V (and hence we need extraction) if the 1742 // scalars are not computed yet. This can happen, because it is called 1743 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1744 // the scalars are collected. That should be a safe assumption in most 1745 // cases, because we check if the operands have vectorizable types 1746 // beforehand in LoopVectorizationLegality. 1747 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); 1748 }; 1749 1750 /// Returns a range containing only operands needing to be extracted. 1751 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1752 ElementCount VF) const { 1753 return SmallVector<Value *, 4>(make_filter_range( 1754 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1755 } 1756 1757 public: 1758 /// The loop that we evaluate. 1759 Loop *TheLoop; 1760 1761 /// Predicated scalar evolution analysis. 1762 PredicatedScalarEvolution &PSE; 1763 1764 /// Loop Info analysis. 1765 LoopInfo *LI; 1766 1767 /// Vectorization legality. 1768 LoopVectorizationLegality *Legal; 1769 1770 /// Vector target information. 1771 const TargetTransformInfo &TTI; 1772 1773 /// Target Library Info. 1774 const TargetLibraryInfo *TLI; 1775 1776 /// Demanded bits analysis. 1777 DemandedBits *DB; 1778 1779 /// Assumption cache. 1780 AssumptionCache *AC; 1781 1782 /// Interface to emit optimization remarks. 1783 OptimizationRemarkEmitter *ORE; 1784 1785 const Function *TheFunction; 1786 1787 /// Loop Vectorize Hint. 1788 const LoopVectorizeHints *Hints; 1789 1790 /// The interleave access information contains groups of interleaved accesses 1791 /// with the same stride and close to each other. 1792 InterleavedAccessInfo &InterleaveInfo; 1793 1794 /// Values to ignore in the cost model. 1795 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1796 1797 /// Values to ignore in the cost model when VF > 1. 1798 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1799 1800 /// All element types found in the loop. 1801 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1802 1803 /// The kind of cost that we are calculating 1804 TTI::TargetCostKind CostKind; 1805 }; 1806 } // end namespace llvm 1807 1808 namespace { 1809 /// Helper struct to manage generating runtime checks for vectorization. 1810 /// 1811 /// The runtime checks are created up-front in temporary blocks to allow better 1812 /// estimating the cost and un-linked from the existing IR. After deciding to 1813 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1814 /// temporary blocks are completely removed. 1815 class GeneratedRTChecks { 1816 /// Basic block which contains the generated SCEV checks, if any. 1817 BasicBlock *SCEVCheckBlock = nullptr; 1818 1819 /// The value representing the result of the generated SCEV checks. If it is 1820 /// nullptr, either no SCEV checks have been generated or they have been used. 1821 Value *SCEVCheckCond = nullptr; 1822 1823 /// Basic block which contains the generated memory runtime checks, if any. 1824 BasicBlock *MemCheckBlock = nullptr; 1825 1826 /// The value representing the result of the generated memory runtime checks. 1827 /// If it is nullptr, either no memory runtime checks have been generated or 1828 /// they have been used. 1829 Value *MemRuntimeCheckCond = nullptr; 1830 1831 DominatorTree *DT; 1832 LoopInfo *LI; 1833 TargetTransformInfo *TTI; 1834 1835 SCEVExpander SCEVExp; 1836 SCEVExpander MemCheckExp; 1837 1838 bool CostTooHigh = false; 1839 const bool AddBranchWeights; 1840 1841 Loop *OuterLoop = nullptr; 1842 1843 PredicatedScalarEvolution &PSE; 1844 1845 /// The kind of cost that we are calculating 1846 TTI::TargetCostKind CostKind; 1847 1848 public: 1849 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, 1850 LoopInfo *LI, TargetTransformInfo *TTI, 1851 const DataLayout &DL, bool AddBranchWeights, 1852 TTI::TargetCostKind CostKind) 1853 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), 1854 MemCheckExp(*PSE.getSE(), DL, "scev.check"), 1855 AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {} 1856 1857 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1858 /// accurately estimate the cost of the runtime checks. The blocks are 1859 /// un-linked from the IR and are added back during vector code generation. If 1860 /// there is no vector code generation, the check blocks are removed 1861 /// completely. 1862 void create(Loop *L, const LoopAccessInfo &LAI, 1863 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1864 1865 // Hard cutoff to limit compile-time increase in case a very large number of 1866 // runtime checks needs to be generated. 1867 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1868 // profile info. 1869 CostTooHigh = 1870 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1871 if (CostTooHigh) 1872 return; 1873 1874 BasicBlock *LoopHeader = L->getHeader(); 1875 BasicBlock *Preheader = L->getLoopPreheader(); 1876 1877 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1878 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1879 // may be used by SCEVExpander. The blocks will be un-linked from their 1880 // predecessors and removed from LI & DT at the end of the function. 1881 if (!UnionPred.isAlwaysTrue()) { 1882 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1883 nullptr, "vector.scevcheck"); 1884 1885 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1886 &UnionPred, SCEVCheckBlock->getTerminator()); 1887 } 1888 1889 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1890 if (RtPtrChecking.Need) { 1891 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1892 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1893 "vector.memcheck"); 1894 1895 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1896 if (DiffChecks) { 1897 Value *RuntimeVF = nullptr; 1898 MemRuntimeCheckCond = addDiffRuntimeChecks( 1899 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 1900 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 1901 if (!RuntimeVF) 1902 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 1903 return RuntimeVF; 1904 }, 1905 IC); 1906 } else { 1907 MemRuntimeCheckCond = addRuntimeChecks( 1908 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), 1909 MemCheckExp, VectorizerParams::HoistRuntimeChecks); 1910 } 1911 assert(MemRuntimeCheckCond && 1912 "no RT checks generated although RtPtrChecking " 1913 "claimed checks are required"); 1914 } 1915 1916 if (!MemCheckBlock && !SCEVCheckBlock) 1917 return; 1918 1919 // Unhook the temporary block with the checks, update various places 1920 // accordingly. 1921 if (SCEVCheckBlock) 1922 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1923 if (MemCheckBlock) 1924 MemCheckBlock->replaceAllUsesWith(Preheader); 1925 1926 if (SCEVCheckBlock) { 1927 SCEVCheckBlock->getTerminator()->moveBefore( 1928 Preheader->getTerminator()->getIterator()); 1929 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1930 Preheader->getTerminator()->eraseFromParent(); 1931 } 1932 if (MemCheckBlock) { 1933 MemCheckBlock->getTerminator()->moveBefore( 1934 Preheader->getTerminator()->getIterator()); 1935 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1936 Preheader->getTerminator()->eraseFromParent(); 1937 } 1938 1939 DT->changeImmediateDominator(LoopHeader, Preheader); 1940 if (MemCheckBlock) { 1941 DT->eraseNode(MemCheckBlock); 1942 LI->removeBlock(MemCheckBlock); 1943 } 1944 if (SCEVCheckBlock) { 1945 DT->eraseNode(SCEVCheckBlock); 1946 LI->removeBlock(SCEVCheckBlock); 1947 } 1948 1949 // Outer loop is used as part of the later cost calculations. 1950 OuterLoop = L->getParentLoop(); 1951 } 1952 1953 InstructionCost getCost() { 1954 if (SCEVCheckBlock || MemCheckBlock) 1955 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 1956 1957 if (CostTooHigh) { 1958 InstructionCost Cost; 1959 Cost.setInvalid(); 1960 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 1961 return Cost; 1962 } 1963 1964 InstructionCost RTCheckCost = 0; 1965 if (SCEVCheckBlock) 1966 for (Instruction &I : *SCEVCheckBlock) { 1967 if (SCEVCheckBlock->getTerminator() == &I) 1968 continue; 1969 InstructionCost C = TTI->getInstructionCost(&I, CostKind); 1970 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1971 RTCheckCost += C; 1972 } 1973 if (MemCheckBlock) { 1974 InstructionCost MemCheckCost = 0; 1975 for (Instruction &I : *MemCheckBlock) { 1976 if (MemCheckBlock->getTerminator() == &I) 1977 continue; 1978 InstructionCost C = TTI->getInstructionCost(&I, CostKind); 1979 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1980 MemCheckCost += C; 1981 } 1982 1983 // If the runtime memory checks are being created inside an outer loop 1984 // we should find out if these checks are outer loop invariant. If so, 1985 // the checks will likely be hoisted out and so the effective cost will 1986 // reduce according to the outer loop trip count. 1987 if (OuterLoop) { 1988 ScalarEvolution *SE = MemCheckExp.getSE(); 1989 // TODO: If profitable, we could refine this further by analysing every 1990 // individual memory check, since there could be a mixture of loop 1991 // variant and invariant checks that mean the final condition is 1992 // variant. 1993 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond); 1994 if (SE->isLoopInvariant(Cond, OuterLoop)) { 1995 // It seems reasonable to assume that we can reduce the effective 1996 // cost of the checks even when we know nothing about the trip 1997 // count. Assume that the outer loop executes at least twice. 1998 unsigned BestTripCount = 2; 1999 2000 // Get the best known TC estimate. 2001 if (auto EstimatedTC = getSmallBestKnownTC( 2002 PSE, OuterLoop, /* CanUseConstantMax = */ false)) 2003 BestTripCount = *EstimatedTC; 2004 2005 BestTripCount = std::max(BestTripCount, 1U); 2006 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; 2007 2008 // Let's ensure the cost is always at least 1. 2009 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), 2010 (InstructionCost::CostType)1); 2011 2012 if (BestTripCount > 1) 2013 LLVM_DEBUG(dbgs() 2014 << "We expect runtime memory checks to be hoisted " 2015 << "out of the outer loop. Cost reduced from " 2016 << MemCheckCost << " to " << NewMemCheckCost << '\n'); 2017 2018 MemCheckCost = NewMemCheckCost; 2019 } 2020 } 2021 2022 RTCheckCost += MemCheckCost; 2023 } 2024 2025 if (SCEVCheckBlock || MemCheckBlock) 2026 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2027 << "\n"); 2028 2029 return RTCheckCost; 2030 } 2031 2032 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2033 /// unused. 2034 ~GeneratedRTChecks() { 2035 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2036 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2037 if (!SCEVCheckCond) 2038 SCEVCleaner.markResultUsed(); 2039 2040 if (!MemRuntimeCheckCond) 2041 MemCheckCleaner.markResultUsed(); 2042 2043 if (MemRuntimeCheckCond) { 2044 auto &SE = *MemCheckExp.getSE(); 2045 // Memory runtime check generation creates compares that use expanded 2046 // values. Remove them before running the SCEVExpanderCleaners. 2047 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2048 if (MemCheckExp.isInsertedInstruction(&I)) 2049 continue; 2050 SE.forgetValue(&I); 2051 I.eraseFromParent(); 2052 } 2053 } 2054 MemCheckCleaner.cleanup(); 2055 SCEVCleaner.cleanup(); 2056 2057 if (SCEVCheckCond) 2058 SCEVCheckBlock->eraseFromParent(); 2059 if (MemRuntimeCheckCond) 2060 MemCheckBlock->eraseFromParent(); 2061 } 2062 2063 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2064 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2065 /// depending on the generated condition. 2066 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2067 BasicBlock *LoopVectorPreHeader) { 2068 if (!SCEVCheckCond) 2069 return nullptr; 2070 2071 Value *Cond = SCEVCheckCond; 2072 // Mark the check as used, to prevent it from being removed during cleanup. 2073 SCEVCheckCond = nullptr; 2074 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2075 if (C->isZero()) 2076 return nullptr; 2077 2078 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2079 2080 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2081 // Create new preheader for vector loop. 2082 if (OuterLoop) 2083 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2084 2085 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2086 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2087 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2088 SCEVCheckBlock); 2089 2090 DT->addNewBlock(SCEVCheckBlock, Pred); 2091 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2092 2093 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); 2094 if (AddBranchWeights) 2095 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false); 2096 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); 2097 return SCEVCheckBlock; 2098 } 2099 2100 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2101 /// the branches to branch to the vector preheader or \p Bypass, depending on 2102 /// the generated condition. 2103 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2104 BasicBlock *LoopVectorPreHeader) { 2105 // Check if we generated code that checks in runtime if arrays overlap. 2106 if (!MemRuntimeCheckCond) 2107 return nullptr; 2108 2109 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2110 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2111 MemCheckBlock); 2112 2113 DT->addNewBlock(MemCheckBlock, Pred); 2114 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2115 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2116 2117 if (OuterLoop) 2118 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); 2119 2120 BranchInst &BI = 2121 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); 2122 if (AddBranchWeights) { 2123 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false); 2124 } 2125 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); 2126 MemCheckBlock->getTerminator()->setDebugLoc( 2127 Pred->getTerminator()->getDebugLoc()); 2128 2129 // Mark the check as used, to prevent it from being removed during cleanup. 2130 MemRuntimeCheckCond = nullptr; 2131 return MemCheckBlock; 2132 } 2133 }; 2134 } // namespace 2135 2136 static bool useActiveLaneMask(TailFoldingStyle Style) { 2137 return Style == TailFoldingStyle::Data || 2138 Style == TailFoldingStyle::DataAndControlFlow || 2139 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2140 } 2141 2142 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2143 return Style == TailFoldingStyle::DataAndControlFlow || 2144 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2145 } 2146 2147 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2148 // vectorization. The loop needs to be annotated with #pragma omp simd 2149 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2150 // vector length information is not provided, vectorization is not considered 2151 // explicit. Interleave hints are not allowed either. These limitations will be 2152 // relaxed in the future. 2153 // Please, note that we are currently forced to abuse the pragma 'clang 2154 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2155 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2156 // provides *explicit vectorization hints* (LV can bypass legal checks and 2157 // assume that vectorization is legal). However, both hints are implemented 2158 // using the same metadata (llvm.loop.vectorize, processed by 2159 // LoopVectorizeHints). This will be fixed in the future when the native IR 2160 // representation for pragma 'omp simd' is introduced. 2161 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2162 OptimizationRemarkEmitter *ORE) { 2163 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2164 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2165 2166 // Only outer loops with an explicit vectorization hint are supported. 2167 // Unannotated outer loops are ignored. 2168 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2169 return false; 2170 2171 Function *Fn = OuterLp->getHeader()->getParent(); 2172 if (!Hints.allowVectorization(Fn, OuterLp, 2173 true /*VectorizeOnlyWhenForced*/)) { 2174 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2175 return false; 2176 } 2177 2178 if (Hints.getInterleave() > 1) { 2179 // TODO: Interleave support is future work. 2180 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2181 "outer loops.\n"); 2182 Hints.emitRemarkWithHints(); 2183 return false; 2184 } 2185 2186 return true; 2187 } 2188 2189 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2190 OptimizationRemarkEmitter *ORE, 2191 SmallVectorImpl<Loop *> &V) { 2192 // Collect inner loops and outer loops without irreducible control flow. For 2193 // now, only collect outer loops that have explicit vectorization hints. If we 2194 // are stress testing the VPlan H-CFG construction, we collect the outermost 2195 // loop of every loop nest. 2196 if (L.isInnermost() || VPlanBuildStressTest || 2197 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2198 LoopBlocksRPO RPOT(&L); 2199 RPOT.perform(LI); 2200 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2201 V.push_back(&L); 2202 // TODO: Collect inner loops inside marked outer loops in case 2203 // vectorization fails for the outer loop. Do not invoke 2204 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2205 // already known to be reducible. We can use an inherited attribute for 2206 // that. 2207 return; 2208 } 2209 } 2210 for (Loop *InnerL : L) 2211 collectSupportedLoops(*InnerL, LI, ORE, V); 2212 } 2213 2214 //===----------------------------------------------------------------------===// 2215 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2216 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2217 //===----------------------------------------------------------------------===// 2218 2219 /// Compute the transformed value of Index at offset StartValue using step 2220 /// StepValue. 2221 /// For integer induction, returns StartValue + Index * StepValue. 2222 /// For pointer induction, returns StartValue[Index * StepValue]. 2223 /// FIXME: The newly created binary instructions should contain nsw/nuw 2224 /// flags, which can be found from the original scalar operations. 2225 static Value * 2226 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, 2227 Value *Step, 2228 InductionDescriptor::InductionKind InductionKind, 2229 const BinaryOperator *InductionBinOp) { 2230 Type *StepTy = Step->getType(); 2231 Value *CastedIndex = StepTy->isIntegerTy() 2232 ? B.CreateSExtOrTrunc(Index, StepTy) 2233 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2234 if (CastedIndex != Index) { 2235 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2236 Index = CastedIndex; 2237 } 2238 2239 // Note: the IR at this point is broken. We cannot use SE to create any new 2240 // SCEV and then expand it, hoping that SCEV's simplification will give us 2241 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2242 // lead to various SCEV crashes. So all we can do is to use builder and rely 2243 // on InstCombine for future simplifications. Here we handle some trivial 2244 // cases only. 2245 auto CreateAdd = [&B](Value *X, Value *Y) { 2246 assert(X->getType() == Y->getType() && "Types don't match!"); 2247 if (auto *CX = dyn_cast<ConstantInt>(X)) 2248 if (CX->isZero()) 2249 return Y; 2250 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2251 if (CY->isZero()) 2252 return X; 2253 return B.CreateAdd(X, Y); 2254 }; 2255 2256 // We allow X to be a vector type, in which case Y will potentially be 2257 // splatted into a vector with the same element count. 2258 auto CreateMul = [&B](Value *X, Value *Y) { 2259 assert(X->getType()->getScalarType() == Y->getType() && 2260 "Types don't match!"); 2261 if (auto *CX = dyn_cast<ConstantInt>(X)) 2262 if (CX->isOne()) 2263 return Y; 2264 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2265 if (CY->isOne()) 2266 return X; 2267 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2268 if (XVTy && !isa<VectorType>(Y->getType())) 2269 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2270 return B.CreateMul(X, Y); 2271 }; 2272 2273 switch (InductionKind) { 2274 case InductionDescriptor::IK_IntInduction: { 2275 assert(!isa<VectorType>(Index->getType()) && 2276 "Vector indices not supported for integer inductions yet"); 2277 assert(Index->getType() == StartValue->getType() && 2278 "Index type does not match StartValue type"); 2279 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2280 return B.CreateSub(StartValue, Index); 2281 auto *Offset = CreateMul(Index, Step); 2282 return CreateAdd(StartValue, Offset); 2283 } 2284 case InductionDescriptor::IK_PtrInduction: 2285 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step)); 2286 case InductionDescriptor::IK_FpInduction: { 2287 assert(!isa<VectorType>(Index->getType()) && 2288 "Vector indices not supported for FP inductions yet"); 2289 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2290 assert(InductionBinOp && 2291 (InductionBinOp->getOpcode() == Instruction::FAdd || 2292 InductionBinOp->getOpcode() == Instruction::FSub) && 2293 "Original bin op should be defined for FP induction"); 2294 2295 Value *MulExp = B.CreateFMul(Step, Index); 2296 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2297 "induction"); 2298 } 2299 case InductionDescriptor::IK_NoInduction: 2300 return nullptr; 2301 } 2302 llvm_unreachable("invalid enum"); 2303 } 2304 2305 std::optional<unsigned> getMaxVScale(const Function &F, 2306 const TargetTransformInfo &TTI) { 2307 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) 2308 return MaxVScale; 2309 2310 if (F.hasFnAttribute(Attribute::VScaleRange)) 2311 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2312 2313 return std::nullopt; 2314 } 2315 2316 /// For the given VF and UF and maximum trip count computed for the loop, return 2317 /// whether the induction variable might overflow in the vectorized loop. If not, 2318 /// then we know a runtime overflow check always evaluates to false and can be 2319 /// removed. 2320 static bool isIndvarOverflowCheckKnownFalse( 2321 const LoopVectorizationCostModel *Cost, 2322 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2323 // Always be conservative if we don't know the exact unroll factor. 2324 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); 2325 2326 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2327 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2328 2329 // We know the runtime overflow check is known false iff the (max) trip-count 2330 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2331 // the vector loop induction variable. 2332 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) { 2333 uint64_t MaxVF = VF.getKnownMinValue(); 2334 if (VF.isScalable()) { 2335 std::optional<unsigned> MaxVScale = 2336 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2337 if (!MaxVScale) 2338 return false; 2339 MaxVF *= *MaxVScale; 2340 } 2341 2342 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2343 } 2344 2345 return false; 2346 } 2347 2348 // Return whether we allow using masked interleave-groups (for dealing with 2349 // strided loads/stores that reside in predicated blocks, or for dealing 2350 // with gaps). 2351 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2352 // If an override option has been passed in for interleaved accesses, use it. 2353 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2354 return EnableMaskedInterleavedMemAccesses; 2355 2356 return TTI.enableMaskedInterleavedAccessVectorization(); 2357 } 2358 2359 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2360 VPReplicateRecipe *RepRecipe, 2361 const VPLane &Lane, 2362 VPTransformState &State) { 2363 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2364 2365 // Does this instruction return a value ? 2366 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2367 2368 Instruction *Cloned = Instr->clone(); 2369 if (!IsVoidRetTy) { 2370 Cloned->setName(Instr->getName() + ".cloned"); 2371 #if !defined(NDEBUG) 2372 // Verify that VPlan type inference results agree with the type of the 2373 // generated values. 2374 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && 2375 "inferred type and type from generated instructions do not match"); 2376 #endif 2377 } 2378 2379 RepRecipe->setFlags(Cloned); 2380 2381 if (auto DL = Instr->getDebugLoc()) 2382 State.setDebugLocFrom(DL); 2383 2384 // Replace the operands of the cloned instructions with their scalar 2385 // equivalents in the new loop. 2386 for (const auto &I : enumerate(RepRecipe->operands())) { 2387 auto InputLane = Lane; 2388 VPValue *Operand = I.value(); 2389 if (vputils::isUniformAfterVectorization(Operand)) 2390 InputLane = VPLane::getFirstLane(); 2391 Cloned->setOperand(I.index(), State.get(Operand, InputLane)); 2392 } 2393 State.addNewMetadata(Cloned, Instr); 2394 2395 // Place the cloned scalar in the new loop. 2396 State.Builder.Insert(Cloned); 2397 2398 State.set(RepRecipe, Cloned, Lane); 2399 2400 // If we just cloned a new assumption, add it the assumption cache. 2401 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2402 AC->registerAssumption(II); 2403 2404 // End if-block. 2405 VPRegionBlock *Parent = RepRecipe->getParent()->getParent(); 2406 bool IfPredicateInstr = Parent ? Parent->isReplicator() : false; 2407 assert( 2408 (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() || 2409 all_of(RepRecipe->operands(), 2410 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) && 2411 "Expected a recipe is either within a region or all of its operands " 2412 "are defined outside the vectorized region."); 2413 if (IfPredicateInstr) 2414 PredicatedInstructions.push_back(Cloned); 2415 } 2416 2417 Value * 2418 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2419 if (VectorTripCount) 2420 return VectorTripCount; 2421 2422 Value *TC = getTripCount(); 2423 IRBuilder<> Builder(InsertBlock->getTerminator()); 2424 2425 Type *Ty = TC->getType(); 2426 // This is where we can make the step a runtime constant. 2427 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2428 2429 // If the tail is to be folded by masking, round the number of iterations N 2430 // up to a multiple of Step instead of rounding down. This is done by first 2431 // adding Step-1 and then rounding down. Note that it's ok if this addition 2432 // overflows: the vector induction variable will eventually wrap to zero given 2433 // that it starts at zero and its Step is a power of two; the loop will then 2434 // exit, with the last early-exit vector comparison also producing all-true. 2435 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2436 // is accounted for in emitIterationCountCheck that adds an overflow check. 2437 if (Cost->foldTailByMasking()) { 2438 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2439 "VF*UF must be a power of 2 when folding tail by masking"); 2440 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)), 2441 "n.rnd.up"); 2442 } 2443 2444 // Now we need to generate the expression for the part of the loop that the 2445 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2446 // iterations are not required for correctness, or N - Step, otherwise. Step 2447 // is equal to the vectorization factor (number of SIMD elements) times the 2448 // unroll factor (number of SIMD instructions). 2449 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2450 2451 // There are cases where we *must* run at least one iteration in the remainder 2452 // loop. See the cost model for when this can happen. If the step evenly 2453 // divides the trip count, we set the remainder to be equal to the step. If 2454 // the step does not evenly divide the trip count, no adjustment is necessary 2455 // since there will already be scalar iterations. Note that the minimum 2456 // iterations check ensures that N >= Step. 2457 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2458 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2459 R = Builder.CreateSelect(IsZero, Step, R); 2460 } 2461 2462 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2463 2464 return VectorTripCount; 2465 } 2466 2467 void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { 2468 VPBlockBase *ScalarPH = Plan.getScalarPreheader(); 2469 VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor(); 2470 if (PreVectorPH->getNumSuccessors() != 1) { 2471 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors"); 2472 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && 2473 "Unexpected successor"); 2474 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB); 2475 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB); 2476 PreVectorPH = CheckVPIRBB; 2477 } 2478 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); 2479 PreVectorPH->swapSuccessors(); 2480 } 2481 2482 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2483 Value *Count = getTripCount(); 2484 // Reuse existing vector loop preheader for TC checks. 2485 // Note that new preheader block is generated for vector loop. 2486 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2487 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2488 2489 // Generate code to check if the loop's trip count is less than VF * UF, or 2490 // equal to it in case a scalar epilogue is required; this implies that the 2491 // vector trip count is zero. This check also covers the case where adding one 2492 // to the backedge-taken count overflowed leading to an incorrect trip count 2493 // of zero. In this case we will also jump to the scalar loop. 2494 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2495 : ICmpInst::ICMP_ULT; 2496 2497 // If tail is to be folded, vector loop takes care of all iterations. 2498 Type *CountTy = Count->getType(); 2499 Value *CheckMinIters = Builder.getFalse(); 2500 auto CreateStep = [&]() -> Value * { 2501 // Create step with max(MinProTripCount, UF * VF). 2502 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2503 return createStepForVF(Builder, CountTy, VF, UF); 2504 2505 Value *MinProfTC = 2506 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2507 if (!VF.isScalable()) 2508 return MinProfTC; 2509 return Builder.CreateBinaryIntrinsic( 2510 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2511 }; 2512 2513 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2514 if (Style == TailFoldingStyle::None) { 2515 Value *Step = CreateStep(); 2516 ScalarEvolution &SE = *PSE.getSE(); 2517 // TODO: Emit unconditional branch to vector preheader instead of 2518 // conditional branch with known condition. 2519 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop); 2520 // Check if the trip count is < the step. 2521 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) { 2522 // TODO: Ensure step is at most the trip count when determining max VF and 2523 // UF, w/o tail folding. 2524 CheckMinIters = Builder.getTrue(); 2525 } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P), 2526 TripCountSCEV, SE.getSCEV(Step))) { 2527 // Generate the minimum iteration check only if we cannot prove the 2528 // check is known to be true, or known to be false. 2529 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2530 } // else step known to be < trip count, use CheckMinIters preset to false. 2531 } else if (VF.isScalable() && 2532 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2533 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2534 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2535 // an overflow to zero when updating induction variables and so an 2536 // additional overflow check is required before entering the vector loop. 2537 2538 // Get the maximum unsigned value for the type. 2539 Value *MaxUIntTripCount = 2540 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2541 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2542 2543 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2544 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2545 } 2546 2547 // Create new preheader for vector loop. 2548 LoopVectorPreHeader = 2549 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2550 "vector.ph"); 2551 2552 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2553 DT->getNode(Bypass)->getIDom()) && 2554 "TC check is expected to dominate Bypass"); 2555 2556 BranchInst &BI = 2557 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 2558 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 2559 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); 2560 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 2561 LoopBypassBlocks.push_back(TCCheckBlock); 2562 2563 // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. 2564 introduceCheckBlockInVPlan(TCCheckBlock); 2565 } 2566 2567 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2568 BasicBlock *const SCEVCheckBlock = 2569 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader); 2570 if (!SCEVCheckBlock) 2571 return nullptr; 2572 2573 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2574 (OptForSizeBasedOnProfile && 2575 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2576 "Cannot SCEV check stride or overflow when optimizing for size"); 2577 assert(!LoopBypassBlocks.empty() && 2578 "Should already be a bypass block due to iteration count check"); 2579 LoopBypassBlocks.push_back(SCEVCheckBlock); 2580 AddedSafetyChecks = true; 2581 2582 introduceCheckBlockInVPlan(SCEVCheckBlock); 2583 return SCEVCheckBlock; 2584 } 2585 2586 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2587 // VPlan-native path does not do any analysis for runtime checks currently. 2588 if (EnableVPlanNativePath) 2589 return nullptr; 2590 2591 BasicBlock *const MemCheckBlock = 2592 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2593 2594 // Check if we generated code that checks in runtime if arrays overlap. We put 2595 // the checks into a separate block to make the more common case of few 2596 // elements faster. 2597 if (!MemCheckBlock) 2598 return nullptr; 2599 2600 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2601 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2602 "Cannot emit memory checks when optimizing for size, unless forced " 2603 "to vectorize."); 2604 ORE->emit([&]() { 2605 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2606 OrigLoop->getStartLoc(), 2607 OrigLoop->getHeader()) 2608 << "Code-size may be reduced by not forcing " 2609 "vectorization, or by source-code modifications " 2610 "eliminating the need for runtime checks " 2611 "(e.g., adding 'restrict')."; 2612 }); 2613 } 2614 2615 LoopBypassBlocks.push_back(MemCheckBlock); 2616 2617 AddedSafetyChecks = true; 2618 2619 introduceCheckBlockInVPlan(MemCheckBlock); 2620 return MemCheckBlock; 2621 } 2622 2623 /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p 2624 /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must 2625 /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All 2626 /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. 2627 static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { 2628 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); 2629 for (auto &R : make_early_inc_range(*VPBB)) { 2630 assert(!R.isPhi() && "Tried to move phi recipe to end of block"); 2631 R.moveBefore(*IRVPBB, IRVPBB->end()); 2632 } 2633 2634 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB); 2635 // VPBB is now dead and will be cleaned up when the plan gets destroyed. 2636 } 2637 2638 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 2639 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2640 assert(LoopVectorPreHeader && "Invalid loop structure"); 2641 assert((OrigLoop->getUniqueLatchExitBlock() || 2642 Cost->requiresScalarEpilogue(VF.isVector())) && 2643 "loops not exiting via the latch without required epilogue?"); 2644 2645 LoopMiddleBlock = 2646 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2647 LI, nullptr, Twine(Prefix) + "middle.block"); 2648 replaceVPBBWithIRVPBB(Plan.getMiddleBlock(), LoopMiddleBlock); 2649 LoopScalarPreHeader = 2650 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2651 nullptr, Twine(Prefix) + "scalar.ph"); 2652 replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader); 2653 } 2654 2655 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 2656 /// expansion results. 2657 static Value *getExpandedStep(const InductionDescriptor &ID, 2658 const SCEV2ValueTy &ExpandedSCEVs) { 2659 const SCEV *Step = ID.getStep(); 2660 if (auto *C = dyn_cast<SCEVConstant>(Step)) 2661 return C->getValue(); 2662 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 2663 return U->getValue(); 2664 auto I = ExpandedSCEVs.find(Step); 2665 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 2666 return I->second; 2667 } 2668 2669 /// Knowing that loop \p L executes a single vector iteration, add instructions 2670 /// that will get simplified and thus should not have any cost to \p 2671 /// InstsToIgnore. 2672 static void addFullyUnrolledInstructionsToIgnore( 2673 Loop *L, const LoopVectorizationLegality::InductionList &IL, 2674 SmallPtrSetImpl<Instruction *> &InstsToIgnore) { 2675 auto *Cmp = L->getLatchCmpInst(); 2676 if (Cmp) 2677 InstsToIgnore.insert(Cmp); 2678 for (const auto &KV : IL) { 2679 // Extract the key by hand so that it can be used in the lambda below. Note 2680 // that captured structured bindings are a C++20 extension. 2681 const PHINode *IV = KV.first; 2682 2683 // Get next iteration value of the induction variable. 2684 Instruction *IVInst = 2685 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch())); 2686 if (all_of(IVInst->users(), 2687 [&](const User *U) { return U == IV || U == Cmp; })) 2688 InstsToIgnore.insert(IVInst); 2689 } 2690 } 2691 2692 void InnerLoopVectorizer::createInductionAdditionalBypassValues( 2693 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) { 2694 assert(MainVectorTripCount && "Must have bypass information"); 2695 2696 Instruction *OldInduction = Legal->getPrimaryInduction(); 2697 IRBuilder<> BypassBuilder(getAdditionalBypassBlock(), 2698 getAdditionalBypassBlock()->getFirstInsertionPt()); 2699 for (const auto &InductionEntry : Legal->getInductionVars()) { 2700 PHINode *OrigPhi = InductionEntry.first; 2701 const InductionDescriptor &II = InductionEntry.second; 2702 Value *Step = getExpandedStep(II, ExpandedSCEVs); 2703 // For the primary induction the additional bypass end value is known. 2704 // Otherwise it is computed. 2705 Value *EndValueFromAdditionalBypass = MainVectorTripCount; 2706 if (OrigPhi != OldInduction) { 2707 auto *BinOp = II.getInductionBinOp(); 2708 // Fast-math-flags propagate from the original induction instruction. 2709 if (isa_and_nonnull<FPMathOperator>(BinOp)) 2710 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags()); 2711 2712 // Compute the end value for the additional bypass. 2713 EndValueFromAdditionalBypass = 2714 emitTransformedIndex(BypassBuilder, MainVectorTripCount, 2715 II.getStartValue(), Step, II.getKind(), BinOp); 2716 EndValueFromAdditionalBypass->setName("ind.end"); 2717 } 2718 2719 // Store the bypass value here, as it needs to be added as operand to its 2720 // scalar preheader phi node after the epilogue skeleton has been created. 2721 // TODO: Directly add as extra operand to the VPResumePHI recipe. 2722 assert(!Induction2AdditionalBypassValue.contains(OrigPhi) && 2723 "entry for OrigPhi already exits"); 2724 Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass; 2725 } 2726 } 2727 2728 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton( 2729 const SCEV2ValueTy &ExpandedSCEVs) { 2730 /* 2731 In this function we generate a new loop. The new loop will contain 2732 the vectorized instructions while the old loop will continue to run the 2733 scalar remainder. 2734 2735 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 2736 / | preheader are expanded here. Eventually all required SCEV 2737 / | expansion should happen here. 2738 / v 2739 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2740 | / | 2741 | / v 2742 || [ ] <-- vector pre header. 2743 |/ | 2744 | v 2745 | [ ] \ 2746 | [ ]_| <-- vector loop (created during VPlan execution). 2747 | | 2748 | v 2749 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to 2750 | | successors created during VPlan execution) 2751 \/ | 2752 /\ v 2753 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). 2754 | | 2755 (opt) v <-- edge from middle to exit iff epilogue is not required. 2756 | [ ] \ 2757 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header 2758 | | wrapped in VPIRBasicBlock). 2759 \ | 2760 \ v 2761 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) 2762 ... 2763 */ 2764 2765 // Create an empty vector loop, and prepare basic blocks for the runtime 2766 // checks. 2767 createVectorLoopSkeleton(""); 2768 2769 // Now, compare the new count to zero. If it is zero skip the vector loop and 2770 // jump to the scalar loop. This check also covers the case where the 2771 // backedge-taken count is uint##_max: adding one to it will overflow leading 2772 // to an incorrect trip count of zero. In this (rare) case we will also jump 2773 // to the scalar loop. 2774 emitIterationCountCheck(LoopScalarPreHeader); 2775 2776 // Generate the code to check any assumptions that we've made for SCEV 2777 // expressions. 2778 emitSCEVChecks(LoopScalarPreHeader); 2779 2780 // Generate the code that checks in runtime if arrays overlap. We put the 2781 // checks into a separate block to make the more common case of few elements 2782 // faster. 2783 emitMemRuntimeChecks(LoopScalarPreHeader); 2784 2785 return LoopVectorPreHeader; 2786 } 2787 2788 namespace { 2789 2790 struct CSEDenseMapInfo { 2791 static bool canHandle(const Instruction *I) { 2792 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 2793 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 2794 } 2795 2796 static inline Instruction *getEmptyKey() { 2797 return DenseMapInfo<Instruction *>::getEmptyKey(); 2798 } 2799 2800 static inline Instruction *getTombstoneKey() { 2801 return DenseMapInfo<Instruction *>::getTombstoneKey(); 2802 } 2803 2804 static unsigned getHashValue(const Instruction *I) { 2805 assert(canHandle(I) && "Unknown instruction!"); 2806 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 2807 I->value_op_end())); 2808 } 2809 2810 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 2811 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 2812 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 2813 return LHS == RHS; 2814 return LHS->isIdenticalTo(RHS); 2815 } 2816 }; 2817 2818 } // end anonymous namespace 2819 2820 ///Perform cse of induction variable instructions. 2821 static void cse(BasicBlock *BB) { 2822 // Perform simple cse. 2823 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 2824 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 2825 if (!CSEDenseMapInfo::canHandle(&In)) 2826 continue; 2827 2828 // Check if we can replace this instruction with any of the 2829 // visited instructions. 2830 if (Instruction *V = CSEMap.lookup(&In)) { 2831 In.replaceAllUsesWith(V); 2832 In.eraseFromParent(); 2833 continue; 2834 } 2835 2836 CSEMap[&In] = &In; 2837 } 2838 } 2839 2840 InstructionCost 2841 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 2842 ElementCount VF) const { 2843 // We only need to calculate a cost if the VF is scalar; for actual vectors 2844 // we should already have a pre-calculated cost at each VF. 2845 if (!VF.isScalar()) 2846 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; 2847 2848 Type *RetTy = CI->getType(); 2849 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 2850 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) 2851 return *RedCost; 2852 2853 SmallVector<Type *, 4> Tys; 2854 for (auto &ArgOp : CI->args()) 2855 Tys.push_back(ArgOp->getType()); 2856 2857 InstructionCost ScalarCallCost = 2858 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); 2859 2860 // If this is an intrinsic we may have a lower cost for it. 2861 if (getVectorIntrinsicIDForCall(CI, TLI)) { 2862 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 2863 return std::min(ScalarCallCost, IntrinsicCost); 2864 } 2865 return ScalarCallCost; 2866 } 2867 2868 static Type *maybeVectorizeType(Type *Elt, ElementCount VF) { 2869 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 2870 return Elt; 2871 return VectorType::get(Elt, VF); 2872 } 2873 2874 InstructionCost 2875 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 2876 ElementCount VF) const { 2877 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 2878 assert(ID && "Expected intrinsic call!"); 2879 Type *RetTy = maybeVectorizeType(CI->getType(), VF); 2880 FastMathFlags FMF; 2881 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 2882 FMF = FPMO->getFastMathFlags(); 2883 2884 SmallVector<const Value *> Arguments(CI->args()); 2885 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 2886 SmallVector<Type *> ParamTys; 2887 std::transform(FTy->param_begin(), FTy->param_end(), 2888 std::back_inserter(ParamTys), 2889 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); }); 2890 2891 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 2892 dyn_cast<IntrinsicInst>(CI)); 2893 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind); 2894 } 2895 2896 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 2897 // Fix widened non-induction PHIs by setting up the PHI operands. 2898 if (EnableVPlanNativePath) 2899 fixNonInductionPHIs(State); 2900 2901 // Forget the original basic block. 2902 PSE.getSE()->forgetLoop(OrigLoop); 2903 PSE.getSE()->forgetBlockAndLoopDispositions(); 2904 2905 // After vectorization, the exit blocks of the original loop will have 2906 // additional predecessors. Invalidate SCEVs for the exit phis in case SE 2907 // looked through single-entry phis. 2908 SmallVector<BasicBlock *> ExitBlocks; 2909 OrigLoop->getExitBlocks(ExitBlocks); 2910 for (BasicBlock *Exit : ExitBlocks) 2911 for (PHINode &PN : Exit->phis()) 2912 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); 2913 2914 // Don't apply optimizations below when no vector region remains, as they all 2915 // require a vector loop at the moment. 2916 if (!State.Plan->getVectorLoopRegion()) 2917 return; 2918 2919 for (Instruction *PI : PredicatedInstructions) 2920 sinkScalarOperands(&*PI); 2921 2922 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); 2923 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); 2924 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; 2925 2926 // Remove redundant induction instructions. 2927 cse(HeaderBB); 2928 2929 // Set/update profile weights for the vector and remainder loops as original 2930 // loop iterations are now distributed among them. Note that original loop 2931 // becomes the scalar remainder loop after vectorization. 2932 // 2933 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 2934 // end up getting slightly roughened result but that should be OK since 2935 // profile is not inherently precise anyway. Note also possible bypass of 2936 // vector code caused by legality checks is ignored, assigning all the weight 2937 // to the vector loop, optimistically. 2938 // 2939 // For scalable vectorization we can't know at compile time how many 2940 // iterations of the loop are handled in one vector iteration, so instead 2941 // assume a pessimistic vscale of '1'. 2942 Loop *VectorLoop = LI->getLoopFor(HeaderBB); 2943 setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, 2944 VF.getKnownMinValue() * UF); 2945 } 2946 2947 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 2948 // The basic block and loop containing the predicated instruction. 2949 auto *PredBB = PredInst->getParent(); 2950 auto *VectorLoop = LI->getLoopFor(PredBB); 2951 2952 // Initialize a worklist with the operands of the predicated instruction. 2953 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 2954 2955 // Holds instructions that we need to analyze again. An instruction may be 2956 // reanalyzed if we don't yet know if we can sink it or not. 2957 SmallVector<Instruction *, 8> InstsToReanalyze; 2958 2959 // Returns true if a given use occurs in the predicated block. Phi nodes use 2960 // their operands in their corresponding predecessor blocks. 2961 auto IsBlockOfUsePredicated = [&](Use &U) -> bool { 2962 auto *I = cast<Instruction>(U.getUser()); 2963 BasicBlock *BB = I->getParent(); 2964 if (auto *Phi = dyn_cast<PHINode>(I)) 2965 BB = Phi->getIncomingBlock( 2966 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 2967 return BB == PredBB; 2968 }; 2969 2970 // Iteratively sink the scalarized operands of the predicated instruction 2971 // into the block we created for it. When an instruction is sunk, it's 2972 // operands are then added to the worklist. The algorithm ends after one pass 2973 // through the worklist doesn't sink a single instruction. 2974 bool Changed; 2975 do { 2976 // Add the instructions that need to be reanalyzed to the worklist, and 2977 // reset the changed indicator. 2978 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 2979 InstsToReanalyze.clear(); 2980 Changed = false; 2981 2982 while (!Worklist.empty()) { 2983 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 2984 2985 // We can't sink an instruction if it is a phi node, is not in the loop, 2986 // may have side effects or may read from memory. 2987 // TODO: Could do more granular checking to allow sinking 2988 // a load past non-store instructions. 2989 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 2990 I->mayHaveSideEffects() || I->mayReadFromMemory()) 2991 continue; 2992 2993 // If the instruction is already in PredBB, check if we can sink its 2994 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 2995 // sinking the scalar instruction I, hence it appears in PredBB; but it 2996 // may have failed to sink I's operands (recursively), which we try 2997 // (again) here. 2998 if (I->getParent() == PredBB) { 2999 Worklist.insert(I->op_begin(), I->op_end()); 3000 continue; 3001 } 3002 3003 // It's legal to sink the instruction if all its uses occur in the 3004 // predicated block. Otherwise, there's nothing to do yet, and we may 3005 // need to reanalyze the instruction. 3006 if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) { 3007 InstsToReanalyze.push_back(I); 3008 continue; 3009 } 3010 3011 // Move the instruction to the beginning of the predicated block, and add 3012 // it's operands to the worklist. 3013 I->moveBefore(PredBB->getFirstInsertionPt()); 3014 Worklist.insert(I->op_begin(), I->op_end()); 3015 3016 // The sinking may have enabled other instructions to be sunk, so we will 3017 // need to iterate. 3018 Changed = true; 3019 } 3020 } while (Changed); 3021 } 3022 3023 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 3024 auto Iter = vp_depth_first_deep(Plan.getEntry()); 3025 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 3026 for (VPRecipeBase &P : VPBB->phis()) { 3027 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 3028 if (!VPPhi) 3029 continue; 3030 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi)); 3031 // Make sure the builder has a valid insert point. 3032 Builder.SetInsertPoint(NewPhi); 3033 for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) { 3034 VPValue *Inc = VPPhi->getIncomingValue(Idx); 3035 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx); 3036 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]); 3037 } 3038 } 3039 } 3040 } 3041 3042 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 3043 // We should not collect Scalars more than once per VF. Right now, this 3044 // function is called from collectUniformsAndScalars(), which already does 3045 // this check. Collecting Scalars for VF=1 does not make any sense. 3046 assert(VF.isVector() && !Scalars.contains(VF) && 3047 "This function should not be visited twice for the same VF"); 3048 3049 // This avoids any chances of creating a REPLICATE recipe during planning 3050 // since that would result in generation of scalarized code during execution, 3051 // which is not supported for scalable vectors. 3052 if (VF.isScalable()) { 3053 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3054 return; 3055 } 3056 3057 SmallSetVector<Instruction *, 8> Worklist; 3058 3059 // These sets are used to seed the analysis with pointers used by memory 3060 // accesses that will remain scalar. 3061 SmallSetVector<Instruction *, 8> ScalarPtrs; 3062 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 3063 auto *Latch = TheLoop->getLoopLatch(); 3064 3065 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 3066 // The pointer operands of loads and stores will be scalar as long as the 3067 // memory access is not a gather or scatter operation. The value operand of a 3068 // store will remain scalar if the store is scalarized. 3069 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 3070 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 3071 assert(WideningDecision != CM_Unknown && 3072 "Widening decision should be ready at this moment"); 3073 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 3074 if (Ptr == Store->getValueOperand()) 3075 return WideningDecision == CM_Scalarize; 3076 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 3077 "Ptr is neither a value or pointer operand"); 3078 return WideningDecision != CM_GatherScatter; 3079 }; 3080 3081 // A helper that returns true if the given value is a getelementptr 3082 // instruction contained in the loop. 3083 auto IsLoopVaryingGEP = [&](Value *V) { 3084 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V); 3085 }; 3086 3087 // A helper that evaluates a memory access's use of a pointer. If the use will 3088 // be a scalar use and the pointer is only used by memory accesses, we place 3089 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 3090 // PossibleNonScalarPtrs. 3091 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 3092 // We only care about bitcast and getelementptr instructions contained in 3093 // the loop. 3094 if (!IsLoopVaryingGEP(Ptr)) 3095 return; 3096 3097 // If the pointer has already been identified as scalar (e.g., if it was 3098 // also identified as uniform), there's nothing to do. 3099 auto *I = cast<Instruction>(Ptr); 3100 if (Worklist.count(I)) 3101 return; 3102 3103 // If the use of the pointer will be a scalar use, and all users of the 3104 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 3105 // place the pointer in PossibleNonScalarPtrs. 3106 if (IsScalarUse(MemAccess, Ptr) && 3107 all_of(I->users(), IsaPred<LoadInst, StoreInst>)) 3108 ScalarPtrs.insert(I); 3109 else 3110 PossibleNonScalarPtrs.insert(I); 3111 }; 3112 3113 // We seed the scalars analysis with three classes of instructions: (1) 3114 // instructions marked uniform-after-vectorization and (2) bitcast, 3115 // getelementptr and (pointer) phi instructions used by memory accesses 3116 // requiring a scalar use. 3117 // 3118 // (1) Add to the worklist all instructions that have been identified as 3119 // uniform-after-vectorization. 3120 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3121 3122 // (2) Add to the worklist all bitcast and getelementptr instructions used by 3123 // memory accesses requiring a scalar use. The pointer operands of loads and 3124 // stores will be scalar unless the operation is a gather or scatter. 3125 // The value operand of a store will remain scalar if the store is scalarized. 3126 for (auto *BB : TheLoop->blocks()) 3127 for (auto &I : *BB) { 3128 if (auto *Load = dyn_cast<LoadInst>(&I)) { 3129 EvaluatePtrUse(Load, Load->getPointerOperand()); 3130 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 3131 EvaluatePtrUse(Store, Store->getPointerOperand()); 3132 EvaluatePtrUse(Store, Store->getValueOperand()); 3133 } 3134 } 3135 for (auto *I : ScalarPtrs) 3136 if (!PossibleNonScalarPtrs.count(I)) { 3137 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 3138 Worklist.insert(I); 3139 } 3140 3141 // Insert the forced scalars. 3142 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 3143 // induction variable when the PHI user is scalarized. 3144 auto ForcedScalar = ForcedScalars.find(VF); 3145 if (ForcedScalar != ForcedScalars.end()) 3146 for (auto *I : ForcedScalar->second) { 3147 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 3148 Worklist.insert(I); 3149 } 3150 3151 // Expand the worklist by looking through any bitcasts and getelementptr 3152 // instructions we've already identified as scalar. This is similar to the 3153 // expansion step in collectLoopUniforms(); however, here we're only 3154 // expanding to include additional bitcasts and getelementptr instructions. 3155 unsigned Idx = 0; 3156 while (Idx != Worklist.size()) { 3157 Instruction *Dst = Worklist[Idx++]; 3158 if (!IsLoopVaryingGEP(Dst->getOperand(0))) 3159 continue; 3160 auto *Src = cast<Instruction>(Dst->getOperand(0)); 3161 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 3162 auto *J = cast<Instruction>(U); 3163 return !TheLoop->contains(J) || Worklist.count(J) || 3164 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 3165 IsScalarUse(J, Src)); 3166 })) { 3167 Worklist.insert(Src); 3168 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 3169 } 3170 } 3171 3172 // An induction variable will remain scalar if all users of the induction 3173 // variable and induction variable update remain scalar. 3174 for (const auto &Induction : Legal->getInductionVars()) { 3175 auto *Ind = Induction.first; 3176 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3177 3178 // If tail-folding is applied, the primary induction variable will be used 3179 // to feed a vector compare. 3180 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 3181 continue; 3182 3183 // Returns true if \p Indvar is a pointer induction that is used directly by 3184 // load/store instruction \p I. 3185 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 3186 Instruction *I) { 3187 return Induction.second.getKind() == 3188 InductionDescriptor::IK_PtrInduction && 3189 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 3190 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar); 3191 }; 3192 3193 // Determine if all users of the induction variable are scalar after 3194 // vectorization. 3195 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool { 3196 auto *I = cast<Instruction>(U); 3197 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3198 IsDirectLoadStoreFromPtrIndvar(Ind, I); 3199 }); 3200 if (!ScalarInd) 3201 continue; 3202 3203 // If the induction variable update is a fixed-order recurrence, neither the 3204 // induction variable or its update should be marked scalar after 3205 // vectorization. 3206 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate); 3207 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi)) 3208 continue; 3209 3210 // Determine if all users of the induction variable update instruction are 3211 // scalar after vectorization. 3212 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { 3213 auto *I = cast<Instruction>(U); 3214 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 3215 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 3216 }); 3217 if (!ScalarIndUpdate) 3218 continue; 3219 3220 // The induction variable and its update instruction will remain scalar. 3221 Worklist.insert(Ind); 3222 Worklist.insert(IndUpdate); 3223 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 3224 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 3225 << "\n"); 3226 } 3227 3228 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 3229 } 3230 3231 bool LoopVectorizationCostModel::isScalarWithPredication( 3232 Instruction *I, ElementCount VF) const { 3233 if (!isPredicatedInst(I)) 3234 return false; 3235 3236 // Do we have a non-scalar lowering for this predicated 3237 // instruction? No - it is scalar with predication. 3238 switch(I->getOpcode()) { 3239 default: 3240 return true; 3241 case Instruction::Call: 3242 if (VF.isScalar()) 3243 return true; 3244 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF)) 3245 .Kind == CM_Scalarize; 3246 case Instruction::Load: 3247 case Instruction::Store: { 3248 auto *Ptr = getLoadStorePointerOperand(I); 3249 auto *Ty = getLoadStoreType(I); 3250 Type *VTy = Ty; 3251 if (VF.isVector()) 3252 VTy = VectorType::get(Ty, VF); 3253 const Align Alignment = getLoadStoreAlignment(I); 3254 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 3255 TTI.isLegalMaskedGather(VTy, Alignment)) 3256 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 3257 TTI.isLegalMaskedScatter(VTy, Alignment)); 3258 } 3259 case Instruction::UDiv: 3260 case Instruction::SDiv: 3261 case Instruction::SRem: 3262 case Instruction::URem: { 3263 // We have the option to use the safe-divisor idiom to avoid predication. 3264 // The cost based decision here will always select safe-divisor for 3265 // scalable vectors as scalarization isn't legal. 3266 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 3267 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); 3268 } 3269 } 3270 } 3271 3272 // TODO: Fold into LoopVectorizationLegality::isMaskRequired. 3273 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { 3274 // If predication is not needed, avoid it. 3275 // TODO: We can use the loop-preheader as context point here and get 3276 // context sensitive reasoning for isSafeToSpeculativelyExecute. 3277 if (!blockNeedsPredicationForAnyReason(I->getParent()) || 3278 isSafeToSpeculativelyExecute(I) || 3279 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) || 3280 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I)) 3281 return false; 3282 3283 // If the instruction was executed conditionally in the original scalar loop, 3284 // predication is needed with a mask whose lanes are all possibly inactive. 3285 if (Legal->blockNeedsPredication(I->getParent())) 3286 return true; 3287 3288 // All that remain are instructions with side-effects originally executed in 3289 // the loop unconditionally, but now execute under a tail-fold mask (only) 3290 // having at least one active lane (the first). If the side-effects of the 3291 // instruction are invariant, executing it w/o (the tail-folding) mask is safe 3292 // - it will cause the same side-effects as when masked. 3293 switch(I->getOpcode()) { 3294 default: 3295 llvm_unreachable( 3296 "instruction should have been considered by earlier checks"); 3297 case Instruction::Call: 3298 // Side-effects of a Call are assumed to be non-invariant, needing a 3299 // (fold-tail) mask. 3300 assert(Legal->isMaskRequired(I) && 3301 "should have returned earlier for calls not needing a mask"); 3302 return true; 3303 case Instruction::Load: 3304 // If the address is loop invariant no predication is needed. 3305 return !Legal->isInvariant(getLoadStorePointerOperand(I)); 3306 case Instruction::Store: { 3307 // For stores, we need to prove both speculation safety (which follows from 3308 // the same argument as loads), but also must prove the value being stored 3309 // is correct. The easiest form of the later is to require that all values 3310 // stored are the same. 3311 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) && 3312 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand())); 3313 } 3314 case Instruction::UDiv: 3315 case Instruction::SDiv: 3316 case Instruction::SRem: 3317 case Instruction::URem: 3318 // If the divisor is loop-invariant no predication is needed. 3319 return !TheLoop->isLoopInvariant(I->getOperand(1)); 3320 } 3321 } 3322 3323 std::pair<InstructionCost, InstructionCost> 3324 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 3325 ElementCount VF) const { 3326 assert(I->getOpcode() == Instruction::UDiv || 3327 I->getOpcode() == Instruction::SDiv || 3328 I->getOpcode() == Instruction::SRem || 3329 I->getOpcode() == Instruction::URem); 3330 assert(!isSafeToSpeculativelyExecute(I)); 3331 3332 // Scalarization isn't legal for scalable vector types 3333 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 3334 if (!VF.isScalable()) { 3335 // Get the scalarization cost and scale this amount by the probability of 3336 // executing the predicated block. If the instruction is not predicated, 3337 // we fall through to the next case. 3338 ScalarizationCost = 0; 3339 3340 // These instructions have a non-void type, so account for the phi nodes 3341 // that we will create. This cost is likely to be zero. The phi node 3342 // cost, if any, should be scaled by the block probability because it 3343 // models a copy at the end of each predicated block. 3344 ScalarizationCost += VF.getKnownMinValue() * 3345 TTI.getCFInstrCost(Instruction::PHI, CostKind); 3346 3347 // The cost of the non-predicated instruction. 3348 ScalarizationCost += VF.getKnownMinValue() * 3349 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 3350 3351 // The cost of insertelement and extractelement instructions needed for 3352 // scalarization. 3353 ScalarizationCost += getScalarizationOverhead(I, VF); 3354 3355 // Scale the cost by the probability of executing the predicated blocks. 3356 // This assumes the predicated block for each vector lane is equally 3357 // likely. 3358 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 3359 } 3360 InstructionCost SafeDivisorCost = 0; 3361 3362 auto *VecTy = toVectorTy(I->getType(), VF); 3363 3364 // The cost of the select guard to ensure all lanes are well defined 3365 // after we speculate above any internal control flow. 3366 SafeDivisorCost += 3367 TTI.getCmpSelInstrCost(Instruction::Select, VecTy, 3368 toVectorTy(Type::getInt1Ty(I->getContext()), VF), 3369 CmpInst::BAD_ICMP_PREDICATE, CostKind); 3370 3371 // Certain instructions can be cheaper to vectorize if they have a constant 3372 // second vector operand. One example of this are shifts on x86. 3373 Value *Op2 = I->getOperand(1); 3374 auto Op2Info = TTI.getOperandInfo(Op2); 3375 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 3376 Legal->isInvariant(Op2)) 3377 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 3378 3379 SmallVector<const Value *, 4> Operands(I->operand_values()); 3380 SafeDivisorCost += TTI.getArithmeticInstrCost( 3381 I->getOpcode(), VecTy, CostKind, 3382 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 3383 Op2Info, Operands, I); 3384 return {ScalarizationCost, SafeDivisorCost}; 3385 } 3386 3387 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 3388 Instruction *I, ElementCount VF) const { 3389 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 3390 assert(getWideningDecision(I, VF) == CM_Unknown && 3391 "Decision should not be set yet."); 3392 auto *Group = getInterleavedAccessGroup(I); 3393 assert(Group && "Must have a group."); 3394 unsigned InterleaveFactor = Group->getFactor(); 3395 3396 // If the instruction's allocated size doesn't equal its type size, it 3397 // requires padding and will be scalarized. 3398 auto &DL = I->getDataLayout(); 3399 auto *ScalarTy = getLoadStoreType(I); 3400 if (hasIrregularType(ScalarTy, DL)) 3401 return false; 3402 3403 // We currently only know how to emit interleave/deinterleave with 3404 // Factor=2 for scalable vectors. This is purely an implementation 3405 // limit. 3406 if (VF.isScalable() && InterleaveFactor != 2) 3407 return false; 3408 3409 // If the group involves a non-integral pointer, we may not be able to 3410 // losslessly cast all values to a common type. 3411 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 3412 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) { 3413 Instruction *Member = Group->getMember(Idx); 3414 if (!Member) 3415 continue; 3416 auto *MemberTy = getLoadStoreType(Member); 3417 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 3418 // Don't coerce non-integral pointers to integers or vice versa. 3419 if (MemberNI != ScalarNI) 3420 // TODO: Consider adding special nullptr value case here 3421 return false; 3422 if (MemberNI && ScalarNI && 3423 ScalarTy->getPointerAddressSpace() != 3424 MemberTy->getPointerAddressSpace()) 3425 return false; 3426 } 3427 3428 // Check if masking is required. 3429 // A Group may need masking for one of two reasons: it resides in a block that 3430 // needs predication, or it was decided to use masking to deal with gaps 3431 // (either a gap at the end of a load-access that may result in a speculative 3432 // load, or any gaps in a store-access). 3433 bool PredicatedAccessRequiresMasking = 3434 blockNeedsPredicationForAnyReason(I->getParent()) && 3435 Legal->isMaskRequired(I); 3436 bool LoadAccessWithGapsRequiresEpilogMasking = 3437 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 3438 !isScalarEpilogueAllowed(); 3439 bool StoreAccessWithGapsRequiresMasking = 3440 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 3441 if (!PredicatedAccessRequiresMasking && 3442 !LoadAccessWithGapsRequiresEpilogMasking && 3443 !StoreAccessWithGapsRequiresMasking) 3444 return true; 3445 3446 // If masked interleaving is required, we expect that the user/target had 3447 // enabled it, because otherwise it either wouldn't have been created or 3448 // it should have been invalidated by the CostModel. 3449 assert(useMaskedInterleavedAccesses(TTI) && 3450 "Masked interleave-groups for predicated accesses are not enabled."); 3451 3452 if (Group->isReverse()) 3453 return false; 3454 3455 auto *Ty = getLoadStoreType(I); 3456 const Align Alignment = getLoadStoreAlignment(I); 3457 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 3458 : TTI.isLegalMaskedStore(Ty, Alignment); 3459 } 3460 3461 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 3462 Instruction *I, ElementCount VF) { 3463 // Get and ensure we have a valid memory instruction. 3464 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 3465 3466 auto *Ptr = getLoadStorePointerOperand(I); 3467 auto *ScalarTy = getLoadStoreType(I); 3468 3469 // In order to be widened, the pointer should be consecutive, first of all. 3470 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 3471 return false; 3472 3473 // If the instruction is a store located in a predicated block, it will be 3474 // scalarized. 3475 if (isScalarWithPredication(I, VF)) 3476 return false; 3477 3478 // If the instruction's allocated size doesn't equal it's type size, it 3479 // requires padding and will be scalarized. 3480 auto &DL = I->getDataLayout(); 3481 if (hasIrregularType(ScalarTy, DL)) 3482 return false; 3483 3484 return true; 3485 } 3486 3487 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 3488 // We should not collect Uniforms more than once per VF. Right now, 3489 // this function is called from collectUniformsAndScalars(), which 3490 // already does this check. Collecting Uniforms for VF=1 does not make any 3491 // sense. 3492 3493 assert(VF.isVector() && !Uniforms.contains(VF) && 3494 "This function should not be visited twice for the same VF"); 3495 3496 // Visit the list of Uniforms. If we find no uniform value, we won't 3497 // analyze again. Uniforms.count(VF) will return 1. 3498 Uniforms[VF].clear(); 3499 3500 // Now we know that the loop is vectorizable! 3501 // Collect instructions inside the loop that will remain uniform after 3502 // vectorization. 3503 3504 // Global values, params and instructions outside of current loop are out of 3505 // scope. 3506 auto IsOutOfScope = [&](Value *V) -> bool { 3507 Instruction *I = dyn_cast<Instruction>(V); 3508 return (!I || !TheLoop->contains(I)); 3509 }; 3510 3511 // Worklist containing uniform instructions demanding lane 0. 3512 SetVector<Instruction *> Worklist; 3513 3514 // Add uniform instructions demanding lane 0 to the worklist. Instructions 3515 // that require predication must not be considered uniform after 3516 // vectorization, because that would create an erroneous replicating region 3517 // where only a single instance out of VF should be formed. 3518 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void { 3519 if (IsOutOfScope(I)) { 3520 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 3521 << *I << "\n"); 3522 return; 3523 } 3524 if (isPredicatedInst(I)) { 3525 LLVM_DEBUG( 3526 dbgs() << "LV: Found not uniform due to requiring predication: " << *I 3527 << "\n"); 3528 return; 3529 } 3530 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 3531 Worklist.insert(I); 3532 }; 3533 3534 // Start with the conditional branches exiting the loop. If the branch 3535 // condition is an instruction contained in the loop that is only used by the 3536 // branch, it is uniform. Note conditions from uncountable early exits are not 3537 // uniform. 3538 SmallVector<BasicBlock *> Exiting; 3539 TheLoop->getExitingBlocks(Exiting); 3540 for (BasicBlock *E : Exiting) { 3541 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E) 3542 continue; 3543 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0)); 3544 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 3545 AddToWorklistIfAllowed(Cmp); 3546 } 3547 3548 auto PrevVF = VF.divideCoefficientBy(2); 3549 // Return true if all lanes perform the same memory operation, and we can 3550 // thus choose to execute only one. 3551 auto IsUniformMemOpUse = [&](Instruction *I) { 3552 // If the value was already known to not be uniform for the previous 3553 // (smaller VF), it cannot be uniform for the larger VF. 3554 if (PrevVF.isVector()) { 3555 auto Iter = Uniforms.find(PrevVF); 3556 if (Iter != Uniforms.end() && !Iter->second.contains(I)) 3557 return false; 3558 } 3559 if (!Legal->isUniformMemOp(*I, VF)) 3560 return false; 3561 if (isa<LoadInst>(I)) 3562 // Loading the same address always produces the same result - at least 3563 // assuming aliasing and ordering which have already been checked. 3564 return true; 3565 // Storing the same value on every iteration. 3566 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 3567 }; 3568 3569 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) { 3570 InstWidening WideningDecision = getWideningDecision(I, VF); 3571 assert(WideningDecision != CM_Unknown && 3572 "Widening decision should be ready at this moment"); 3573 3574 if (IsUniformMemOpUse(I)) 3575 return true; 3576 3577 return (WideningDecision == CM_Widen || 3578 WideningDecision == CM_Widen_Reverse || 3579 WideningDecision == CM_Interleave); 3580 }; 3581 3582 // Returns true if Ptr is the pointer operand of a memory access instruction 3583 // I, I is known to not require scalarization, and the pointer is not also 3584 // stored. 3585 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 3586 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) 3587 return false; 3588 return getLoadStorePointerOperand(I) == Ptr && 3589 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr)); 3590 }; 3591 3592 // Holds a list of values which are known to have at least one uniform use. 3593 // Note that there may be other uses which aren't uniform. A "uniform use" 3594 // here is something which only demands lane 0 of the unrolled iterations; 3595 // it does not imply that all lanes produce the same value (e.g. this is not 3596 // the usual meaning of uniform) 3597 SetVector<Value *> HasUniformUse; 3598 3599 // Scan the loop for instructions which are either a) known to have only 3600 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 3601 for (auto *BB : TheLoop->blocks()) 3602 for (auto &I : *BB) { 3603 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 3604 switch (II->getIntrinsicID()) { 3605 case Intrinsic::sideeffect: 3606 case Intrinsic::experimental_noalias_scope_decl: 3607 case Intrinsic::assume: 3608 case Intrinsic::lifetime_start: 3609 case Intrinsic::lifetime_end: 3610 if (TheLoop->hasLoopInvariantOperands(&I)) 3611 AddToWorklistIfAllowed(&I); 3612 break; 3613 default: 3614 break; 3615 } 3616 } 3617 3618 // ExtractValue instructions must be uniform, because the operands are 3619 // known to be loop-invariant. 3620 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 3621 assert(IsOutOfScope(EVI->getAggregateOperand()) && 3622 "Expected aggregate value to be loop invariant"); 3623 AddToWorklistIfAllowed(EVI); 3624 continue; 3625 } 3626 3627 // If there's no pointer operand, there's nothing to do. 3628 auto *Ptr = getLoadStorePointerOperand(&I); 3629 if (!Ptr) 3630 continue; 3631 3632 if (IsUniformMemOpUse(&I)) 3633 AddToWorklistIfAllowed(&I); 3634 3635 if (IsVectorizedMemAccessUse(&I, Ptr)) 3636 HasUniformUse.insert(Ptr); 3637 } 3638 3639 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 3640 // demanding) users. Since loops are assumed to be in LCSSA form, this 3641 // disallows uses outside the loop as well. 3642 for (auto *V : HasUniformUse) { 3643 if (IsOutOfScope(V)) 3644 continue; 3645 auto *I = cast<Instruction>(V); 3646 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool { 3647 auto *UI = cast<Instruction>(U); 3648 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V); 3649 }); 3650 if (UsersAreMemAccesses) 3651 AddToWorklistIfAllowed(I); 3652 } 3653 3654 // Expand Worklist in topological order: whenever a new instruction 3655 // is added , its users should be already inside Worklist. It ensures 3656 // a uniform instruction will only be used by uniform instructions. 3657 unsigned Idx = 0; 3658 while (Idx != Worklist.size()) { 3659 Instruction *I = Worklist[Idx++]; 3660 3661 for (auto *OV : I->operand_values()) { 3662 // isOutOfScope operands cannot be uniform instructions. 3663 if (IsOutOfScope(OV)) 3664 continue; 3665 // First order recurrence Phi's should typically be considered 3666 // non-uniform. 3667 auto *OP = dyn_cast<PHINode>(OV); 3668 if (OP && Legal->isFixedOrderRecurrence(OP)) 3669 continue; 3670 // If all the users of the operand are uniform, then add the 3671 // operand into the uniform worklist. 3672 auto *OI = cast<Instruction>(OV); 3673 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 3674 auto *J = cast<Instruction>(U); 3675 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI); 3676 })) 3677 AddToWorklistIfAllowed(OI); 3678 } 3679 } 3680 3681 // For an instruction to be added into Worklist above, all its users inside 3682 // the loop should also be in Worklist. However, this condition cannot be 3683 // true for phi nodes that form a cyclic dependence. We must process phi 3684 // nodes separately. An induction variable will remain uniform if all users 3685 // of the induction variable and induction variable update remain uniform. 3686 // The code below handles both pointer and non-pointer induction variables. 3687 BasicBlock *Latch = TheLoop->getLoopLatch(); 3688 for (const auto &Induction : Legal->getInductionVars()) { 3689 auto *Ind = Induction.first; 3690 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3691 3692 // Determine if all users of the induction variable are uniform after 3693 // vectorization. 3694 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool { 3695 auto *I = cast<Instruction>(U); 3696 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3697 IsVectorizedMemAccessUse(I, Ind); 3698 }); 3699 if (!UniformInd) 3700 continue; 3701 3702 // Determine if all users of the induction variable update instruction are 3703 // uniform after vectorization. 3704 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool { 3705 auto *I = cast<Instruction>(U); 3706 return I == Ind || Worklist.count(I) || 3707 IsVectorizedMemAccessUse(I, IndUpdate); 3708 }); 3709 if (!UniformIndUpdate) 3710 continue; 3711 3712 // The induction variable and its update instruction will remain uniform. 3713 AddToWorklistIfAllowed(Ind); 3714 AddToWorklistIfAllowed(IndUpdate); 3715 } 3716 3717 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 3718 } 3719 3720 bool LoopVectorizationCostModel::runtimeChecksRequired() { 3721 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 3722 3723 if (Legal->getRuntimePointerChecking()->Need) { 3724 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 3725 "runtime pointer checks needed. Enable vectorization of this " 3726 "loop with '#pragma clang loop vectorize(enable)' when " 3727 "compiling with -Os/-Oz", 3728 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3729 return true; 3730 } 3731 3732 if (!PSE.getPredicate().isAlwaysTrue()) { 3733 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 3734 "runtime SCEV checks needed. Enable vectorization of this " 3735 "loop with '#pragma clang loop vectorize(enable)' when " 3736 "compiling with -Os/-Oz", 3737 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3738 return true; 3739 } 3740 3741 // FIXME: Avoid specializing for stride==1 instead of bailing out. 3742 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 3743 reportVectorizationFailure("Runtime stride check for small trip count", 3744 "runtime stride == 1 checks needed. Enable vectorization of " 3745 "this loop without such check by compiling with -Os/-Oz", 3746 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3747 return true; 3748 } 3749 3750 return false; 3751 } 3752 3753 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { 3754 if (IsScalableVectorizationAllowed) 3755 return *IsScalableVectorizationAllowed; 3756 3757 IsScalableVectorizationAllowed = false; 3758 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 3759 return false; 3760 3761 if (Hints->isScalableVectorizationDisabled()) { 3762 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 3763 "ScalableVectorizationDisabled", ORE, TheLoop); 3764 return false; 3765 } 3766 3767 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 3768 3769 auto MaxScalableVF = ElementCount::getScalable( 3770 std::numeric_limits<ElementCount::ScalarTy>::max()); 3771 3772 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 3773 // FIXME: While for scalable vectors this is currently sufficient, this should 3774 // be replaced by a more detailed mechanism that filters out specific VFs, 3775 // instead of invalidating vectorization for a whole set of VFs based on the 3776 // MaxVF. 3777 3778 // Disable scalable vectorization if the loop contains unsupported reductions. 3779 if (!canVectorizeReductions(MaxScalableVF)) { 3780 reportVectorizationInfo( 3781 "Scalable vectorization not supported for the reduction " 3782 "operations found in this loop.", 3783 "ScalableVFUnfeasible", ORE, TheLoop); 3784 return false; 3785 } 3786 3787 // Disable scalable vectorization if the loop contains any instructions 3788 // with element types not supported for scalable vectors. 3789 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 3790 return !Ty->isVoidTy() && 3791 !this->TTI.isElementTypeLegalForScalableVector(Ty); 3792 })) { 3793 reportVectorizationInfo("Scalable vectorization is not supported " 3794 "for all element types found in this loop.", 3795 "ScalableVFUnfeasible", ORE, TheLoop); 3796 return false; 3797 } 3798 3799 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) { 3800 reportVectorizationInfo("The target does not provide maximum vscale value " 3801 "for safe distance analysis.", 3802 "ScalableVFUnfeasible", ORE, TheLoop); 3803 return false; 3804 } 3805 3806 IsScalableVectorizationAllowed = true; 3807 return true; 3808 } 3809 3810 ElementCount 3811 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 3812 if (!isScalableVectorizationAllowed()) 3813 return ElementCount::getScalable(0); 3814 3815 auto MaxScalableVF = ElementCount::getScalable( 3816 std::numeric_limits<ElementCount::ScalarTy>::max()); 3817 if (Legal->isSafeForAnyVectorWidth()) 3818 return MaxScalableVF; 3819 3820 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 3821 // Limit MaxScalableVF by the maximum safe dependence distance. 3822 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); 3823 3824 if (!MaxScalableVF) 3825 reportVectorizationInfo( 3826 "Max legal vector width too small, scalable vectorization " 3827 "unfeasible.", 3828 "ScalableVFUnfeasible", ORE, TheLoop); 3829 3830 return MaxScalableVF; 3831 } 3832 3833 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 3834 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { 3835 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 3836 unsigned SmallestType, WidestType; 3837 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 3838 3839 // Get the maximum safe dependence distance in bits computed by LAA. 3840 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 3841 // the memory accesses that is most restrictive (involved in the smallest 3842 // dependence distance). 3843 unsigned MaxSafeElements = 3844 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 3845 3846 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 3847 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 3848 if (!Legal->isSafeForAnyVectorWidth()) 3849 this->MaxSafeElements = MaxSafeElements; 3850 3851 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 3852 << ".\n"); 3853 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 3854 << ".\n"); 3855 3856 // First analyze the UserVF, fall back if the UserVF should be ignored. 3857 if (UserVF) { 3858 auto MaxSafeUserVF = 3859 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 3860 3861 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 3862 // If `VF=vscale x N` is safe, then so is `VF=N` 3863 if (UserVF.isScalable()) 3864 return FixedScalableVFPair( 3865 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 3866 3867 return UserVF; 3868 } 3869 3870 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 3871 3872 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 3873 // is better to ignore the hint and let the compiler choose a suitable VF. 3874 if (!UserVF.isScalable()) { 3875 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3876 << " is unsafe, clamping to max safe VF=" 3877 << MaxSafeFixedVF << ".\n"); 3878 ORE->emit([&]() { 3879 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3880 TheLoop->getStartLoc(), 3881 TheLoop->getHeader()) 3882 << "User-specified vectorization factor " 3883 << ore::NV("UserVectorizationFactor", UserVF) 3884 << " is unsafe, clamping to maximum safe vectorization factor " 3885 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 3886 }); 3887 return MaxSafeFixedVF; 3888 } 3889 3890 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 3891 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3892 << " is ignored because scalable vectors are not " 3893 "available.\n"); 3894 ORE->emit([&]() { 3895 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3896 TheLoop->getStartLoc(), 3897 TheLoop->getHeader()) 3898 << "User-specified vectorization factor " 3899 << ore::NV("UserVectorizationFactor", UserVF) 3900 << " is ignored because the target does not support scalable " 3901 "vectors. The compiler will pick a more suitable value."; 3902 }); 3903 } else { 3904 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3905 << " is unsafe. Ignoring scalable UserVF.\n"); 3906 ORE->emit([&]() { 3907 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3908 TheLoop->getStartLoc(), 3909 TheLoop->getHeader()) 3910 << "User-specified vectorization factor " 3911 << ore::NV("UserVectorizationFactor", UserVF) 3912 << " is unsafe. Ignoring the hint to let the compiler pick a " 3913 "more suitable value."; 3914 }); 3915 } 3916 } 3917 3918 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 3919 << " / " << WidestType << " bits.\n"); 3920 3921 FixedScalableVFPair Result(ElementCount::getFixed(1), 3922 ElementCount::getScalable(0)); 3923 if (auto MaxVF = 3924 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 3925 MaxSafeFixedVF, FoldTailByMasking)) 3926 Result.FixedVF = MaxVF; 3927 3928 if (auto MaxVF = 3929 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 3930 MaxSafeScalableVF, FoldTailByMasking)) 3931 if (MaxVF.isScalable()) { 3932 Result.ScalableVF = MaxVF; 3933 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 3934 << "\n"); 3935 } 3936 3937 return Result; 3938 } 3939 3940 FixedScalableVFPair 3941 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 3942 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 3943 // TODO: It may be useful to do since it's still likely to be dynamically 3944 // uniform if the target can skip. 3945 reportVectorizationFailure( 3946 "Not inserting runtime ptr check for divergent target", 3947 "runtime pointer checks needed. Not enabled for divergent target", 3948 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 3949 return FixedScalableVFPair::getNone(); 3950 } 3951 3952 ScalarEvolution *SE = PSE.getSE(); 3953 unsigned TC = SE->getSmallConstantTripCount(TheLoop); 3954 unsigned MaxTC = PSE.getSmallConstantMaxTripCount(); 3955 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 3956 if (TC != MaxTC) 3957 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n'); 3958 if (TC == 1) { 3959 reportVectorizationFailure("Single iteration (non) loop", 3960 "loop trip count is one, irrelevant for vectorization", 3961 "SingleIterationLoop", ORE, TheLoop); 3962 return FixedScalableVFPair::getNone(); 3963 } 3964 3965 // If BTC matches the widest induction type and is -1 then the trip count 3966 // computation will wrap to 0 and the vector trip count will be 0. Do not try 3967 // to vectorize. 3968 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop); 3969 if (!isa<SCEVCouldNotCompute>(BTC) && 3970 BTC->getType()->getScalarSizeInBits() >= 3971 Legal->getWidestInductionType()->getScalarSizeInBits() && 3972 SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC, 3973 SE->getMinusOne(BTC->getType()))) { 3974 reportVectorizationFailure( 3975 "Trip count computation wrapped", 3976 "backedge-taken count is -1, loop trip count wrapped to 0", 3977 "TripCountWrapped", ORE, TheLoop); 3978 return FixedScalableVFPair::getNone(); 3979 } 3980 3981 switch (ScalarEpilogueStatus) { 3982 case CM_ScalarEpilogueAllowed: 3983 return computeFeasibleMaxVF(MaxTC, UserVF, false); 3984 case CM_ScalarEpilogueNotAllowedUsePredicate: 3985 [[fallthrough]]; 3986 case CM_ScalarEpilogueNotNeededUsePredicate: 3987 LLVM_DEBUG( 3988 dbgs() << "LV: vector predicate hint/switch found.\n" 3989 << "LV: Not allowing scalar epilogue, creating predicated " 3990 << "vector loop.\n"); 3991 break; 3992 case CM_ScalarEpilogueNotAllowedLowTripLoop: 3993 // fallthrough as a special case of OptForSize 3994 case CM_ScalarEpilogueNotAllowedOptSize: 3995 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 3996 LLVM_DEBUG( 3997 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 3998 else 3999 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4000 << "count.\n"); 4001 4002 // Bail if runtime checks are required, which are not good when optimising 4003 // for size. 4004 if (runtimeChecksRequired()) 4005 return FixedScalableVFPair::getNone(); 4006 4007 break; 4008 } 4009 4010 // The only loops we can vectorize without a scalar epilogue, are loops with 4011 // a bottom-test and a single exiting block. We'd have to handle the fact 4012 // that not every instruction executes on the last iteration. This will 4013 // require a lane mask which varies through the vector loop body. (TODO) 4014 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 4015 // If there was a tail-folding hint/switch, but we can't fold the tail by 4016 // masking, fallback to a vectorization with a scalar epilogue. 4017 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4018 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4019 "scalar epilogue instead.\n"); 4020 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4021 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4022 } 4023 return FixedScalableVFPair::getNone(); 4024 } 4025 4026 // Now try the tail folding 4027 4028 // Invalidate interleave groups that require an epilogue if we can't mask 4029 // the interleave-group. 4030 if (!useMaskedInterleavedAccesses(TTI)) { 4031 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 4032 "No decisions should have been taken at this point"); 4033 // Note: There is no need to invalidate any cost modeling decisions here, as 4034 // none were taken so far. 4035 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4036 } 4037 4038 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true); 4039 4040 // Avoid tail folding if the trip count is known to be a multiple of any VF 4041 // we choose. 4042 std::optional<unsigned> MaxPowerOf2RuntimeVF = 4043 MaxFactors.FixedVF.getFixedValue(); 4044 if (MaxFactors.ScalableVF) { 4045 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 4046 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { 4047 MaxPowerOf2RuntimeVF = std::max<unsigned>( 4048 *MaxPowerOf2RuntimeVF, 4049 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); 4050 } else 4051 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. 4052 } 4053 4054 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { 4055 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && 4056 "MaxFixedVF must be a power of 2"); 4057 unsigned MaxVFtimesIC = 4058 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; 4059 ScalarEvolution *SE = PSE.getSE(); 4060 // Currently only loops with countable exits are vectorized, but calling 4061 // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with 4062 // uncountable exits whilst also ensuring the symbolic maximum and known 4063 // back-edge taken count remain identical for loops with countable exits. 4064 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount(); 4065 assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() && 4066 "Invalid loop count"); 4067 const SCEV *ExitCount = SE->getAddExpr( 4068 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 4069 const SCEV *Rem = SE->getURemExpr( 4070 SE->applyLoopGuards(ExitCount, TheLoop), 4071 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 4072 if (Rem->isZero()) { 4073 // Accept MaxFixedVF if we do not have a tail. 4074 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4075 return MaxFactors; 4076 } 4077 } 4078 4079 // If we don't know the precise trip count, or if the trip count that we 4080 // found modulo the vectorization factor is not zero, try to fold the tail 4081 // by masking. 4082 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4083 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC); 4084 if (foldTailByMasking()) { 4085 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { 4086 LLVM_DEBUG( 4087 dbgs() 4088 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " 4089 "try to generate VP Intrinsics with scalable vector " 4090 "factors only.\n"); 4091 // Tail folded loop using VP intrinsics restricts the VF to be scalable 4092 // for now. 4093 // TODO: extend it for fixed vectors, if required. 4094 assert(MaxFactors.ScalableVF.isScalable() && 4095 "Expected scalable vector factor."); 4096 4097 MaxFactors.FixedVF = ElementCount::getFixed(1); 4098 } 4099 return MaxFactors; 4100 } 4101 4102 // If there was a tail-folding hint/switch, but we can't fold the tail by 4103 // masking, fallback to a vectorization with a scalar epilogue. 4104 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4105 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4106 "scalar epilogue instead.\n"); 4107 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4108 return MaxFactors; 4109 } 4110 4111 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 4112 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 4113 return FixedScalableVFPair::getNone(); 4114 } 4115 4116 if (TC == 0) { 4117 reportVectorizationFailure( 4118 "unable to calculate the loop count due to complex control flow", 4119 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4120 return FixedScalableVFPair::getNone(); 4121 } 4122 4123 reportVectorizationFailure( 4124 "Cannot optimize for size and vectorize at the same time.", 4125 "cannot optimize for size and vectorize at the same time. " 4126 "Enable vectorization of this loop with '#pragma clang loop " 4127 "vectorize(enable)' when compiling with -Os/-Oz", 4128 "NoTailLoopWithOptForSize", ORE, TheLoop); 4129 return FixedScalableVFPair::getNone(); 4130 } 4131 4132 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 4133 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, 4134 ElementCount MaxSafeVF, bool FoldTailByMasking) { 4135 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 4136 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 4137 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4138 : TargetTransformInfo::RGK_FixedWidthVector); 4139 4140 // Convenience function to return the minimum of two ElementCounts. 4141 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 4142 assert((LHS.isScalable() == RHS.isScalable()) && 4143 "Scalable flags must match"); 4144 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 4145 }; 4146 4147 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 4148 // Note that both WidestRegister and WidestType may not be a powers of 2. 4149 auto MaxVectorElementCount = ElementCount::get( 4150 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), 4151 ComputeScalableMaxVF); 4152 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 4153 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4154 << (MaxVectorElementCount * WidestType) << " bits.\n"); 4155 4156 if (!MaxVectorElementCount) { 4157 LLVM_DEBUG(dbgs() << "LV: The target has no " 4158 << (ComputeScalableMaxVF ? "scalable" : "fixed") 4159 << " vector registers.\n"); 4160 return ElementCount::getFixed(1); 4161 } 4162 4163 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 4164 if (MaxVectorElementCount.isScalable() && 4165 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 4166 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 4167 auto Min = Attr.getVScaleRangeMin(); 4168 WidestRegisterMinEC *= Min; 4169 } 4170 4171 // When a scalar epilogue is required, at least one iteration of the scalar 4172 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a 4173 // max VF that results in a dead vector loop. 4174 if (MaxTripCount > 0 && requiresScalarEpilogue(true)) 4175 MaxTripCount -= 1; 4176 4177 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && 4178 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { 4179 // If upper bound loop trip count (TC) is known at compile time there is no 4180 // point in choosing VF greater than TC (as done in the loop below). Select 4181 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is 4182 // scalable, we only fall back on a fixed VF when the TC is less than or 4183 // equal to the known number of lanes. 4184 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); 4185 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 4186 "exceeding the constant trip count: " 4187 << ClampedUpperTripCount << "\n"); 4188 return ElementCount::get( 4189 ClampedUpperTripCount, 4190 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); 4191 } 4192 4193 TargetTransformInfo::RegisterKind RegKind = 4194 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4195 : TargetTransformInfo::RGK_FixedWidthVector; 4196 ElementCount MaxVF = MaxVectorElementCount; 4197 if (MaximizeBandwidth || 4198 (MaximizeBandwidth.getNumOccurrences() == 0 && 4199 (TTI.shouldMaximizeVectorBandwidth(RegKind) || 4200 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { 4201 auto MaxVectorElementCountMaxBW = ElementCount::get( 4202 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), 4203 ComputeScalableMaxVF); 4204 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 4205 4206 // Collect all viable vectorization factors larger than the default MaxVF 4207 // (i.e. MaxVectorElementCount). 4208 SmallVector<ElementCount, 8> VFs; 4209 for (ElementCount VS = MaxVectorElementCount * 2; 4210 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 4211 VFs.push_back(VS); 4212 4213 // For each VF calculate its register usage. 4214 auto RUs = calculateRegisterUsage(VFs); 4215 4216 // Select the largest VF which doesn't require more registers than existing 4217 // ones. 4218 for (int I = RUs.size() - 1; I >= 0; --I) { 4219 const auto &MLU = RUs[I].MaxLocalUsers; 4220 if (all_of(MLU, [&](decltype(MLU.front()) &LU) { 4221 return LU.second <= TTI.getNumberOfRegisters(LU.first); 4222 })) { 4223 MaxVF = VFs[I]; 4224 break; 4225 } 4226 } 4227 if (ElementCount MinVF = 4228 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 4229 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 4230 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 4231 << ") with target's minimum: " << MinVF << '\n'); 4232 MaxVF = MinVF; 4233 } 4234 } 4235 4236 // Invalidate any widening decisions we might have made, in case the loop 4237 // requires prediction (decided later), but we have already made some 4238 // load/store widening decisions. 4239 invalidateCostModelingDecisions(); 4240 } 4241 return MaxVF; 4242 } 4243 4244 /// Convenience function that returns the value of vscale_range iff 4245 /// vscale_range.min == vscale_range.max or otherwise returns the value 4246 /// returned by the corresponding TTI method. 4247 static std::optional<unsigned> 4248 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 4249 const Function *Fn = L->getHeader()->getParent(); 4250 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 4251 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 4252 auto Min = Attr.getVScaleRangeMin(); 4253 auto Max = Attr.getVScaleRangeMax(); 4254 if (Max && Min == Max) 4255 return Max; 4256 } 4257 4258 return TTI.getVScaleForTuning(); 4259 } 4260 4261 /// This function attempts to return a value that represents the vectorization 4262 /// factor at runtime. For fixed-width VFs we know this precisely at compile 4263 /// time, but for scalable VFs we calculate it based on an estimate of the 4264 /// vscale value. 4265 static unsigned getEstimatedRuntimeVF(const Loop *L, 4266 const TargetTransformInfo &TTI, 4267 ElementCount VF) { 4268 unsigned EstimatedVF = VF.getKnownMinValue(); 4269 if (VF.isScalable()) 4270 if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI)) 4271 EstimatedVF *= *VScale; 4272 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); 4273 return EstimatedVF; 4274 } 4275 4276 bool LoopVectorizationPlanner::isMoreProfitable( 4277 const VectorizationFactor &A, const VectorizationFactor &B, 4278 const unsigned MaxTripCount) const { 4279 InstructionCost CostA = A.Cost; 4280 InstructionCost CostB = B.Cost; 4281 4282 // Improve estimate for the vector width if it is scalable. 4283 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 4284 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 4285 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 4286 if (A.Width.isScalable()) 4287 EstimatedWidthA *= *VScale; 4288 if (B.Width.isScalable()) 4289 EstimatedWidthB *= *VScale; 4290 } 4291 4292 // Assume vscale may be larger than 1 (or the value being tuned for), 4293 // so that scalable vectorization is slightly favorable over fixed-width 4294 // vectorization. 4295 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && 4296 A.Width.isScalable() && !B.Width.isScalable(); 4297 4298 auto CmpFn = [PreferScalable](const InstructionCost &LHS, 4299 const InstructionCost &RHS) { 4300 return PreferScalable ? LHS <= RHS : LHS < RHS; 4301 }; 4302 4303 // To avoid the need for FP division: 4304 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB) 4305 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA) 4306 if (!MaxTripCount) 4307 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); 4308 4309 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 4310 InstructionCost VectorCost, 4311 InstructionCost ScalarCost) { 4312 // If the trip count is a known (possibly small) constant, the trip count 4313 // will be rounded up to an integer number of iterations under 4314 // FoldTailByMasking. The total cost in that case will be 4315 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 4316 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 4317 // some extra overheads, but for the purpose of comparing the costs of 4318 // different VFs we can use this to compare the total loop-body cost 4319 // expected after vectorization. 4320 if (CM.foldTailByMasking()) 4321 return VectorCost * divideCeil(MaxTripCount, VF); 4322 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF); 4323 }; 4324 4325 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); 4326 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost); 4327 return CmpFn(RTCostA, RTCostB); 4328 } 4329 4330 bool LoopVectorizationPlanner::isMoreProfitable( 4331 const VectorizationFactor &A, const VectorizationFactor &B) const { 4332 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); 4333 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount); 4334 } 4335 4336 void LoopVectorizationPlanner::emitInvalidCostRemarks( 4337 OptimizationRemarkEmitter *ORE) { 4338 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>; 4339 SmallVector<RecipeVFPair> InvalidCosts; 4340 for (const auto &Plan : VPlans) { 4341 for (ElementCount VF : Plan->vectorFactors()) { 4342 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), 4343 CM, CM.CostKind); 4344 precomputeCosts(*Plan, VF, CostCtx); 4345 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); 4346 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4347 for (auto &R : *VPBB) { 4348 if (!R.cost(VF, CostCtx).isValid()) 4349 InvalidCosts.emplace_back(&R, VF); 4350 } 4351 } 4352 } 4353 } 4354 if (InvalidCosts.empty()) 4355 return; 4356 4357 // Emit a report of VFs with invalid costs in the loop. 4358 4359 // Group the remarks per recipe, keeping the recipe order from InvalidCosts. 4360 DenseMap<VPRecipeBase *, unsigned> Numbering; 4361 unsigned I = 0; 4362 for (auto &Pair : InvalidCosts) 4363 if (!Numbering.count(Pair.first)) 4364 Numbering[Pair.first] = I++; 4365 4366 // Sort the list, first on recipe(number) then on VF. 4367 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) { 4368 if (Numbering[A.first] != Numbering[B.first]) 4369 return Numbering[A.first] < Numbering[B.first]; 4370 const auto &LHS = A.second; 4371 const auto &RHS = B.second; 4372 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 4373 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 4374 }); 4375 4376 // For a list of ordered recipe-VF pairs: 4377 // [(load, VF1), (load, VF2), (store, VF1)] 4378 // group the recipes together to emit separate remarks for: 4379 // load (VF1, VF2) 4380 // store (VF1) 4381 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts); 4382 auto Subset = ArrayRef<RecipeVFPair>(); 4383 do { 4384 if (Subset.empty()) 4385 Subset = Tail.take_front(1); 4386 4387 VPRecipeBase *R = Subset.front().first; 4388 4389 unsigned Opcode = 4390 TypeSwitch<const VPRecipeBase *, unsigned>(R) 4391 .Case<VPHeaderPHIRecipe>( 4392 [](const auto *R) { return Instruction::PHI; }) 4393 .Case<VPWidenSelectRecipe>( 4394 [](const auto *R) { return Instruction::Select; }) 4395 .Case<VPWidenStoreRecipe>( 4396 [](const auto *R) { return Instruction::Store; }) 4397 .Case<VPWidenLoadRecipe>( 4398 [](const auto *R) { return Instruction::Load; }) 4399 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>( 4400 [](const auto *R) { return Instruction::Call; }) 4401 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe, 4402 VPWidenCastRecipe>( 4403 [](const auto *R) { return R->getOpcode(); }) 4404 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) { 4405 return R->getStoredValues().empty() ? Instruction::Load 4406 : Instruction::Store; 4407 }); 4408 4409 // If the next recipe is different, or if there are no other pairs, 4410 // emit a remark for the collated subset. e.g. 4411 // [(load, VF1), (load, VF2))] 4412 // to emit: 4413 // remark: invalid costs for 'load' at VF=(VF1, VF2) 4414 if (Subset == Tail || Tail[Subset.size()].first != R) { 4415 std::string OutString; 4416 raw_string_ostream OS(OutString); 4417 assert(!Subset.empty() && "Unexpected empty range"); 4418 OS << "Recipe with invalid costs prevented vectorization at VF=("; 4419 for (const auto &Pair : Subset) 4420 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; 4421 OS << "):"; 4422 if (Opcode == Instruction::Call) { 4423 StringRef Name = ""; 4424 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) { 4425 Name = Int->getIntrinsicName(); 4426 } else { 4427 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R); 4428 Function *CalledFn = 4429 WidenCall ? WidenCall->getCalledScalarFunction() 4430 : cast<Function>(R->getOperand(R->getNumOperands() - 1) 4431 ->getLiveInIRValue()); 4432 Name = CalledFn->getName(); 4433 } 4434 OS << " call to " << Name; 4435 } else 4436 OS << " " << Instruction::getOpcodeName(Opcode); 4437 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr, 4438 R->getDebugLoc()); 4439 Tail = Tail.drop_front(Subset.size()); 4440 Subset = {}; 4441 } else 4442 // Grow the subset by one element 4443 Subset = Tail.take_front(Subset.size() + 1); 4444 } while (!Tail.empty()); 4445 } 4446 4447 /// Check if any recipe of \p Plan will generate a vector value, which will be 4448 /// assigned a vector register. 4449 static bool willGenerateVectors(VPlan &Plan, ElementCount VF, 4450 const TargetTransformInfo &TTI) { 4451 assert(VF.isVector() && "Checking a scalar VF?"); 4452 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); 4453 DenseSet<VPRecipeBase *> EphemeralRecipes; 4454 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes); 4455 // Set of already visited types. 4456 DenseSet<Type *> Visited; 4457 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( 4458 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { 4459 for (VPRecipeBase &R : *VPBB) { 4460 if (EphemeralRecipes.contains(&R)) 4461 continue; 4462 // Continue early if the recipe is considered to not produce a vector 4463 // result. Note that this includes VPInstruction where some opcodes may 4464 // produce a vector, to preserve existing behavior as VPInstructions model 4465 // aspects not directly mapped to existing IR instructions. 4466 switch (R.getVPDefID()) { 4467 case VPDef::VPDerivedIVSC: 4468 case VPDef::VPScalarIVStepsSC: 4469 case VPDef::VPScalarCastSC: 4470 case VPDef::VPReplicateSC: 4471 case VPDef::VPInstructionSC: 4472 case VPDef::VPCanonicalIVPHISC: 4473 case VPDef::VPVectorPointerSC: 4474 case VPDef::VPReverseVectorPointerSC: 4475 case VPDef::VPExpandSCEVSC: 4476 case VPDef::VPEVLBasedIVPHISC: 4477 case VPDef::VPPredInstPHISC: 4478 case VPDef::VPBranchOnMaskSC: 4479 continue; 4480 case VPDef::VPReductionSC: 4481 case VPDef::VPActiveLaneMaskPHISC: 4482 case VPDef::VPWidenCallSC: 4483 case VPDef::VPWidenCanonicalIVSC: 4484 case VPDef::VPWidenCastSC: 4485 case VPDef::VPWidenGEPSC: 4486 case VPDef::VPWidenIntrinsicSC: 4487 case VPDef::VPWidenSC: 4488 case VPDef::VPWidenSelectSC: 4489 case VPDef::VPBlendSC: 4490 case VPDef::VPFirstOrderRecurrencePHISC: 4491 case VPDef::VPWidenPHISC: 4492 case VPDef::VPWidenIntOrFpInductionSC: 4493 case VPDef::VPWidenPointerInductionSC: 4494 case VPDef::VPReductionPHISC: 4495 case VPDef::VPInterleaveSC: 4496 case VPDef::VPWidenLoadEVLSC: 4497 case VPDef::VPWidenLoadSC: 4498 case VPDef::VPWidenStoreEVLSC: 4499 case VPDef::VPWidenStoreSC: 4500 break; 4501 default: 4502 llvm_unreachable("unhandled recipe"); 4503 } 4504 4505 auto WillWiden = [&TTI, VF](Type *ScalarTy) { 4506 Type *VectorTy = toVectorTy(ScalarTy, VF); 4507 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); 4508 if (!NumLegalParts) 4509 return false; 4510 if (VF.isScalable()) { 4511 // <vscale x 1 x iN> is assumed to be profitable over iN because 4512 // scalable registers are a distinct register class from scalar 4513 // ones. If we ever find a target which wants to lower scalable 4514 // vectors back to scalars, we'll need to update this code to 4515 // explicitly ask TTI about the register class uses for each part. 4516 return NumLegalParts <= VF.getKnownMinValue(); 4517 } 4518 // Two or more parts that share a register - are vectorized. 4519 return NumLegalParts < VF.getKnownMinValue(); 4520 }; 4521 4522 // If no def nor is a store, e.g., branches, continue - no value to check. 4523 if (R.getNumDefinedValues() == 0 && 4524 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>( 4525 &R)) 4526 continue; 4527 // For multi-def recipes, currently only interleaved loads, suffice to 4528 // check first def only. 4529 // For stores check their stored value; for interleaved stores suffice 4530 // the check first stored value only. In all cases this is the second 4531 // operand. 4532 VPValue *ToCheck = 4533 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1); 4534 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck); 4535 if (!Visited.insert({ScalarTy}).second) 4536 continue; 4537 if (WillWiden(ScalarTy)) 4538 return true; 4539 } 4540 } 4541 4542 return false; 4543 } 4544 4545 #ifndef NDEBUG 4546 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { 4547 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); 4548 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 4549 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 4550 assert(any_of(VPlans, 4551 [](std::unique_ptr<VPlan> &P) { 4552 return P->hasVF(ElementCount::getFixed(1)); 4553 }) && 4554 "Expected Scalar VF to be a candidate"); 4555 4556 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 4557 ExpectedCost); 4558 VectorizationFactor ChosenFactor = ScalarCost; 4559 4560 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 4561 if (ForceVectorization && 4562 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) { 4563 // Ignore scalar width, because the user explicitly wants vectorization. 4564 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 4565 // evaluation. 4566 ChosenFactor.Cost = InstructionCost::getMax(); 4567 } 4568 4569 for (auto &P : VPlans) { 4570 for (ElementCount VF : P->vectorFactors()) { 4571 // The cost for scalar VF=1 is already calculated, so ignore it. 4572 if (VF.isScalar()) 4573 continue; 4574 4575 InstructionCost C = CM.expectedCost(VF); 4576 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); 4577 4578 unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width); 4579 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF 4580 << " costs: " << (Candidate.Cost / Width)); 4581 if (VF.isScalable()) 4582 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 4583 << getVScaleForTuning(OrigLoop, TTI).value_or(1) 4584 << ")"); 4585 LLVM_DEBUG(dbgs() << ".\n"); 4586 4587 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { 4588 LLVM_DEBUG( 4589 dbgs() 4590 << "LV: Not considering vector loop of width " << VF 4591 << " because it will not generate any vector instructions.\n"); 4592 continue; 4593 } 4594 4595 if (isMoreProfitable(Candidate, ChosenFactor)) 4596 ChosenFactor = Candidate; 4597 } 4598 } 4599 4600 if (!EnableCondStoresVectorization && CM.hasPredStores()) { 4601 reportVectorizationFailure( 4602 "There are conditional stores.", 4603 "store that is conditionally executed prevents vectorization", 4604 "ConditionalStore", ORE, OrigLoop); 4605 ChosenFactor = ScalarCost; 4606 } 4607 4608 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 4609 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() 4610 << "LV: Vectorization seems to be not beneficial, " 4611 << "but was forced by a user.\n"); 4612 return ChosenFactor; 4613 } 4614 #endif 4615 4616 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( 4617 ElementCount VF) const { 4618 // Cross iteration phis such as reductions need special handling and are 4619 // currently unsupported. 4620 if (any_of(OrigLoop->getHeader()->phis(), 4621 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) 4622 return false; 4623 4624 // Phis with uses outside of the loop require special handling and are 4625 // currently unsupported. 4626 for (const auto &Entry : Legal->getInductionVars()) { 4627 // Look for uses of the value of the induction at the last iteration. 4628 Value *PostInc = 4629 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 4630 for (User *U : PostInc->users()) 4631 if (!OrigLoop->contains(cast<Instruction>(U))) 4632 return false; 4633 // Look for uses of penultimate value of the induction. 4634 for (User *U : Entry.first->users()) 4635 if (!OrigLoop->contains(cast<Instruction>(U))) 4636 return false; 4637 } 4638 4639 // Epilogue vectorization code has not been auditted to ensure it handles 4640 // non-latch exits properly. It may be fine, but it needs auditted and 4641 // tested. 4642 // TODO: Add support for loops with an early exit. 4643 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) 4644 return false; 4645 4646 return true; 4647 } 4648 4649 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 4650 const ElementCount VF, const unsigned IC) const { 4651 // FIXME: We need a much better cost-model to take different parameters such 4652 // as register pressure, code size increase and cost of extra branches into 4653 // account. For now we apply a very crude heuristic and only consider loops 4654 // with vectorization factors larger than a certain value. 4655 4656 // Allow the target to opt out entirely. 4657 if (!TTI.preferEpilogueVectorization()) 4658 return false; 4659 4660 // We also consider epilogue vectorization unprofitable for targets that don't 4661 // consider interleaving beneficial (eg. MVE). 4662 if (TTI.getMaxInterleaveFactor(VF) <= 1) 4663 return false; 4664 4665 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable 4666 // VFs when deciding profitability. 4667 // See related "TODO: extend to support scalable VFs." in 4668 // selectEpilogueVectorizationFactor. 4669 unsigned Multiplier = VF.isFixed() ? IC : 1; 4670 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0 4671 ? EpilogueVectorizationMinVF 4672 : TTI.getEpilogueVectorizationMinVF(); 4673 return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold; 4674 } 4675 4676 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( 4677 const ElementCount MainLoopVF, unsigned IC) { 4678 VectorizationFactor Result = VectorizationFactor::Disabled(); 4679 if (!EnableEpilogueVectorization) { 4680 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); 4681 return Result; 4682 } 4683 4684 if (!CM.isScalarEpilogueAllowed()) { 4685 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " 4686 "epilogue is allowed.\n"); 4687 return Result; 4688 } 4689 4690 // Not really a cost consideration, but check for unsupported cases here to 4691 // simplify the logic. 4692 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 4693 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 4694 "is not a supported candidate.\n"); 4695 return Result; 4696 } 4697 4698 if (EpilogueVectorizationForceVF > 1) { 4699 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 4700 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 4701 if (hasPlanWithVF(ForcedEC)) 4702 return {ForcedEC, 0, 0}; 4703 4704 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 4705 "viable.\n"); 4706 return Result; 4707 } 4708 4709 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 4710 OrigLoop->getHeader()->getParent()->hasMinSize()) { 4711 LLVM_DEBUG( 4712 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 4713 return Result; 4714 } 4715 4716 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) { 4717 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 4718 "this loop\n"); 4719 return Result; 4720 } 4721 4722 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 4723 // the main loop handles 8 lanes per iteration. We could still benefit from 4724 // vectorizing the epilogue loop with VF=4. 4725 ElementCount EstimatedRuntimeVF = 4726 ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF)); 4727 4728 ScalarEvolution &SE = *PSE.getSE(); 4729 Type *TCType = Legal->getWidestInductionType(); 4730 const SCEV *RemainingIterations = nullptr; 4731 unsigned MaxTripCount = 0; 4732 for (auto &NextVF : ProfitableVFs) { 4733 // Skip candidate VFs without a corresponding VPlan. 4734 if (!hasPlanWithVF(NextVF.Width)) 4735 continue; 4736 4737 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable 4738 // vectors) or > the VF of the main loop (fixed vectors). 4739 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 4740 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 4741 (NextVF.Width.isScalable() && 4742 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) || 4743 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && 4744 ElementCount::isKnownGT(NextVF.Width, MainLoopVF))) 4745 continue; 4746 4747 // If NextVF is greater than the number of remaining iterations, the 4748 // epilogue loop would be dead. Skip such factors. 4749 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 4750 // TODO: extend to support scalable VFs. 4751 if (!RemainingIterations) { 4752 const SCEV *TC = vputils::getSCEVExprForVPValue( 4753 getPlanFor(NextVF.Width).getTripCount(), SE); 4754 assert(!isa<SCEVCouldNotCompute>(TC) && 4755 "Trip count SCEV must be computable"); 4756 RemainingIterations = SE.getURemExpr( 4757 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 4758 MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1; 4759 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, 4760 SE.getConstant(TCType, MaxTripCount))) { 4761 MaxTripCount = 4762 SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); 4763 } 4764 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " 4765 << MaxTripCount << "\n"); 4766 } 4767 if (SE.isKnownPredicate( 4768 CmpInst::ICMP_UGT, 4769 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 4770 RemainingIterations)) 4771 continue; 4772 } 4773 4774 if (Result.Width.isScalar() || 4775 isMoreProfitable(NextVF, Result, MaxTripCount)) 4776 Result = NextVF; 4777 } 4778 4779 if (Result != VectorizationFactor::Disabled()) 4780 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 4781 << Result.Width << "\n"); 4782 return Result; 4783 } 4784 4785 std::pair<unsigned, unsigned> 4786 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 4787 unsigned MinWidth = -1U; 4788 unsigned MaxWidth = 8; 4789 const DataLayout &DL = TheFunction->getDataLayout(); 4790 // For in-loop reductions, no element types are added to ElementTypesInLoop 4791 // if there are no loads/stores in the loop. In this case, check through the 4792 // reduction variables to determine the maximum width. 4793 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 4794 // Reset MaxWidth so that we can find the smallest type used by recurrences 4795 // in the loop. 4796 MaxWidth = -1U; 4797 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 4798 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 4799 // When finding the min width used by the recurrence we need to account 4800 // for casts on the input operands of the recurrence. 4801 MaxWidth = std::min<unsigned>( 4802 MaxWidth, std::min<unsigned>( 4803 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 4804 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 4805 } 4806 } else { 4807 for (Type *T : ElementTypesInLoop) { 4808 MinWidth = std::min<unsigned>( 4809 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 4810 MaxWidth = std::max<unsigned>( 4811 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 4812 } 4813 } 4814 return {MinWidth, MaxWidth}; 4815 } 4816 4817 void LoopVectorizationCostModel::collectElementTypesForWidening() { 4818 ElementTypesInLoop.clear(); 4819 // For each block. 4820 for (BasicBlock *BB : TheLoop->blocks()) { 4821 // For each instruction in the loop. 4822 for (Instruction &I : BB->instructionsWithoutDebug()) { 4823 Type *T = I.getType(); 4824 4825 // Skip ignored values. 4826 if (ValuesToIgnore.count(&I)) 4827 continue; 4828 4829 // Only examine Loads, Stores and PHINodes. 4830 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 4831 continue; 4832 4833 // Examine PHI nodes that are reduction variables. Update the type to 4834 // account for the recurrence type. 4835 if (auto *PN = dyn_cast<PHINode>(&I)) { 4836 if (!Legal->isReductionVariable(PN)) 4837 continue; 4838 const RecurrenceDescriptor &RdxDesc = 4839 Legal->getReductionVars().find(PN)->second; 4840 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 4841 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 4842 RdxDesc.getRecurrenceType(), 4843 TargetTransformInfo::ReductionFlags())) 4844 continue; 4845 T = RdxDesc.getRecurrenceType(); 4846 } 4847 4848 // Examine the stored values. 4849 if (auto *ST = dyn_cast<StoreInst>(&I)) 4850 T = ST->getValueOperand()->getType(); 4851 4852 assert(T->isSized() && 4853 "Expected the load/store/recurrence type to be sized"); 4854 4855 ElementTypesInLoop.insert(T); 4856 } 4857 } 4858 } 4859 4860 unsigned 4861 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 4862 InstructionCost LoopCost) { 4863 // -- The interleave heuristics -- 4864 // We interleave the loop in order to expose ILP and reduce the loop overhead. 4865 // There are many micro-architectural considerations that we can't predict 4866 // at this level. For example, frontend pressure (on decode or fetch) due to 4867 // code size, or the number and capabilities of the execution ports. 4868 // 4869 // We use the following heuristics to select the interleave count: 4870 // 1. If the code has reductions, then we interleave to break the cross 4871 // iteration dependency. 4872 // 2. If the loop is really small, then we interleave to reduce the loop 4873 // overhead. 4874 // 3. We don't interleave if we think that we will spill registers to memory 4875 // due to the increased register pressure. 4876 4877 if (!isScalarEpilogueAllowed()) 4878 return 1; 4879 4880 // Do not interleave if EVL is preferred and no User IC is specified. 4881 if (foldTailWithEVL()) { 4882 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " 4883 "Unroll factor forced to be 1.\n"); 4884 return 1; 4885 } 4886 4887 // We used the distance for the interleave count. 4888 if (!Legal->isSafeForAnyVectorWidth()) 4889 return 1; 4890 4891 // We don't attempt to perform interleaving for loops with uncountable early 4892 // exits because the VPInstruction::AnyOf code cannot currently handle 4893 // multiple parts. 4894 if (Legal->hasUncountableEarlyExit()) 4895 return 1; 4896 4897 auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); 4898 const bool HasReductions = !Legal->getReductionVars().empty(); 4899 4900 // If we did not calculate the cost for VF (because the user selected the VF) 4901 // then we calculate the cost of VF here. 4902 if (LoopCost == 0) { 4903 LoopCost = expectedCost(VF); 4904 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 4905 4906 // Loop body is free and there is no need for interleaving. 4907 if (LoopCost == 0) 4908 return 1; 4909 } 4910 4911 RegisterUsage R = calculateRegisterUsage({VF})[0]; 4912 // We divide by these constants so assume that we have at least one 4913 // instruction that uses at least one register. 4914 for (auto &Pair : R.MaxLocalUsers) { 4915 Pair.second = std::max(Pair.second, 1U); 4916 } 4917 4918 // We calculate the interleave count using the following formula. 4919 // Subtract the number of loop invariants from the number of available 4920 // registers. These registers are used by all of the interleaved instances. 4921 // Next, divide the remaining registers by the number of registers that is 4922 // required by the loop, in order to estimate how many parallel instances 4923 // fit without causing spills. All of this is rounded down if necessary to be 4924 // a power of two. We want power of two interleave count to simplify any 4925 // addressing operations or alignment considerations. 4926 // We also want power of two interleave counts to ensure that the induction 4927 // variable of the vector loop wraps to zero, when tail is folded by masking; 4928 // this currently happens when OptForSize, in which case IC is set to 1 above. 4929 unsigned IC = UINT_MAX; 4930 4931 for (const auto &Pair : R.MaxLocalUsers) { 4932 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first); 4933 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 4934 << " registers of " 4935 << TTI.getRegisterClassName(Pair.first) 4936 << " register class\n"); 4937 if (VF.isScalar()) { 4938 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 4939 TargetNumRegisters = ForceTargetNumScalarRegs; 4940 } else { 4941 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 4942 TargetNumRegisters = ForceTargetNumVectorRegs; 4943 } 4944 unsigned MaxLocalUsers = Pair.second; 4945 unsigned LoopInvariantRegs = 0; 4946 if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end()) 4947 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first]; 4948 4949 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 4950 MaxLocalUsers); 4951 // Don't count the induction variable as interleaved. 4952 if (EnableIndVarRegisterHeur) { 4953 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 4954 std::max(1U, (MaxLocalUsers - 1))); 4955 } 4956 4957 IC = std::min(IC, TmpIC); 4958 } 4959 4960 // Clamp the interleave ranges to reasonable counts. 4961 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 4962 4963 // Check if the user has overridden the max. 4964 if (VF.isScalar()) { 4965 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 4966 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 4967 } else { 4968 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 4969 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 4970 } 4971 4972 unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF); 4973 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4974 if (KnownTC > 0) { 4975 // At least one iteration must be scalar when this constraint holds. So the 4976 // maximum available iterations for interleaving is one less. 4977 unsigned AvailableTC = 4978 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC; 4979 4980 // If trip count is known we select between two prospective ICs, where 4981 // 1) the aggressive IC is capped by the trip count divided by VF 4982 // 2) the conservative IC is capped by the trip count divided by (VF * 2) 4983 // The final IC is selected in a way that the epilogue loop trip count is 4984 // minimized while maximizing the IC itself, so that we either run the 4985 // vector loop at least once if it generates a small epilogue loop, or else 4986 // we run the vector loop at least twice. 4987 4988 unsigned InterleaveCountUB = bit_floor( 4989 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount))); 4990 unsigned InterleaveCountLB = bit_floor(std::max( 4991 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); 4992 MaxInterleaveCount = InterleaveCountLB; 4993 4994 if (InterleaveCountUB != InterleaveCountLB) { 4995 unsigned TailTripCountUB = 4996 (AvailableTC % (EstimatedVF * InterleaveCountUB)); 4997 unsigned TailTripCountLB = 4998 (AvailableTC % (EstimatedVF * InterleaveCountLB)); 4999 // If both produce same scalar tail, maximize the IC to do the same work 5000 // in fewer vector loop iterations 5001 if (TailTripCountUB == TailTripCountLB) 5002 MaxInterleaveCount = InterleaveCountUB; 5003 } 5004 } else if (BestKnownTC && *BestKnownTC > 0) { 5005 // At least one iteration must be scalar when this constraint holds. So the 5006 // maximum available iterations for interleaving is one less. 5007 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) 5008 ? (*BestKnownTC) - 1 5009 : *BestKnownTC; 5010 5011 // If trip count is an estimated compile time constant, limit the 5012 // IC to be capped by the trip count divided by VF * 2, such that the vector 5013 // loop runs at least twice to make interleaving seem profitable when there 5014 // is an epilogue loop present. Since exact Trip count is not known we 5015 // choose to be conservative in our IC estimate. 5016 MaxInterleaveCount = bit_floor(std::max( 5017 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); 5018 } 5019 5020 assert(MaxInterleaveCount > 0 && 5021 "Maximum interleave count must be greater than 0"); 5022 5023 // Clamp the calculated IC to be between the 1 and the max interleave count 5024 // that the target and trip count allows. 5025 if (IC > MaxInterleaveCount) 5026 IC = MaxInterleaveCount; 5027 else 5028 // Make sure IC is greater than 0. 5029 IC = std::max(1u, IC); 5030 5031 assert(IC > 0 && "Interleave count must be greater than 0."); 5032 5033 // Interleave if we vectorized this loop and there is a reduction that could 5034 // benefit from interleaving. 5035 if (VF.isVector() && HasReductions) { 5036 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5037 return IC; 5038 } 5039 5040 // For any scalar loop that either requires runtime checks or predication we 5041 // are better off leaving this to the unroller. Note that if we've already 5042 // vectorized the loop we will have done the runtime check and so interleaving 5043 // won't require further checks. 5044 bool ScalarInterleavingRequiresPredication = 5045 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5046 return Legal->blockNeedsPredication(BB); 5047 })); 5048 bool ScalarInterleavingRequiresRuntimePointerCheck = 5049 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5050 5051 // We want to interleave small loops in order to reduce the loop overhead and 5052 // potentially expose ILP opportunities. 5053 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5054 << "LV: IC is " << IC << '\n' 5055 << "LV: VF is " << VF << '\n'); 5056 const bool AggressivelyInterleaveReductions = 5057 TTI.enableAggressiveInterleaving(HasReductions); 5058 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5059 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5060 // We assume that the cost overhead is 1 and we use the cost model 5061 // to estimate the cost of the loop and interleave until the cost of the 5062 // loop overhead is about 5% of the cost of the loop. 5063 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5064 SmallLoopCost / *LoopCost.getValue())); 5065 5066 // Interleave until store/load ports (estimated by max interleave count) are 5067 // saturated. 5068 unsigned NumStores = Legal->getNumStores(); 5069 unsigned NumLoads = Legal->getNumLoads(); 5070 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5071 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5072 5073 // There is little point in interleaving for reductions containing selects 5074 // and compares when VF=1 since it may just create more overhead than it's 5075 // worth for loops with small trip counts. This is because we still have to 5076 // do the final reduction after the loop. 5077 bool HasSelectCmpReductions = 5078 HasReductions && 5079 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5080 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5081 RecurKind RK = RdxDesc.getRecurrenceKind(); 5082 return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || 5083 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK); 5084 }); 5085 if (HasSelectCmpReductions) { 5086 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5087 return 1; 5088 } 5089 5090 // If we have a scalar reduction (vector reductions are already dealt with 5091 // by this point), we can increase the critical path length if the loop 5092 // we're interleaving is inside another loop. For tree-wise reductions 5093 // set the limit to 2, and for ordered reductions it's best to disable 5094 // interleaving entirely. 5095 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5096 bool HasOrderedReductions = 5097 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5098 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5099 return RdxDesc.isOrdered(); 5100 }); 5101 if (HasOrderedReductions) { 5102 LLVM_DEBUG( 5103 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5104 return 1; 5105 } 5106 5107 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5108 SmallIC = std::min(SmallIC, F); 5109 StoresIC = std::min(StoresIC, F); 5110 LoadsIC = std::min(LoadsIC, F); 5111 } 5112 5113 if (EnableLoadStoreRuntimeInterleave && 5114 std::max(StoresIC, LoadsIC) > SmallIC) { 5115 LLVM_DEBUG( 5116 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5117 return std::max(StoresIC, LoadsIC); 5118 } 5119 5120 // If there are scalar reductions and TTI has enabled aggressive 5121 // interleaving for reductions, we will interleave to expose ILP. 5122 if (VF.isScalar() && AggressivelyInterleaveReductions) { 5123 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5124 // Interleave no less than SmallIC but not as aggressive as the normal IC 5125 // to satisfy the rare situation when resources are too limited. 5126 return std::max(IC / 2, SmallIC); 5127 } 5128 5129 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5130 return SmallIC; 5131 } 5132 5133 // Interleave if this is a large loop (small loops are already dealt with by 5134 // this point) that could benefit from interleaving. 5135 if (AggressivelyInterleaveReductions) { 5136 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5137 return IC; 5138 } 5139 5140 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5141 return 1; 5142 } 5143 5144 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5145 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5146 // This function calculates the register usage by measuring the highest number 5147 // of values that are alive at a single location. Obviously, this is a very 5148 // rough estimation. We scan the loop in a topological order in order and 5149 // assign a number to each instruction. We use RPO to ensure that defs are 5150 // met before their users. We assume that each instruction that has in-loop 5151 // users starts an interval. We record every time that an in-loop value is 5152 // used, so we have a list of the first and last occurrences of each 5153 // instruction. Next, we transpose this data structure into a multi map that 5154 // holds the list of intervals that *end* at a specific location. This multi 5155 // map allows us to perform a linear search. We scan the instructions linearly 5156 // and record each time that a new interval starts, by placing it in a set. 5157 // If we find this value in the multi-map then we remove it from the set. 5158 // The max register usage is the maximum size of the set. 5159 // We also search for instructions that are defined outside the loop, but are 5160 // used inside the loop. We need this number separately from the max-interval 5161 // usage number because when we unroll, loop-invariant values do not take 5162 // more register. 5163 LoopBlocksDFS DFS(TheLoop); 5164 DFS.perform(LI); 5165 5166 RegisterUsage RU; 5167 5168 // Each 'key' in the map opens a new interval. The values 5169 // of the map are the index of the 'last seen' usage of the 5170 // instruction that is the key. 5171 using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>; 5172 5173 // Maps instruction to its index. 5174 SmallVector<Instruction *, 64> IdxToInstr; 5175 // Marks the end of each interval. 5176 IntervalMap EndPoint; 5177 // Saves the list of instruction indices that are used in the loop. 5178 SmallPtrSet<Instruction *, 8> Ends; 5179 // Saves the list of values that are used in the loop but are defined outside 5180 // the loop (not including non-instruction values such as arguments and 5181 // constants). 5182 SmallSetVector<Instruction *, 8> LoopInvariants; 5183 5184 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5185 for (Instruction &I : BB->instructionsWithoutDebug()) { 5186 IdxToInstr.push_back(&I); 5187 5188 // Save the end location of each USE. 5189 for (Value *U : I.operands()) { 5190 auto *Instr = dyn_cast<Instruction>(U); 5191 5192 // Ignore non-instruction values such as arguments, constants, etc. 5193 // FIXME: Might need some motivation why these values are ignored. If 5194 // for example an argument is used inside the loop it will increase the 5195 // register pressure (so shouldn't we add it to LoopInvariants). 5196 if (!Instr) 5197 continue; 5198 5199 // If this instruction is outside the loop then record it and continue. 5200 if (!TheLoop->contains(Instr)) { 5201 LoopInvariants.insert(Instr); 5202 continue; 5203 } 5204 5205 // Overwrite previous end points. 5206 EndPoint[Instr] = IdxToInstr.size(); 5207 Ends.insert(Instr); 5208 } 5209 } 5210 } 5211 5212 // Saves the list of intervals that end with the index in 'key'. 5213 using InstrList = SmallVector<Instruction *, 2>; 5214 SmallDenseMap<unsigned, InstrList, 16> TransposeEnds; 5215 5216 // Transpose the EndPoints to a list of values that end at each index. 5217 for (auto &Interval : EndPoint) 5218 TransposeEnds[Interval.second].push_back(Interval.first); 5219 5220 SmallPtrSet<Instruction *, 8> OpenIntervals; 5221 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5222 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5223 5224 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5225 5226 const auto &TTICapture = TTI; 5227 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5228 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) || 5229 (VF.isScalable() && 5230 !TTICapture.isElementTypeLegalForScalableVector(Ty))) 5231 return 0; 5232 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5233 }; 5234 5235 for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) { 5236 Instruction *I = IdxToInstr[Idx]; 5237 5238 // Remove all of the instructions that end at this location. 5239 InstrList &List = TransposeEnds[Idx]; 5240 for (Instruction *ToRemove : List) 5241 OpenIntervals.erase(ToRemove); 5242 5243 // Ignore instructions that are never used within the loop. 5244 if (!Ends.count(I)) 5245 continue; 5246 5247 // Skip ignored values. 5248 if (ValuesToIgnore.count(I)) 5249 continue; 5250 5251 collectInLoopReductions(); 5252 5253 // For each VF find the maximum usage of registers. 5254 for (unsigned J = 0, E = VFs.size(); J < E; ++J) { 5255 // Count the number of registers used, per register class, given all open 5256 // intervals. 5257 // Note that elements in this SmallMapVector will be default constructed 5258 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 5259 // there is no previous entry for ClassID. 5260 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5261 5262 if (VFs[J].isScalar()) { 5263 for (auto *Inst : OpenIntervals) { 5264 unsigned ClassID = 5265 TTI.getRegisterClassForType(false, Inst->getType()); 5266 // FIXME: The target might use more than one register for the type 5267 // even in the scalar case. 5268 RegUsage[ClassID] += 1; 5269 } 5270 } else { 5271 collectUniformsAndScalars(VFs[J]); 5272 for (auto *Inst : OpenIntervals) { 5273 // Skip ignored values for VF > 1. 5274 if (VecValuesToIgnore.count(Inst)) 5275 continue; 5276 if (isScalarAfterVectorization(Inst, VFs[J])) { 5277 unsigned ClassID = 5278 TTI.getRegisterClassForType(false, Inst->getType()); 5279 // FIXME: The target might use more than one register for the type 5280 // even in the scalar case. 5281 RegUsage[ClassID] += 1; 5282 } else { 5283 unsigned ClassID = 5284 TTI.getRegisterClassForType(true, Inst->getType()); 5285 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]); 5286 } 5287 } 5288 } 5289 5290 for (const auto &Pair : RegUsage) { 5291 auto &Entry = MaxUsages[J][Pair.first]; 5292 Entry = std::max(Entry, Pair.second); 5293 } 5294 } 5295 5296 LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " 5297 << OpenIntervals.size() << '\n'); 5298 5299 // Add the current instruction to the list of open intervals. 5300 OpenIntervals.insert(I); 5301 } 5302 5303 for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) { 5304 // Note that elements in this SmallMapVector will be default constructed 5305 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 5306 // there is no previous entry for ClassID. 5307 SmallMapVector<unsigned, unsigned, 4> Invariant; 5308 5309 for (auto *Inst : LoopInvariants) { 5310 // FIXME: The target might use more than one register for the type 5311 // even in the scalar case. 5312 bool IsScalar = all_of(Inst->users(), [&](User *U) { 5313 auto *I = cast<Instruction>(U); 5314 return TheLoop != LI->getLoopFor(I->getParent()) || 5315 isScalarAfterVectorization(I, VFs[Idx]); 5316 }); 5317 5318 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx]; 5319 unsigned ClassID = 5320 TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); 5321 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); 5322 } 5323 5324 LLVM_DEBUG({ 5325 dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n'; 5326 dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size() 5327 << " item\n"; 5328 for (const auto &pair : MaxUsages[Idx]) { 5329 dbgs() << "LV(REG): RegisterClass: " 5330 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5331 << " registers\n"; 5332 } 5333 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5334 << " item\n"; 5335 for (const auto &pair : Invariant) { 5336 dbgs() << "LV(REG): RegisterClass: " 5337 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5338 << " registers\n"; 5339 } 5340 }); 5341 5342 RU.LoopInvariantRegs = Invariant; 5343 RU.MaxLocalUsers = MaxUsages[Idx]; 5344 RUs[Idx] = RU; 5345 } 5346 5347 return RUs; 5348 } 5349 5350 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 5351 ElementCount VF) { 5352 // TODO: Cost model for emulated masked load/store is completely 5353 // broken. This hack guides the cost model to use an artificially 5354 // high enough value to practically disable vectorization with such 5355 // operations, except where previously deployed legality hack allowed 5356 // using very low cost values. This is to avoid regressions coming simply 5357 // from moving "masked load/store" check from legality to cost model. 5358 // Masked Load/Gather emulation was previously never allowed. 5359 // Limited number of Masked Store/Scatter emulation was allowed. 5360 assert((isPredicatedInst(I)) && 5361 "Expecting a scalar emulated instruction"); 5362 return isa<LoadInst>(I) || 5363 (isa<StoreInst>(I) && 5364 NumPredStores > NumberOfStoresToPredicate); 5365 } 5366 5367 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5368 // If we aren't vectorizing the loop, or if we've already collected the 5369 // instructions to scalarize, there's nothing to do. Collection may already 5370 // have occurred if we have a user-selected VF and are now computing the 5371 // expected cost for interleaving. 5372 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) 5373 return; 5374 5375 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5376 // not profitable to scalarize any instructions, the presence of VF in the 5377 // map will indicate that we've analyzed it already. 5378 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5379 5380 PredicatedBBsAfterVectorization[VF].clear(); 5381 5382 // Find all the instructions that are scalar with predication in the loop and 5383 // determine if it would be better to not if-convert the blocks they are in. 5384 // If so, we also record the instructions to scalarize. 5385 for (BasicBlock *BB : TheLoop->blocks()) { 5386 if (!blockNeedsPredicationForAnyReason(BB)) 5387 continue; 5388 for (Instruction &I : *BB) 5389 if (isScalarWithPredication(&I, VF)) { 5390 ScalarCostsTy ScalarCosts; 5391 // Do not apply discount logic for: 5392 // 1. Scalars after vectorization, as there will only be a single copy 5393 // of the instruction. 5394 // 2. Scalable VF, as that would lead to invalid scalarization costs. 5395 // 3. Emulated masked memrefs, if a hacked cost is needed. 5396 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() && 5397 !useEmulatedMaskMemRefHack(&I, VF) && 5398 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) { 5399 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5400 // Check if we decided to scalarize a call. If so, update the widening 5401 // decision of the call to CM_Scalarize with the computed scalar cost. 5402 for (const auto &[I, _] : ScalarCosts) { 5403 auto *CI = dyn_cast<CallInst>(I); 5404 if (!CI || !CallWideningDecisions.contains({CI, VF})) 5405 continue; 5406 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize; 5407 CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI]; 5408 } 5409 } 5410 // Remember that BB will remain after vectorization. 5411 PredicatedBBsAfterVectorization[VF].insert(BB); 5412 for (auto *Pred : predecessors(BB)) { 5413 if (Pred->getSingleSuccessor() == BB) 5414 PredicatedBBsAfterVectorization[VF].insert(Pred); 5415 } 5416 } 5417 } 5418 } 5419 5420 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 5421 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 5422 assert(!isUniformAfterVectorization(PredInst, VF) && 5423 "Instruction marked uniform-after-vectorization will be predicated"); 5424 5425 // Initialize the discount to zero, meaning that the scalar version and the 5426 // vector version cost the same. 5427 InstructionCost Discount = 0; 5428 5429 // Holds instructions to analyze. The instructions we visit are mapped in 5430 // ScalarCosts. Those instructions are the ones that would be scalarized if 5431 // we find that the scalar version costs less. 5432 SmallVector<Instruction *, 8> Worklist; 5433 5434 // Returns true if the given instruction can be scalarized. 5435 auto CanBeScalarized = [&](Instruction *I) -> bool { 5436 // We only attempt to scalarize instructions forming a single-use chain 5437 // from the original predicated block that would otherwise be vectorized. 5438 // Although not strictly necessary, we give up on instructions we know will 5439 // already be scalar to avoid traversing chains that are unlikely to be 5440 // beneficial. 5441 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5442 isScalarAfterVectorization(I, VF)) 5443 return false; 5444 5445 // If the instruction is scalar with predication, it will be analyzed 5446 // separately. We ignore it within the context of PredInst. 5447 if (isScalarWithPredication(I, VF)) 5448 return false; 5449 5450 // If any of the instruction's operands are uniform after vectorization, 5451 // the instruction cannot be scalarized. This prevents, for example, a 5452 // masked load from being scalarized. 5453 // 5454 // We assume we will only emit a value for lane zero of an instruction 5455 // marked uniform after vectorization, rather than VF identical values. 5456 // Thus, if we scalarize an instruction that uses a uniform, we would 5457 // create uses of values corresponding to the lanes we aren't emitting code 5458 // for. This behavior can be changed by allowing getScalarValue to clone 5459 // the lane zero values for uniforms rather than asserting. 5460 for (Use &U : I->operands()) 5461 if (auto *J = dyn_cast<Instruction>(U.get())) 5462 if (isUniformAfterVectorization(J, VF)) 5463 return false; 5464 5465 // Otherwise, we can scalarize the instruction. 5466 return true; 5467 }; 5468 5469 // Compute the expected cost discount from scalarizing the entire expression 5470 // feeding the predicated instruction. We currently only consider expressions 5471 // that are single-use instruction chains. 5472 Worklist.push_back(PredInst); 5473 while (!Worklist.empty()) { 5474 Instruction *I = Worklist.pop_back_val(); 5475 5476 // If we've already analyzed the instruction, there's nothing to do. 5477 if (ScalarCosts.contains(I)) 5478 continue; 5479 5480 // Compute the cost of the vector instruction. Note that this cost already 5481 // includes the scalarization overhead of the predicated instruction. 5482 InstructionCost VectorCost = getInstructionCost(I, VF); 5483 5484 // Compute the cost of the scalarized instruction. This cost is the cost of 5485 // the instruction as if it wasn't if-converted and instead remained in the 5486 // predicated block. We will scale this cost by block probability after 5487 // computing the scalarization overhead. 5488 InstructionCost ScalarCost = 5489 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1)); 5490 5491 // Compute the scalarization overhead of needed insertelement instructions 5492 // and phi nodes. 5493 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 5494 ScalarCost += TTI.getScalarizationOverhead( 5495 cast<VectorType>(toVectorTy(I->getType(), VF)), 5496 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 5497 /*Extract*/ false, CostKind); 5498 ScalarCost += 5499 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 5500 } 5501 5502 // Compute the scalarization overhead of needed extractelement 5503 // instructions. For each of the instruction's operands, if the operand can 5504 // be scalarized, add it to the worklist; otherwise, account for the 5505 // overhead. 5506 for (Use &U : I->operands()) 5507 if (auto *J = dyn_cast<Instruction>(U.get())) { 5508 assert(VectorType::isValidElementType(J->getType()) && 5509 "Instruction has non-scalar type"); 5510 if (CanBeScalarized(J)) 5511 Worklist.push_back(J); 5512 else if (needsExtract(J, VF)) { 5513 ScalarCost += TTI.getScalarizationOverhead( 5514 cast<VectorType>(toVectorTy(J->getType(), VF)), 5515 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 5516 /*Extract*/ true, CostKind); 5517 } 5518 } 5519 5520 // Scale the total scalar cost by block probability. 5521 ScalarCost /= getReciprocalPredBlockProb(); 5522 5523 // Compute the discount. A non-negative discount means the vector version 5524 // of the instruction costs more, and scalarizing would be beneficial. 5525 Discount += VectorCost - ScalarCost; 5526 ScalarCosts[I] = ScalarCost; 5527 } 5528 5529 return Discount; 5530 } 5531 5532 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { 5533 InstructionCost Cost; 5534 5535 // If the vector loop gets executed exactly once with the given VF, ignore the 5536 // costs of comparison and induction instructions, as they'll get simplified 5537 // away. 5538 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF; 5539 auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5540 if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking()) 5541 addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(), 5542 ValuesToIgnoreForVF); 5543 5544 // For each block. 5545 for (BasicBlock *BB : TheLoop->blocks()) { 5546 InstructionCost BlockCost; 5547 5548 // For each instruction in the old loop. 5549 for (Instruction &I : BB->instructionsWithoutDebug()) { 5550 // Skip ignored values. 5551 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) || 5552 (VF.isVector() && VecValuesToIgnore.count(&I))) 5553 continue; 5554 5555 InstructionCost C = getInstructionCost(&I, VF); 5556 5557 // Check if we should override the cost. 5558 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) 5559 C = InstructionCost(ForceTargetInstructionCost); 5560 5561 BlockCost += C; 5562 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " 5563 << VF << " For instruction: " << I << '\n'); 5564 } 5565 5566 // If we are vectorizing a predicated block, it will have been 5567 // if-converted. This means that the block's instructions (aside from 5568 // stores and instructions that may divide by zero) will now be 5569 // unconditionally executed. For the scalar case, we may not always execute 5570 // the predicated block, if it is an if-else block. Thus, scale the block's 5571 // cost by the probability of executing it. blockNeedsPredication from 5572 // Legal is used so as to not include all blocks in tail folded loops. 5573 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 5574 BlockCost /= getReciprocalPredBlockProb(); 5575 5576 Cost += BlockCost; 5577 } 5578 5579 return Cost; 5580 } 5581 5582 /// Gets Address Access SCEV after verifying that the access pattern 5583 /// is loop invariant except the induction variable dependence. 5584 /// 5585 /// This SCEV can be sent to the Target in order to estimate the address 5586 /// calculation cost. 5587 static const SCEV *getAddressAccessSCEV( 5588 Value *Ptr, 5589 LoopVectorizationLegality *Legal, 5590 PredicatedScalarEvolution &PSE, 5591 const Loop *TheLoop) { 5592 5593 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5594 if (!Gep) 5595 return nullptr; 5596 5597 // We are looking for a gep with all loop invariant indices except for one 5598 // which should be an induction variable. 5599 auto *SE = PSE.getSE(); 5600 unsigned NumOperands = Gep->getNumOperands(); 5601 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) { 5602 Value *Opd = Gep->getOperand(Idx); 5603 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5604 !Legal->isInductionVariable(Opd)) 5605 return nullptr; 5606 } 5607 5608 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5609 return PSE.getSCEV(Ptr); 5610 } 5611 5612 InstructionCost 5613 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5614 ElementCount VF) { 5615 assert(VF.isVector() && 5616 "Scalarization cost of instruction implies vectorization."); 5617 if (VF.isScalable()) 5618 return InstructionCost::getInvalid(); 5619 5620 Type *ValTy = getLoadStoreType(I); 5621 auto *SE = PSE.getSE(); 5622 5623 unsigned AS = getLoadStoreAddressSpace(I); 5624 Value *Ptr = getLoadStorePointerOperand(I); 5625 Type *PtrTy = toVectorTy(Ptr->getType(), VF); 5626 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 5627 // that it is being called from this specific place. 5628 5629 // Figure out whether the access is strided and get the stride value 5630 // if it's known in compile time 5631 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5632 5633 // Get the cost of the scalar memory instruction and address computation. 5634 InstructionCost Cost = 5635 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5636 5637 // Don't pass *I here, since it is scalar but will actually be part of a 5638 // vectorized loop where the user of it is a vectorized instruction. 5639 const Align Alignment = getLoadStoreAlignment(I); 5640 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 5641 ValTy->getScalarType(), 5642 Alignment, AS, CostKind); 5643 5644 // Get the overhead of the extractelement and insertelement instructions 5645 // we might create due to scalarization. 5646 Cost += getScalarizationOverhead(I, VF); 5647 5648 // If we have a predicated load/store, it will need extra i1 extracts and 5649 // conditional branches, but may not be executed for each vector lane. Scale 5650 // the cost by the probability of executing the predicated block. 5651 if (isPredicatedInst(I)) { 5652 Cost /= getReciprocalPredBlockProb(); 5653 5654 // Add the cost of an i1 extract and a branch 5655 auto *VecI1Ty = 5656 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 5657 Cost += TTI.getScalarizationOverhead( 5658 VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 5659 /*Insert=*/false, /*Extract=*/true, CostKind); 5660 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 5661 5662 if (useEmulatedMaskMemRefHack(I, VF)) 5663 // Artificially setting to a high enough value to practically disable 5664 // vectorization with such operations. 5665 Cost = 3000000; 5666 } 5667 5668 return Cost; 5669 } 5670 5671 InstructionCost 5672 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5673 ElementCount VF) { 5674 Type *ValTy = getLoadStoreType(I); 5675 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5676 Value *Ptr = getLoadStorePointerOperand(I); 5677 unsigned AS = getLoadStoreAddressSpace(I); 5678 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 5679 5680 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5681 "Stride should be 1 or -1 for consecutive memory access"); 5682 const Align Alignment = getLoadStoreAlignment(I); 5683 InstructionCost Cost = 0; 5684 if (Legal->isMaskRequired(I)) { 5685 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5686 CostKind); 5687 } else { 5688 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 5689 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5690 CostKind, OpInfo, I); 5691 } 5692 5693 bool Reverse = ConsecutiveStride < 0; 5694 if (Reverse) 5695 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {}, 5696 CostKind, 0); 5697 return Cost; 5698 } 5699 5700 InstructionCost 5701 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5702 ElementCount VF) { 5703 assert(Legal->isUniformMemOp(*I, VF)); 5704 5705 Type *ValTy = getLoadStoreType(I); 5706 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5707 const Align Alignment = getLoadStoreAlignment(I); 5708 unsigned AS = getLoadStoreAddressSpace(I); 5709 if (isa<LoadInst>(I)) { 5710 return TTI.getAddressComputationCost(ValTy) + 5711 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 5712 CostKind) + 5713 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {}, 5714 CostKind); 5715 } 5716 StoreInst *SI = cast<StoreInst>(I); 5717 5718 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 5719 return TTI.getAddressComputationCost(ValTy) + 5720 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 5721 CostKind) + 5722 (IsLoopInvariantStoreValue 5723 ? 0 5724 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5725 CostKind, VF.getKnownMinValue() - 1)); 5726 } 5727 5728 InstructionCost 5729 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5730 ElementCount VF) { 5731 Type *ValTy = getLoadStoreType(I); 5732 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5733 const Align Alignment = getLoadStoreAlignment(I); 5734 const Value *Ptr = getLoadStorePointerOperand(I); 5735 5736 return TTI.getAddressComputationCost(VectorTy) + 5737 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5738 Legal->isMaskRequired(I), Alignment, 5739 CostKind, I); 5740 } 5741 5742 InstructionCost 5743 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5744 ElementCount VF) { 5745 const auto *Group = getInterleavedAccessGroup(I); 5746 assert(Group && "Fail to get an interleaved access group."); 5747 5748 Instruction *InsertPos = Group->getInsertPos(); 5749 Type *ValTy = getLoadStoreType(InsertPos); 5750 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); 5751 unsigned AS = getLoadStoreAddressSpace(InsertPos); 5752 5753 unsigned InterleaveFactor = Group->getFactor(); 5754 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5755 5756 // Holds the indices of existing members in the interleaved group. 5757 SmallVector<unsigned, 4> Indices; 5758 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 5759 if (Group->getMember(IF)) 5760 Indices.push_back(IF); 5761 5762 // Calculate the cost of the whole interleaved group. 5763 bool UseMaskForGaps = 5764 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 5765 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 5766 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 5767 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5768 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I), 5769 UseMaskForGaps); 5770 5771 if (Group->isReverse()) { 5772 // TODO: Add support for reversed masked interleaved access. 5773 assert(!Legal->isMaskRequired(I) && 5774 "Reverse masked interleaved access not supported."); 5775 Cost += Group->getNumMembers() * 5776 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, {}, 5777 CostKind, 0); 5778 } 5779 return Cost; 5780 } 5781 5782 std::optional<InstructionCost> 5783 LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, 5784 ElementCount VF, 5785 Type *Ty) const { 5786 using namespace llvm::PatternMatch; 5787 // Early exit for no inloop reductions 5788 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 5789 return std::nullopt; 5790 auto *VectorTy = cast<VectorType>(Ty); 5791 5792 // We are looking for a pattern of, and finding the minimal acceptable cost: 5793 // reduce(mul(ext(A), ext(B))) or 5794 // reduce(mul(A, B)) or 5795 // reduce(ext(A)) or 5796 // reduce(A). 5797 // The basic idea is that we walk down the tree to do that, finding the root 5798 // reduction instruction in InLoopReductionImmediateChains. From there we find 5799 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 5800 // of the components. If the reduction cost is lower then we return it for the 5801 // reduction instruction and 0 for the other instructions in the pattern. If 5802 // it is not we return an invalid cost specifying the orignal cost method 5803 // should be used. 5804 Instruction *RetI = I; 5805 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 5806 if (!RetI->hasOneUser()) 5807 return std::nullopt; 5808 RetI = RetI->user_back(); 5809 } 5810 5811 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 5812 RetI->user_back()->getOpcode() == Instruction::Add) { 5813 RetI = RetI->user_back(); 5814 } 5815 5816 // Test if the found instruction is a reduction, and if not return an invalid 5817 // cost specifying the parent to use the original cost modelling. 5818 if (!InLoopReductionImmediateChains.count(RetI)) 5819 return std::nullopt; 5820 5821 // Find the reduction this chain is a part of and calculate the basic cost of 5822 // the reduction on its own. 5823 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); 5824 Instruction *ReductionPhi = LastChain; 5825 while (!isa<PHINode>(ReductionPhi)) 5826 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); 5827 5828 const RecurrenceDescriptor &RdxDesc = 5829 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 5830 5831 InstructionCost BaseCost; 5832 RecurKind RK = RdxDesc.getRecurrenceKind(); 5833 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 5834 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK); 5835 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy, 5836 RdxDesc.getFastMathFlags(), CostKind); 5837 } else { 5838 BaseCost = TTI.getArithmeticReductionCost( 5839 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 5840 } 5841 5842 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 5843 // normal fmul instruction to the cost of the fadd reduction. 5844 if (RK == RecurKind::FMulAdd) 5845 BaseCost += 5846 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 5847 5848 // If we're using ordered reductions then we can just return the base cost 5849 // here, since getArithmeticReductionCost calculates the full ordered 5850 // reduction cost when FP reassociation is not allowed. 5851 if (useOrderedReductions(RdxDesc)) 5852 return BaseCost; 5853 5854 // Get the operand that was not the reduction chain and match it to one of the 5855 // patterns, returning the better cost if it is found. 5856 Instruction *RedOp = RetI->getOperand(1) == LastChain 5857 ? dyn_cast<Instruction>(RetI->getOperand(0)) 5858 : dyn_cast<Instruction>(RetI->getOperand(1)); 5859 5860 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 5861 5862 Instruction *Op0, *Op1; 5863 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 5864 match(RedOp, 5865 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 5866 match(Op0, m_ZExtOrSExt(m_Value())) && 5867 Op0->getOpcode() == Op1->getOpcode() && 5868 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 5869 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 5870 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 5871 5872 // Matched reduce.add(ext(mul(ext(A), ext(B))) 5873 // Note that the extend opcodes need to all match, or if A==B they will have 5874 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 5875 // which is equally fine. 5876 bool IsUnsigned = isa<ZExtInst>(Op0); 5877 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 5878 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 5879 5880 InstructionCost ExtCost = 5881 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 5882 TTI::CastContextHint::None, CostKind, Op0); 5883 InstructionCost MulCost = 5884 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 5885 InstructionCost Ext2Cost = 5886 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 5887 TTI::CastContextHint::None, CostKind, RedOp); 5888 5889 InstructionCost RedCost = TTI.getMulAccReductionCost( 5890 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 5891 5892 if (RedCost.isValid() && 5893 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 5894 return I == RetI ? RedCost : 0; 5895 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 5896 !TheLoop->isLoopInvariant(RedOp)) { 5897 // Matched reduce(ext(A)) 5898 bool IsUnsigned = isa<ZExtInst>(RedOp); 5899 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 5900 InstructionCost RedCost = TTI.getExtendedReductionCost( 5901 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 5902 RdxDesc.getFastMathFlags(), CostKind); 5903 5904 InstructionCost ExtCost = 5905 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 5906 TTI::CastContextHint::None, CostKind, RedOp); 5907 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 5908 return I == RetI ? RedCost : 0; 5909 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 5910 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 5911 if (match(Op0, m_ZExtOrSExt(m_Value())) && 5912 Op0->getOpcode() == Op1->getOpcode() && 5913 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 5914 bool IsUnsigned = isa<ZExtInst>(Op0); 5915 Type *Op0Ty = Op0->getOperand(0)->getType(); 5916 Type *Op1Ty = Op1->getOperand(0)->getType(); 5917 Type *LargestOpTy = 5918 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 5919 : Op0Ty; 5920 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 5921 5922 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 5923 // different sizes. We take the largest type as the ext to reduce, and add 5924 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 5925 InstructionCost ExtCost0 = TTI.getCastInstrCost( 5926 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 5927 TTI::CastContextHint::None, CostKind, Op0); 5928 InstructionCost ExtCost1 = TTI.getCastInstrCost( 5929 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 5930 TTI::CastContextHint::None, CostKind, Op1); 5931 InstructionCost MulCost = 5932 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 5933 5934 InstructionCost RedCost = TTI.getMulAccReductionCost( 5935 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 5936 InstructionCost ExtraExtCost = 0; 5937 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 5938 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 5939 ExtraExtCost = TTI.getCastInstrCost( 5940 ExtraExtOp->getOpcode(), ExtType, 5941 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 5942 TTI::CastContextHint::None, CostKind, ExtraExtOp); 5943 } 5944 5945 if (RedCost.isValid() && 5946 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 5947 return I == RetI ? RedCost : 0; 5948 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 5949 // Matched reduce.add(mul()) 5950 InstructionCost MulCost = 5951 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 5952 5953 InstructionCost RedCost = TTI.getMulAccReductionCost( 5954 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 5955 5956 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 5957 return I == RetI ? RedCost : 0; 5958 } 5959 } 5960 5961 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 5962 } 5963 5964 InstructionCost 5965 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5966 ElementCount VF) { 5967 // Calculate scalar cost only. Vectorization cost should be ready at this 5968 // moment. 5969 if (VF.isScalar()) { 5970 Type *ValTy = getLoadStoreType(I); 5971 const Align Alignment = getLoadStoreAlignment(I); 5972 unsigned AS = getLoadStoreAddressSpace(I); 5973 5974 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 5975 return TTI.getAddressComputationCost(ValTy) + 5976 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind, 5977 OpInfo, I); 5978 } 5979 return getWideningCost(I, VF); 5980 } 5981 5982 InstructionCost 5983 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5984 ElementCount VF) const { 5985 5986 // There is no mechanism yet to create a scalable scalarization loop, 5987 // so this is currently Invalid. 5988 if (VF.isScalable()) 5989 return InstructionCost::getInvalid(); 5990 5991 if (VF.isScalar()) 5992 return 0; 5993 5994 InstructionCost Cost = 0; 5995 Type *RetTy = toVectorTy(I->getType(), VF); 5996 if (!RetTy->isVoidTy() && 5997 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5998 Cost += TTI.getScalarizationOverhead( 5999 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), 6000 /*Insert*/ true, 6001 /*Extract*/ false, CostKind); 6002 6003 // Some targets keep addresses scalar. 6004 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6005 return Cost; 6006 6007 // Some targets support efficient element stores. 6008 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6009 return Cost; 6010 6011 // Collect operands to consider. 6012 CallInst *CI = dyn_cast<CallInst>(I); 6013 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6014 6015 // Skip operands that do not require extraction/scalarization and do not incur 6016 // any overhead. 6017 SmallVector<Type *> Tys; 6018 for (auto *V : filterExtractingOperands(Ops, VF)) 6019 Tys.push_back(maybeVectorizeType(V->getType(), VF)); 6020 return Cost + TTI.getOperandsScalarizationOverhead( 6021 filterExtractingOperands(Ops, VF), Tys, CostKind); 6022 } 6023 6024 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6025 if (VF.isScalar()) 6026 return; 6027 NumPredStores = 0; 6028 for (BasicBlock *BB : TheLoop->blocks()) { 6029 // For each instruction in the old loop. 6030 for (Instruction &I : *BB) { 6031 Value *Ptr = getLoadStorePointerOperand(&I); 6032 if (!Ptr) 6033 continue; 6034 6035 // TODO: We should generate better code and update the cost model for 6036 // predicated uniform stores. Today they are treated as any other 6037 // predicated store (see added test cases in 6038 // invariant-store-vectorization.ll). 6039 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6040 NumPredStores++; 6041 6042 if (Legal->isUniformMemOp(I, VF)) { 6043 auto IsLegalToScalarize = [&]() { 6044 if (!VF.isScalable()) 6045 // Scalarization of fixed length vectors "just works". 6046 return true; 6047 6048 // We have dedicated lowering for unpredicated uniform loads and 6049 // stores. Note that even with tail folding we know that at least 6050 // one lane is active (i.e. generalized predication is not possible 6051 // here), and the logic below depends on this fact. 6052 if (!foldTailByMasking()) 6053 return true; 6054 6055 // For scalable vectors, a uniform memop load is always 6056 // uniform-by-parts and we know how to scalarize that. 6057 if (isa<LoadInst>(I)) 6058 return true; 6059 6060 // A uniform store isn't neccessarily uniform-by-part 6061 // and we can't assume scalarization. 6062 auto &SI = cast<StoreInst>(I); 6063 return TheLoop->isLoopInvariant(SI.getValueOperand()); 6064 }; 6065 6066 const InstructionCost GatherScatterCost = 6067 isLegalGatherOrScatter(&I, VF) ? 6068 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); 6069 6070 // Load: Scalar load + broadcast 6071 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6072 // FIXME: This cost is a significant under-estimate for tail folded 6073 // memory ops. 6074 const InstructionCost ScalarizationCost = 6075 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF) 6076 : InstructionCost::getInvalid(); 6077 6078 // Choose better solution for the current VF, Note that Invalid 6079 // costs compare as maximumal large. If both are invalid, we get 6080 // scalable invalid which signals a failure and a vectorization abort. 6081 if (GatherScatterCost < ScalarizationCost) 6082 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 6083 else 6084 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 6085 continue; 6086 } 6087 6088 // We assume that widening is the best solution when possible. 6089 if (memoryInstructionCanBeWidened(&I, VF)) { 6090 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6091 int ConsecutiveStride = Legal->isConsecutivePtr( 6092 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6093 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6094 "Expected consecutive stride."); 6095 InstWidening Decision = 6096 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6097 setWideningDecision(&I, VF, Decision, Cost); 6098 continue; 6099 } 6100 6101 // Choose between Interleaving, Gather/Scatter or Scalarization. 6102 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6103 unsigned NumAccesses = 1; 6104 if (isAccessInterleaved(&I)) { 6105 const auto *Group = getInterleavedAccessGroup(&I); 6106 assert(Group && "Fail to get an interleaved access group."); 6107 6108 // Make one decision for the whole group. 6109 if (getWideningDecision(&I, VF) != CM_Unknown) 6110 continue; 6111 6112 NumAccesses = Group->getNumMembers(); 6113 if (interleavedAccessCanBeWidened(&I, VF)) 6114 InterleaveCost = getInterleaveGroupCost(&I, VF); 6115 } 6116 6117 InstructionCost GatherScatterCost = 6118 isLegalGatherOrScatter(&I, VF) 6119 ? getGatherScatterCost(&I, VF) * NumAccesses 6120 : InstructionCost::getInvalid(); 6121 6122 InstructionCost ScalarizationCost = 6123 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6124 6125 // Choose better solution for the current VF, 6126 // write down this decision and use it during vectorization. 6127 InstructionCost Cost; 6128 InstWidening Decision; 6129 if (InterleaveCost <= GatherScatterCost && 6130 InterleaveCost < ScalarizationCost) { 6131 Decision = CM_Interleave; 6132 Cost = InterleaveCost; 6133 } else if (GatherScatterCost < ScalarizationCost) { 6134 Decision = CM_GatherScatter; 6135 Cost = GatherScatterCost; 6136 } else { 6137 Decision = CM_Scalarize; 6138 Cost = ScalarizationCost; 6139 } 6140 // If the instructions belongs to an interleave group, the whole group 6141 // receives the same decision. The whole group receives the cost, but 6142 // the cost will actually be assigned to one instruction. 6143 if (const auto *Group = getInterleavedAccessGroup(&I)) 6144 setWideningDecision(Group, VF, Decision, Cost); 6145 else 6146 setWideningDecision(&I, VF, Decision, Cost); 6147 } 6148 } 6149 6150 // Make sure that any load of address and any other address computation 6151 // remains scalar unless there is gather/scatter support. This avoids 6152 // inevitable extracts into address registers, and also has the benefit of 6153 // activating LSR more, since that pass can't optimize vectorized 6154 // addresses. 6155 if (TTI.prefersVectorizedAddressing()) 6156 return; 6157 6158 // Start with all scalar pointer uses. 6159 SmallPtrSet<Instruction *, 8> AddrDefs; 6160 for (BasicBlock *BB : TheLoop->blocks()) 6161 for (Instruction &I : *BB) { 6162 Instruction *PtrDef = 6163 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6164 if (PtrDef && TheLoop->contains(PtrDef) && 6165 getWideningDecision(&I, VF) != CM_GatherScatter) 6166 AddrDefs.insert(PtrDef); 6167 } 6168 6169 // Add all instructions used to generate the addresses. 6170 SmallVector<Instruction *, 4> Worklist; 6171 append_range(Worklist, AddrDefs); 6172 while (!Worklist.empty()) { 6173 Instruction *I = Worklist.pop_back_val(); 6174 for (auto &Op : I->operands()) 6175 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6176 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6177 AddrDefs.insert(InstOp).second) 6178 Worklist.push_back(InstOp); 6179 } 6180 6181 for (auto *I : AddrDefs) { 6182 if (isa<LoadInst>(I)) { 6183 // Setting the desired widening decision should ideally be handled in 6184 // by cost functions, but since this involves the task of finding out 6185 // if the loaded register is involved in an address computation, it is 6186 // instead changed here when we know this is the case. 6187 InstWidening Decision = getWideningDecision(I, VF); 6188 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6189 // Scalarize a widened load of address. 6190 setWideningDecision( 6191 I, VF, CM_Scalarize, 6192 (VF.getKnownMinValue() * 6193 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6194 else if (const auto *Group = getInterleavedAccessGroup(I)) { 6195 // Scalarize an interleave group of address loads. 6196 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6197 if (Instruction *Member = Group->getMember(I)) 6198 setWideningDecision( 6199 Member, VF, CM_Scalarize, 6200 (VF.getKnownMinValue() * 6201 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6202 } 6203 } 6204 } else 6205 // Make sure I gets scalarized and a cost estimate without 6206 // scalarization overhead. 6207 ForcedScalars[VF].insert(I); 6208 } 6209 } 6210 6211 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { 6212 assert(!VF.isScalar() && 6213 "Trying to set a vectorization decision for a scalar VF"); 6214 6215 auto ForcedScalar = ForcedScalars.find(VF); 6216 for (BasicBlock *BB : TheLoop->blocks()) { 6217 // For each instruction in the old loop. 6218 for (Instruction &I : *BB) { 6219 CallInst *CI = dyn_cast<CallInst>(&I); 6220 6221 if (!CI) 6222 continue; 6223 6224 InstructionCost ScalarCost = InstructionCost::getInvalid(); 6225 InstructionCost VectorCost = InstructionCost::getInvalid(); 6226 InstructionCost IntrinsicCost = InstructionCost::getInvalid(); 6227 Function *ScalarFunc = CI->getCalledFunction(); 6228 Type *ScalarRetTy = CI->getType(); 6229 SmallVector<Type *, 4> Tys, ScalarTys; 6230 for (auto &ArgOp : CI->args()) 6231 ScalarTys.push_back(ArgOp->getType()); 6232 6233 // Estimate cost of scalarized vector call. The source operands are 6234 // assumed to be vectors, so we need to extract individual elements from 6235 // there, execute VF scalar calls, and then gather the result into the 6236 // vector return value. 6237 InstructionCost ScalarCallCost = 6238 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); 6239 6240 // Compute costs of unpacking argument values for the scalar calls and 6241 // packing the return values to a vector. 6242 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 6243 6244 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 6245 // Honor ForcedScalars and UniformAfterVectorization decisions. 6246 // TODO: For calls, it might still be more profitable to widen. Use 6247 // VPlan-based cost model to compare different options. 6248 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() && 6249 ForcedScalar->second.contains(CI)) || 6250 isUniformAfterVectorization(CI, VF))) { 6251 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr, 6252 Intrinsic::not_intrinsic, std::nullopt, 6253 ScalarCost); 6254 continue; 6255 } 6256 6257 bool MaskRequired = Legal->isMaskRequired(CI); 6258 // Compute corresponding vector type for return value and arguments. 6259 Type *RetTy = toVectorTy(ScalarRetTy, VF); 6260 for (Type *ScalarTy : ScalarTys) 6261 Tys.push_back(toVectorTy(ScalarTy, VF)); 6262 6263 // An in-loop reduction using an fmuladd intrinsic is a special case; 6264 // we don't want the normal cost for that intrinsic. 6265 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 6266 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) { 6267 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, 6268 getVectorIntrinsicIDForCall(CI, TLI), 6269 std::nullopt, *RedCost); 6270 continue; 6271 } 6272 6273 // Find the cost of vectorizing the call, if we can find a suitable 6274 // vector variant of the function. 6275 bool UsesMask = false; 6276 VFInfo FuncInfo; 6277 Function *VecFunc = nullptr; 6278 // Search through any available variants for one we can use at this VF. 6279 for (VFInfo &Info : VFDatabase::getMappings(*CI)) { 6280 // Must match requested VF. 6281 if (Info.Shape.VF != VF) 6282 continue; 6283 6284 // Must take a mask argument if one is required 6285 if (MaskRequired && !Info.isMasked()) 6286 continue; 6287 6288 // Check that all parameter kinds are supported 6289 bool ParamsOk = true; 6290 for (VFParameter Param : Info.Shape.Parameters) { 6291 switch (Param.ParamKind) { 6292 case VFParamKind::Vector: 6293 break; 6294 case VFParamKind::OMP_Uniform: { 6295 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6296 // Make sure the scalar parameter in the loop is invariant. 6297 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), 6298 TheLoop)) 6299 ParamsOk = false; 6300 break; 6301 } 6302 case VFParamKind::OMP_Linear: { 6303 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6304 // Find the stride for the scalar parameter in this loop and see if 6305 // it matches the stride for the variant. 6306 // TODO: do we need to figure out the cost of an extract to get the 6307 // first lane? Or do we hope that it will be folded away? 6308 ScalarEvolution *SE = PSE.getSE(); 6309 const auto *SAR = 6310 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); 6311 6312 if (!SAR || SAR->getLoop() != TheLoop) { 6313 ParamsOk = false; 6314 break; 6315 } 6316 6317 const SCEVConstant *Step = 6318 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); 6319 6320 if (!Step || 6321 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) 6322 ParamsOk = false; 6323 6324 break; 6325 } 6326 case VFParamKind::GlobalPredicate: 6327 UsesMask = true; 6328 break; 6329 default: 6330 ParamsOk = false; 6331 break; 6332 } 6333 } 6334 6335 if (!ParamsOk) 6336 continue; 6337 6338 // Found a suitable candidate, stop here. 6339 VecFunc = CI->getModule()->getFunction(Info.VectorName); 6340 FuncInfo = Info; 6341 break; 6342 } 6343 6344 // Add in the cost of synthesizing a mask if one wasn't required. 6345 InstructionCost MaskCost = 0; 6346 if (VecFunc && UsesMask && !MaskRequired) 6347 MaskCost = TTI.getShuffleCost( 6348 TargetTransformInfo::SK_Broadcast, 6349 VectorType::get(IntegerType::getInt1Ty( 6350 VecFunc->getFunctionType()->getContext()), 6351 VF), 6352 {}, CostKind); 6353 6354 if (TLI && VecFunc && !CI->isNoBuiltin()) 6355 VectorCost = 6356 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 6357 6358 // Find the cost of an intrinsic; some targets may have instructions that 6359 // perform the operation without needing an actual call. 6360 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); 6361 if (IID != Intrinsic::not_intrinsic) 6362 IntrinsicCost = getVectorIntrinsicCost(CI, VF); 6363 6364 InstructionCost Cost = ScalarCost; 6365 InstWidening Decision = CM_Scalarize; 6366 6367 if (VectorCost <= Cost) { 6368 Cost = VectorCost; 6369 Decision = CM_VectorCall; 6370 } 6371 6372 if (IntrinsicCost <= Cost) { 6373 Cost = IntrinsicCost; 6374 Decision = CM_IntrinsicCall; 6375 } 6376 6377 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, 6378 FuncInfo.getParamIndexForOptionalMask(), Cost); 6379 } 6380 } 6381 } 6382 6383 bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) { 6384 if (!Legal->isInvariant(Op)) 6385 return false; 6386 // Consider Op invariant, if it or its operands aren't predicated 6387 // instruction in the loop. In that case, it is not trivially hoistable. 6388 auto *OpI = dyn_cast<Instruction>(Op); 6389 return !OpI || !TheLoop->contains(OpI) || 6390 (!isPredicatedInst(OpI) && 6391 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) && 6392 all_of(OpI->operands(), 6393 [this](Value *Op) { return shouldConsiderInvariant(Op); })); 6394 } 6395 6396 InstructionCost 6397 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6398 ElementCount VF) { 6399 // If we know that this instruction will remain uniform, check the cost of 6400 // the scalar version. 6401 if (isUniformAfterVectorization(I, VF)) 6402 VF = ElementCount::getFixed(1); 6403 6404 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6405 return InstsToScalarize[VF][I]; 6406 6407 // Forced scalars do not have any scalarization overhead. 6408 auto ForcedScalar = ForcedScalars.find(VF); 6409 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6410 auto InstSet = ForcedScalar->second; 6411 if (InstSet.count(I)) 6412 return getInstructionCost(I, ElementCount::getFixed(1)) * 6413 VF.getKnownMinValue(); 6414 } 6415 6416 Type *RetTy = I->getType(); 6417 if (canTruncateToMinimalBitwidth(I, VF)) 6418 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6419 auto *SE = PSE.getSE(); 6420 6421 auto HasSingleCopyAfterVectorization = [this](Instruction *I, 6422 ElementCount VF) -> bool { 6423 if (VF.isScalar()) 6424 return true; 6425 6426 auto Scalarized = InstsToScalarize.find(VF); 6427 assert(Scalarized != InstsToScalarize.end() && 6428 "VF not yet analyzed for scalarization profitability"); 6429 return !Scalarized->second.count(I) && 6430 llvm::all_of(I->users(), [&](User *U) { 6431 auto *UI = cast<Instruction>(U); 6432 return !Scalarized->second.count(UI); 6433 }); 6434 }; 6435 (void)HasSingleCopyAfterVectorization; 6436 6437 Type *VectorTy; 6438 if (isScalarAfterVectorization(I, VF)) { 6439 // With the exception of GEPs and PHIs, after scalarization there should 6440 // only be one copy of the instruction generated in the loop. This is 6441 // because the VF is either 1, or any instructions that need scalarizing 6442 // have already been dealt with by the time we get here. As a result, 6443 // it means we don't have to multiply the instruction cost by VF. 6444 assert(I->getOpcode() == Instruction::GetElementPtr || 6445 I->getOpcode() == Instruction::PHI || 6446 (I->getOpcode() == Instruction::BitCast && 6447 I->getType()->isPointerTy()) || 6448 HasSingleCopyAfterVectorization(I, VF)); 6449 VectorTy = RetTy; 6450 } else 6451 VectorTy = toVectorTy(RetTy, VF); 6452 6453 if (VF.isVector() && VectorTy->isVectorTy() && 6454 !TTI.getNumberOfParts(VectorTy)) 6455 return InstructionCost::getInvalid(); 6456 6457 // TODO: We need to estimate the cost of intrinsic calls. 6458 switch (I->getOpcode()) { 6459 case Instruction::GetElementPtr: 6460 // We mark this instruction as zero-cost because the cost of GEPs in 6461 // vectorized code depends on whether the corresponding memory instruction 6462 // is scalarized or not. Therefore, we handle GEPs with the memory 6463 // instruction cost. 6464 return 0; 6465 case Instruction::Br: { 6466 // In cases of scalarized and predicated instructions, there will be VF 6467 // predicated blocks in the vectorized loop. Each branch around these 6468 // blocks requires also an extract of its vector compare i1 element. 6469 // Note that the conditional branch from the loop latch will be replaced by 6470 // a single branch controlling the loop, so there is no extra overhead from 6471 // scalarization. 6472 bool ScalarPredicatedBB = false; 6473 BranchInst *BI = cast<BranchInst>(I); 6474 if (VF.isVector() && BI->isConditional() && 6475 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 6476 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) && 6477 BI->getParent() != TheLoop->getLoopLatch()) 6478 ScalarPredicatedBB = true; 6479 6480 if (ScalarPredicatedBB) { 6481 // Not possible to scalarize scalable vector with predicated instructions. 6482 if (VF.isScalable()) 6483 return InstructionCost::getInvalid(); 6484 // Return cost for branches around scalarized and predicated blocks. 6485 auto *VecI1Ty = 6486 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6487 return ( 6488 TTI.getScalarizationOverhead( 6489 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()), 6490 /*Insert*/ false, /*Extract*/ true, CostKind) + 6491 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6492 } 6493 6494 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6495 // The back-edge branch will remain, as will all scalar branches. 6496 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6497 6498 // This branch will be eliminated by if-conversion. 6499 return 0; 6500 // Note: We currently assume zero cost for an unconditional branch inside 6501 // a predicated block since it will become a fall-through, although we 6502 // may decide in the future to call TTI for all branches. 6503 } 6504 case Instruction::Switch: { 6505 if (VF.isScalar()) 6506 return TTI.getCFInstrCost(Instruction::Switch, CostKind); 6507 auto *Switch = cast<SwitchInst>(I); 6508 return Switch->getNumCases() * 6509 TTI.getCmpSelInstrCost( 6510 Instruction::ICmp, 6511 toVectorTy(Switch->getCondition()->getType(), VF), 6512 toVectorTy(Type::getInt1Ty(I->getContext()), VF), 6513 CmpInst::ICMP_EQ, CostKind); 6514 } 6515 case Instruction::PHI: { 6516 auto *Phi = cast<PHINode>(I); 6517 6518 // First-order recurrences are replaced by vector shuffles inside the loop. 6519 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 6520 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the 6521 // penultimate value of the recurrence. 6522 // TODO: Consider vscale_range info. 6523 if (VF.isScalable() && VF.getKnownMinValue() == 1) 6524 return InstructionCost::getInvalid(); 6525 SmallVector<int> Mask(VF.getKnownMinValue()); 6526 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 6527 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 6528 cast<VectorType>(VectorTy), Mask, CostKind, 6529 VF.getKnownMinValue() - 1); 6530 } 6531 6532 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6533 // converted into select instructions. We require N - 1 selects per phi 6534 // node, where N is the number of incoming values. 6535 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) { 6536 Type *ResultTy = Phi->getType(); 6537 6538 // All instructions in an Any-of reduction chain are narrowed to bool. 6539 // Check if that is the case for this phi node. 6540 auto *HeaderUser = cast_if_present<PHINode>( 6541 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * { 6542 auto *Phi = dyn_cast<PHINode>(U); 6543 if (Phi && Phi->getParent() == TheLoop->getHeader()) 6544 return Phi; 6545 return nullptr; 6546 })); 6547 if (HeaderUser) { 6548 auto &ReductionVars = Legal->getReductionVars(); 6549 auto Iter = ReductionVars.find(HeaderUser); 6550 if (Iter != ReductionVars.end() && 6551 RecurrenceDescriptor::isAnyOfRecurrenceKind( 6552 Iter->second.getRecurrenceKind())) 6553 ResultTy = Type::getInt1Ty(Phi->getContext()); 6554 } 6555 return (Phi->getNumIncomingValues() - 1) * 6556 TTI.getCmpSelInstrCost( 6557 Instruction::Select, toVectorTy(ResultTy, VF), 6558 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6559 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6560 } 6561 6562 // When tail folding with EVL, if the phi is part of an out of loop 6563 // reduction then it will be transformed into a wide vp_merge. 6564 if (VF.isVector() && foldTailWithEVL() && 6565 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) { 6566 IntrinsicCostAttributes ICA( 6567 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF), 6568 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)}); 6569 return TTI.getIntrinsicInstrCost(ICA, CostKind); 6570 } 6571 6572 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6573 } 6574 case Instruction::UDiv: 6575 case Instruction::SDiv: 6576 case Instruction::URem: 6577 case Instruction::SRem: 6578 if (VF.isVector() && isPredicatedInst(I)) { 6579 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 6580 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 6581 ScalarCost : SafeDivisorCost; 6582 } 6583 // We've proven all lanes safe to speculate, fall through. 6584 [[fallthrough]]; 6585 case Instruction::Add: 6586 case Instruction::Sub: { 6587 auto Info = Legal->getHistogramInfo(I); 6588 if (Info && VF.isVector()) { 6589 const HistogramInfo *HGram = Info.value(); 6590 // Assume that a non-constant update value (or a constant != 1) requires 6591 // a multiply, and add that into the cost. 6592 InstructionCost MulCost = TTI::TCC_Free; 6593 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1)); 6594 if (!RHS || RHS->getZExtValue() != 1) 6595 MulCost = 6596 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6597 6598 // Find the cost of the histogram operation itself. 6599 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF); 6600 Type *ScalarTy = I->getType(); 6601 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF); 6602 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, 6603 Type::getVoidTy(I->getContext()), 6604 {PtrTy, ScalarTy, MaskTy}); 6605 6606 // Add the costs together with the add/sub operation. 6607 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost + 6608 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind); 6609 } 6610 [[fallthrough]]; 6611 } 6612 case Instruction::FAdd: 6613 case Instruction::FSub: 6614 case Instruction::Mul: 6615 case Instruction::FMul: 6616 case Instruction::FDiv: 6617 case Instruction::FRem: 6618 case Instruction::Shl: 6619 case Instruction::LShr: 6620 case Instruction::AShr: 6621 case Instruction::And: 6622 case Instruction::Or: 6623 case Instruction::Xor: { 6624 // If we're speculating on the stride being 1, the multiplication may 6625 // fold away. We can generalize this for all operations using the notion 6626 // of neutral elements. (TODO) 6627 if (I->getOpcode() == Instruction::Mul && 6628 (PSE.getSCEV(I->getOperand(0))->isOne() || 6629 PSE.getSCEV(I->getOperand(1))->isOne())) 6630 return 0; 6631 6632 // Detect reduction patterns 6633 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy)) 6634 return *RedCost; 6635 6636 // Certain instructions can be cheaper to vectorize if they have a constant 6637 // second vector operand. One example of this are shifts on x86. 6638 Value *Op2 = I->getOperand(1); 6639 if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) && 6640 isa<SCEVConstant>(PSE.getSCEV(Op2))) { 6641 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue(); 6642 } 6643 auto Op2Info = TTI.getOperandInfo(Op2); 6644 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 6645 shouldConsiderInvariant(Op2)) 6646 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 6647 6648 SmallVector<const Value *, 4> Operands(I->operand_values()); 6649 return TTI.getArithmeticInstrCost( 6650 I->getOpcode(), VectorTy, CostKind, 6651 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6652 Op2Info, Operands, I, TLI); 6653 } 6654 case Instruction::FNeg: { 6655 return TTI.getArithmeticInstrCost( 6656 I->getOpcode(), VectorTy, CostKind, 6657 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6658 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6659 I->getOperand(0), I); 6660 } 6661 case Instruction::Select: { 6662 SelectInst *SI = cast<SelectInst>(I); 6663 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6664 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6665 6666 const Value *Op0, *Op1; 6667 using namespace llvm::PatternMatch; 6668 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 6669 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 6670 // select x, y, false --> x & y 6671 // select x, true, y --> x | y 6672 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 6673 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 6674 assert(Op0->getType()->getScalarSizeInBits() == 1 && 6675 Op1->getType()->getScalarSizeInBits() == 1); 6676 6677 SmallVector<const Value *, 2> Operands{Op0, Op1}; 6678 return TTI.getArithmeticInstrCost( 6679 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy, 6680 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 6681 } 6682 6683 Type *CondTy = SI->getCondition()->getType(); 6684 if (!ScalarCond) 6685 CondTy = VectorType::get(CondTy, VF); 6686 6687 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 6688 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 6689 Pred = Cmp->getPredicate(); 6690 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 6691 CostKind, {TTI::OK_AnyValue, TTI::OP_None}, 6692 {TTI::OK_AnyValue, TTI::OP_None}, I); 6693 } 6694 case Instruction::ICmp: 6695 case Instruction::FCmp: { 6696 Type *ValTy = I->getOperand(0)->getType(); 6697 6698 if (canTruncateToMinimalBitwidth(I, VF)) { 6699 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6700 (void)Op0AsInstruction; 6701 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) || 6702 MinBWs[I] == MinBWs[Op0AsInstruction]) && 6703 "if both the operand and the compare are marked for " 6704 "truncation, they must have the same bitwidth"); 6705 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]); 6706 } 6707 6708 VectorTy = toVectorTy(ValTy, VF); 6709 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 6710 cast<CmpInst>(I)->getPredicate(), CostKind, 6711 {TTI::OK_AnyValue, TTI::OP_None}, 6712 {TTI::OK_AnyValue, TTI::OP_None}, I); 6713 } 6714 case Instruction::Store: 6715 case Instruction::Load: { 6716 ElementCount Width = VF; 6717 if (Width.isVector()) { 6718 InstWidening Decision = getWideningDecision(I, Width); 6719 assert(Decision != CM_Unknown && 6720 "CM decision should be taken at this point"); 6721 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 6722 return InstructionCost::getInvalid(); 6723 if (Decision == CM_Scalarize) 6724 Width = ElementCount::getFixed(1); 6725 } 6726 VectorTy = toVectorTy(getLoadStoreType(I), Width); 6727 return getMemoryInstructionCost(I, VF); 6728 } 6729 case Instruction::BitCast: 6730 if (I->getType()->isPointerTy()) 6731 return 0; 6732 [[fallthrough]]; 6733 case Instruction::ZExt: 6734 case Instruction::SExt: 6735 case Instruction::FPToUI: 6736 case Instruction::FPToSI: 6737 case Instruction::FPExt: 6738 case Instruction::PtrToInt: 6739 case Instruction::IntToPtr: 6740 case Instruction::SIToFP: 6741 case Instruction::UIToFP: 6742 case Instruction::Trunc: 6743 case Instruction::FPTrunc: { 6744 // Computes the CastContextHint from a Load/Store instruction. 6745 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6746 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6747 "Expected a load or a store!"); 6748 6749 if (VF.isScalar() || !TheLoop->contains(I)) 6750 return TTI::CastContextHint::Normal; 6751 6752 switch (getWideningDecision(I, VF)) { 6753 case LoopVectorizationCostModel::CM_GatherScatter: 6754 return TTI::CastContextHint::GatherScatter; 6755 case LoopVectorizationCostModel::CM_Interleave: 6756 return TTI::CastContextHint::Interleave; 6757 case LoopVectorizationCostModel::CM_Scalarize: 6758 case LoopVectorizationCostModel::CM_Widen: 6759 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6760 : TTI::CastContextHint::Normal; 6761 case LoopVectorizationCostModel::CM_Widen_Reverse: 6762 return TTI::CastContextHint::Reversed; 6763 case LoopVectorizationCostModel::CM_Unknown: 6764 llvm_unreachable("Instr did not go through cost modelling?"); 6765 case LoopVectorizationCostModel::CM_VectorCall: 6766 case LoopVectorizationCostModel::CM_IntrinsicCall: 6767 llvm_unreachable_internal("Instr has invalid widening decision"); 6768 } 6769 6770 llvm_unreachable("Unhandled case!"); 6771 }; 6772 6773 unsigned Opcode = I->getOpcode(); 6774 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6775 // For Trunc, the context is the only user, which must be a StoreInst. 6776 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6777 if (I->hasOneUse()) 6778 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6779 CCH = ComputeCCH(Store); 6780 } 6781 // For Z/Sext, the context is the operand, which must be a LoadInst. 6782 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6783 Opcode == Instruction::FPExt) { 6784 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6785 CCH = ComputeCCH(Load); 6786 } 6787 6788 // We optimize the truncation of induction variables having constant 6789 // integer steps. The cost of these truncations is the same as the scalar 6790 // operation. 6791 if (isOptimizableIVTruncate(I, VF)) { 6792 auto *Trunc = cast<TruncInst>(I); 6793 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6794 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6795 } 6796 6797 // Detect reduction patterns 6798 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy)) 6799 return *RedCost; 6800 6801 Type *SrcScalarTy = I->getOperand(0)->getType(); 6802 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6803 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6804 SrcScalarTy = 6805 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]); 6806 Type *SrcVecTy = 6807 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6808 6809 if (canTruncateToMinimalBitwidth(I, VF)) { 6810 // If the result type is <= the source type, there will be no extend 6811 // after truncating the users to the minimal required bitwidth. 6812 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() && 6813 (I->getOpcode() == Instruction::ZExt || 6814 I->getOpcode() == Instruction::SExt)) 6815 return 0; 6816 } 6817 6818 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6819 } 6820 case Instruction::Call: 6821 return getVectorCallCost(cast<CallInst>(I), VF); 6822 case Instruction::ExtractValue: 6823 return TTI.getInstructionCost(I, CostKind); 6824 case Instruction::Alloca: 6825 // We cannot easily widen alloca to a scalable alloca, as 6826 // the result would need to be a vector of pointers. 6827 if (VF.isScalable()) 6828 return InstructionCost::getInvalid(); 6829 [[fallthrough]]; 6830 default: 6831 // This opcode is unknown. Assume that it is the same as 'mul'. 6832 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6833 } // end of switch. 6834 } 6835 6836 void LoopVectorizationCostModel::collectValuesToIgnore() { 6837 // Ignore ephemeral values. 6838 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6839 6840 SmallVector<Value *, 4> DeadInterleavePointerOps; 6841 SmallVector<Value *, 4> DeadOps; 6842 6843 // If a scalar epilogue is required, users outside the loop won't use 6844 // live-outs from the vector loop but from the scalar epilogue. Ignore them if 6845 // that is the case. 6846 bool RequiresScalarEpilogue = requiresScalarEpilogue(true); 6847 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) { 6848 return RequiresScalarEpilogue && 6849 !TheLoop->contains(cast<Instruction>(U)->getParent()); 6850 }; 6851 6852 LoopBlocksDFS DFS(TheLoop); 6853 DFS.perform(LI); 6854 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps; 6855 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO()))) 6856 for (Instruction &I : reverse(*BB)) { 6857 // Find all stores to invariant variables. Since they are going to sink 6858 // outside the loop we do not need calculate cost for them. 6859 StoreInst *SI; 6860 if ((SI = dyn_cast<StoreInst>(&I)) && 6861 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { 6862 ValuesToIgnore.insert(&I); 6863 DeadInvariantStoreOps[SI->getPointerOperand()].push_back( 6864 SI->getValueOperand()); 6865 } 6866 6867 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I)) 6868 continue; 6869 6870 // Add instructions that would be trivially dead and are only used by 6871 // values already ignored to DeadOps to seed worklist. 6872 if (wouldInstructionBeTriviallyDead(&I, TLI) && 6873 all_of(I.users(), [this, IsLiveOutDead](User *U) { 6874 return VecValuesToIgnore.contains(U) || 6875 ValuesToIgnore.contains(U) || IsLiveOutDead(U); 6876 })) 6877 DeadOps.push_back(&I); 6878 6879 // For interleave groups, we only create a pointer for the start of the 6880 // interleave group. Queue up addresses of group members except the insert 6881 // position for further processing. 6882 if (isAccessInterleaved(&I)) { 6883 auto *Group = getInterleavedAccessGroup(&I); 6884 if (Group->getInsertPos() == &I) 6885 continue; 6886 Value *PointerOp = getLoadStorePointerOperand(&I); 6887 DeadInterleavePointerOps.push_back(PointerOp); 6888 } 6889 6890 // Queue branches for analysis. They are dead, if their successors only 6891 // contain dead instructions. 6892 if (auto *Br = dyn_cast<BranchInst>(&I)) { 6893 if (Br->isConditional()) 6894 DeadOps.push_back(&I); 6895 } 6896 } 6897 6898 // Mark ops feeding interleave group members as free, if they are only used 6899 // by other dead computations. 6900 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { 6901 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]); 6902 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) { 6903 Instruction *UI = cast<Instruction>(U); 6904 return !VecValuesToIgnore.contains(U) && 6905 (!isAccessInterleaved(UI) || 6906 getInterleavedAccessGroup(UI)->getInsertPos() == UI); 6907 })) 6908 continue; 6909 VecValuesToIgnore.insert(Op); 6910 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); 6911 } 6912 6913 for (const auto &[_, Ops] : DeadInvariantStoreOps) { 6914 for (Value *Op : ArrayRef(Ops).drop_back()) 6915 DeadOps.push_back(Op); 6916 } 6917 // Mark ops that would be trivially dead and are only used by ignored 6918 // instructions as free. 6919 BasicBlock *Header = TheLoop->getHeader(); 6920 6921 // Returns true if the block contains only dead instructions. Such blocks will 6922 // be removed by VPlan-to-VPlan transforms and won't be considered by the 6923 // VPlan-based cost model, so skip them in the legacy cost-model as well. 6924 auto IsEmptyBlock = [this](BasicBlock *BB) { 6925 return all_of(*BB, [this](Instruction &I) { 6926 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) || 6927 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional()); 6928 }); 6929 }; 6930 for (unsigned I = 0; I != DeadOps.size(); ++I) { 6931 auto *Op = dyn_cast<Instruction>(DeadOps[I]); 6932 6933 // Check if the branch should be considered dead. 6934 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) { 6935 BasicBlock *ThenBB = Br->getSuccessor(0); 6936 BasicBlock *ElseBB = Br->getSuccessor(1); 6937 // Don't considers branches leaving the loop for simplification. 6938 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB)) 6939 continue; 6940 bool ThenEmpty = IsEmptyBlock(ThenBB); 6941 bool ElseEmpty = IsEmptyBlock(ElseBB); 6942 if ((ThenEmpty && ElseEmpty) || 6943 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB && 6944 ElseBB->phis().empty()) || 6945 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB && 6946 ThenBB->phis().empty())) { 6947 VecValuesToIgnore.insert(Br); 6948 DeadOps.push_back(Br->getCondition()); 6949 } 6950 continue; 6951 } 6952 6953 // Skip any op that shouldn't be considered dead. 6954 if (!Op || !TheLoop->contains(Op) || 6955 (isa<PHINode>(Op) && Op->getParent() == Header) || 6956 !wouldInstructionBeTriviallyDead(Op, TLI) || 6957 any_of(Op->users(), [this, IsLiveOutDead](User *U) { 6958 return !VecValuesToIgnore.contains(U) && 6959 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U); 6960 })) 6961 continue; 6962 6963 if (!TheLoop->contains(Op->getParent())) 6964 continue; 6965 6966 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore 6967 // which applies for both scalar and vector versions. Otherwise it is only 6968 // dead in vector versions, so only add it to VecValuesToIgnore. 6969 if (all_of(Op->users(), 6970 [this](User *U) { return ValuesToIgnore.contains(U); })) 6971 ValuesToIgnore.insert(Op); 6972 6973 VecValuesToIgnore.insert(Op); 6974 DeadOps.append(Op->op_begin(), Op->op_end()); 6975 } 6976 6977 // Ignore type-promoting instructions we identified during reduction 6978 // detection. 6979 for (const auto &Reduction : Legal->getReductionVars()) { 6980 const RecurrenceDescriptor &RedDes = Reduction.second; 6981 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6982 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6983 } 6984 // Ignore type-casting instructions we identified during induction 6985 // detection. 6986 for (const auto &Induction : Legal->getInductionVars()) { 6987 const InductionDescriptor &IndDes = Induction.second; 6988 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6989 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6990 } 6991 } 6992 6993 void LoopVectorizationCostModel::collectInLoopReductions() { 6994 for (const auto &Reduction : Legal->getReductionVars()) { 6995 PHINode *Phi = Reduction.first; 6996 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6997 6998 // We don't collect reductions that are type promoted (yet). 6999 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7000 continue; 7001 7002 // If the target would prefer this reduction to happen "in-loop", then we 7003 // want to record it as such. 7004 unsigned Opcode = RdxDesc.getOpcode(); 7005 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7006 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7007 TargetTransformInfo::ReductionFlags())) 7008 continue; 7009 7010 // Check that we can correctly put the reductions into the loop, by 7011 // finding the chain of operations that leads from the phi to the loop 7012 // exit value. 7013 SmallVector<Instruction *, 4> ReductionOperations = 7014 RdxDesc.getReductionOpChain(Phi, TheLoop); 7015 bool InLoop = !ReductionOperations.empty(); 7016 7017 if (InLoop) { 7018 InLoopReductions.insert(Phi); 7019 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7020 Instruction *LastChain = Phi; 7021 for (auto *I : ReductionOperations) { 7022 InLoopReductionImmediateChains[I] = LastChain; 7023 LastChain = I; 7024 } 7025 } 7026 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7027 << " reduction for phi: " << *Phi << "\n"); 7028 } 7029 } 7030 7031 // This function will select a scalable VF if the target supports scalable 7032 // vectors and a fixed one otherwise. 7033 // TODO: we could return a pair of values that specify the max VF and 7034 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7035 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7036 // doesn't have a cost model that can choose which plan to execute if 7037 // more than one is generated. 7038 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, 7039 LoopVectorizationCostModel &CM) { 7040 unsigned WidestType; 7041 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7042 7043 TargetTransformInfo::RegisterKind RegKind = 7044 TTI.enableScalableVectorization() 7045 ? TargetTransformInfo::RGK_ScalableVector 7046 : TargetTransformInfo::RGK_FixedWidthVector; 7047 7048 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); 7049 unsigned N = RegSize.getKnownMinValue() / WidestType; 7050 return ElementCount::get(N, RegSize.isScalable()); 7051 } 7052 7053 VectorizationFactor 7054 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7055 ElementCount VF = UserVF; 7056 // Outer loop handling: They may require CFG and instruction level 7057 // transformations before even evaluating whether vectorization is profitable. 7058 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7059 // the vectorization pipeline. 7060 if (!OrigLoop->isInnermost()) { 7061 // If the user doesn't provide a vectorization factor, determine a 7062 // reasonable one. 7063 if (UserVF.isZero()) { 7064 VF = determineVPlanVF(TTI, CM); 7065 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7066 7067 // Make sure we have a VF > 1 for stress testing. 7068 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7069 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7070 << "overriding computed VF.\n"); 7071 VF = ElementCount::getFixed(4); 7072 } 7073 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && 7074 !ForceTargetSupportsScalableVectors) { 7075 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " 7076 << "not supported by the target.\n"); 7077 reportVectorizationFailure( 7078 "Scalable vectorization requested but not supported by the target", 7079 "the scalable user-specified vectorization width for outer-loop " 7080 "vectorization cannot be used because the target does not support " 7081 "scalable vectors.", 7082 "ScalableVFUnfeasible", ORE, OrigLoop); 7083 return VectorizationFactor::Disabled(); 7084 } 7085 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7086 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7087 "VF needs to be a power of two"); 7088 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7089 << "VF " << VF << " to build VPlans.\n"); 7090 buildVPlans(VF, VF); 7091 7092 // For VPlan build stress testing, we bail out after VPlan construction. 7093 if (VPlanBuildStressTest) 7094 return VectorizationFactor::Disabled(); 7095 7096 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7097 } 7098 7099 LLVM_DEBUG( 7100 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7101 "VPlan-native path.\n"); 7102 return VectorizationFactor::Disabled(); 7103 } 7104 7105 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7106 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7107 CM.collectValuesToIgnore(); 7108 CM.collectElementTypesForWidening(); 7109 7110 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7111 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7112 return; 7113 7114 // Invalidate interleave groups if all blocks of loop will be predicated. 7115 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7116 !useMaskedInterleavedAccesses(TTI)) { 7117 LLVM_DEBUG( 7118 dbgs() 7119 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7120 "which requires masked-interleaved support.\n"); 7121 if (CM.InterleaveInfo.invalidateGroups()) 7122 // Invalidating interleave groups also requires invalidating all decisions 7123 // based on them, which includes widening decisions and uniform and scalar 7124 // values. 7125 CM.invalidateCostModelingDecisions(); 7126 } 7127 7128 if (CM.foldTailByMasking()) 7129 Legal->prepareToFoldTailByMasking(); 7130 7131 ElementCount MaxUserVF = 7132 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7133 if (UserVF) { 7134 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) { 7135 reportVectorizationInfo( 7136 "UserVF ignored because it may be larger than the maximal safe VF", 7137 "InvalidUserVF", ORE, OrigLoop); 7138 } else { 7139 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7140 "VF needs to be a power of two"); 7141 // Collect the instructions (and their associated costs) that will be more 7142 // profitable to scalarize. 7143 CM.collectInLoopReductions(); 7144 if (CM.selectUserVectorizationFactor(UserVF)) { 7145 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7146 buildVPlansWithVPRecipes(UserVF, UserVF); 7147 LLVM_DEBUG(printPlans(dbgs())); 7148 return; 7149 } 7150 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7151 "InvalidCost", ORE, OrigLoop); 7152 } 7153 } 7154 7155 // Collect the Vectorization Factor Candidates. 7156 SmallVector<ElementCount> VFCandidates; 7157 for (auto VF = ElementCount::getFixed(1); 7158 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7159 VFCandidates.push_back(VF); 7160 for (auto VF = ElementCount::getScalable(1); 7161 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7162 VFCandidates.push_back(VF); 7163 7164 CM.collectInLoopReductions(); 7165 for (const auto &VF : VFCandidates) { 7166 // Collect Uniform and Scalar instructions after vectorization with VF. 7167 CM.collectUniformsAndScalars(VF); 7168 7169 // Collect the instructions (and their associated costs) that will be more 7170 // profitable to scalarize. 7171 if (VF.isVector()) 7172 CM.collectInstsToScalarize(VF); 7173 } 7174 7175 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7176 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7177 7178 LLVM_DEBUG(printPlans(dbgs())); 7179 } 7180 7181 InstructionCost VPCostContext::getLegacyCost(Instruction *UI, 7182 ElementCount VF) const { 7183 if (ForceTargetInstructionCost.getNumOccurrences()) 7184 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences()); 7185 return CM.getInstructionCost(UI, VF); 7186 } 7187 7188 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { 7189 return CM.ValuesToIgnore.contains(UI) || 7190 (IsVector && CM.VecValuesToIgnore.contains(UI)) || 7191 SkipCostComputation.contains(UI); 7192 } 7193 7194 InstructionCost 7195 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, 7196 VPCostContext &CostCtx) const { 7197 InstructionCost Cost; 7198 // Cost modeling for inductions is inaccurate in the legacy cost model 7199 // compared to the recipes that are generated. To match here initially during 7200 // VPlan cost model bring up directly use the induction costs from the legacy 7201 // cost model. Note that we do this as pre-processing; the VPlan may not have 7202 // any recipes associated with the original induction increment instruction 7203 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute 7204 // the cost of induction phis and increments (both that are represented by 7205 // recipes and those that are not), to avoid distinguishing between them here, 7206 // and skip all recipes that represent induction phis and increments (the 7207 // former case) later on, if they exist, to avoid counting them twice. 7208 // Similarly we pre-compute the cost of any optimized truncates. 7209 // TODO: Switch to more accurate costing based on VPlan. 7210 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) { 7211 Instruction *IVInc = cast<Instruction>( 7212 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 7213 SmallVector<Instruction *> IVInsts = {IVInc}; 7214 for (unsigned I = 0; I != IVInsts.size(); I++) { 7215 for (Value *Op : IVInsts[I]->operands()) { 7216 auto *OpI = dyn_cast<Instruction>(Op); 7217 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse()) 7218 continue; 7219 IVInsts.push_back(OpI); 7220 } 7221 } 7222 IVInsts.push_back(IV); 7223 for (User *U : IV->users()) { 7224 auto *CI = cast<Instruction>(U); 7225 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF)) 7226 continue; 7227 IVInsts.push_back(CI); 7228 } 7229 7230 // If the vector loop gets executed exactly once with the given VF, ignore 7231 // the costs of comparison and induction instructions, as they'll get 7232 // simplified away. 7233 // TODO: Remove this code after stepping away from the legacy cost model and 7234 // adding code to simplify VPlans before calculating their costs. 7235 auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop); 7236 if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking()) 7237 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(), 7238 CostCtx.SkipCostComputation); 7239 7240 for (Instruction *IVInst : IVInsts) { 7241 if (CostCtx.skipCostComputation(IVInst, VF.isVector())) 7242 continue; 7243 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF); 7244 LLVM_DEBUG({ 7245 dbgs() << "Cost of " << InductionCost << " for VF " << VF 7246 << ": induction instruction " << *IVInst << "\n"; 7247 }); 7248 Cost += InductionCost; 7249 CostCtx.SkipCostComputation.insert(IVInst); 7250 } 7251 } 7252 7253 /// Compute the cost of all exiting conditions of the loop using the legacy 7254 /// cost model. This is to match the legacy behavior, which adds the cost of 7255 /// all exit conditions. Note that this over-estimates the cost, as there will 7256 /// be a single condition to control the vector loop. 7257 SmallVector<BasicBlock *> Exiting; 7258 CM.TheLoop->getExitingBlocks(Exiting); 7259 SetVector<Instruction *> ExitInstrs; 7260 // Collect all exit conditions. 7261 for (BasicBlock *EB : Exiting) { 7262 auto *Term = dyn_cast<BranchInst>(EB->getTerminator()); 7263 if (!Term) 7264 continue; 7265 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) { 7266 ExitInstrs.insert(CondI); 7267 } 7268 } 7269 // Compute the cost of all instructions only feeding the exit conditions. 7270 for (unsigned I = 0; I != ExitInstrs.size(); ++I) { 7271 Instruction *CondI = ExitInstrs[I]; 7272 if (!OrigLoop->contains(CondI) || 7273 !CostCtx.SkipCostComputation.insert(CondI).second) 7274 continue; 7275 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF); 7276 LLVM_DEBUG({ 7277 dbgs() << "Cost of " << CondICost << " for VF " << VF 7278 << ": exit condition instruction " << *CondI << "\n"; 7279 }); 7280 Cost += CondICost; 7281 for (Value *Op : CondI->operands()) { 7282 auto *OpI = dyn_cast<Instruction>(Op); 7283 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) { 7284 return OrigLoop->contains(cast<Instruction>(U)->getParent()) && 7285 !ExitInstrs.contains(cast<Instruction>(U)); 7286 })) 7287 continue; 7288 ExitInstrs.insert(OpI); 7289 } 7290 } 7291 7292 // The legacy cost model has special logic to compute the cost of in-loop 7293 // reductions, which may be smaller than the sum of all instructions involved 7294 // in the reduction. 7295 // TODO: Switch to costing based on VPlan once the logic has been ported. 7296 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) { 7297 if (ForceTargetInstructionCost.getNumOccurrences()) 7298 continue; 7299 7300 if (!CM.isInLoopReduction(RedPhi)) 7301 continue; 7302 7303 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop); 7304 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(), 7305 ChainOps.end()); 7306 auto IsZExtOrSExt = [](const unsigned Opcode) -> bool { 7307 return Opcode == Instruction::ZExt || Opcode == Instruction::SExt; 7308 }; 7309 // Also include the operands of instructions in the chain, as the cost-model 7310 // may mark extends as free. 7311 // 7312 // For ARM, some of the instruction can folded into the reducion 7313 // instruction. So we need to mark all folded instructions free. 7314 // For example: We can fold reduce(mul(ext(A), ext(B))) into one 7315 // instruction. 7316 for (auto *ChainOp : ChainOps) { 7317 for (Value *Op : ChainOp->operands()) { 7318 if (auto *I = dyn_cast<Instruction>(Op)) { 7319 ChainOpsAndOperands.insert(I); 7320 if (I->getOpcode() == Instruction::Mul) { 7321 auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0)); 7322 auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1)); 7323 if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 && 7324 Ext0->getOpcode() == Ext1->getOpcode()) { 7325 ChainOpsAndOperands.insert(Ext0); 7326 ChainOpsAndOperands.insert(Ext1); 7327 } 7328 } 7329 } 7330 } 7331 } 7332 7333 // Pre-compute the cost for I, if it has a reduction pattern cost. 7334 for (Instruction *I : ChainOpsAndOperands) { 7335 auto ReductionCost = 7336 CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF)); 7337 if (!ReductionCost) 7338 continue; 7339 7340 assert(!CostCtx.SkipCostComputation.contains(I) && 7341 "reduction op visited multiple times"); 7342 CostCtx.SkipCostComputation.insert(I); 7343 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF 7344 << ":\n in-loop reduction " << *I << "\n"); 7345 Cost += *ReductionCost; 7346 } 7347 } 7348 7349 // Pre-compute the costs for branches except for the backedge, as the number 7350 // of replicate regions in a VPlan may not directly match the number of 7351 // branches, which would lead to different decisions. 7352 // TODO: Compute cost of branches for each replicate region in the VPlan, 7353 // which is more accurate than the legacy cost model. 7354 for (BasicBlock *BB : OrigLoop->blocks()) { 7355 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector())) 7356 continue; 7357 CostCtx.SkipCostComputation.insert(BB->getTerminator()); 7358 if (BB == OrigLoop->getLoopLatch()) 7359 continue; 7360 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF); 7361 Cost += BranchCost; 7362 } 7363 7364 // Pre-compute costs for instructions that are forced-scalar or profitable to 7365 // scalarize. Their costs will be computed separately in the legacy cost 7366 // model. 7367 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) { 7368 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector())) 7369 continue; 7370 CostCtx.SkipCostComputation.insert(ForcedScalar); 7371 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF); 7372 LLVM_DEBUG({ 7373 dbgs() << "Cost of " << ForcedCost << " for VF " << VF 7374 << ": forced scalar " << *ForcedScalar << "\n"; 7375 }); 7376 Cost += ForcedCost; 7377 } 7378 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) { 7379 if (CostCtx.skipCostComputation(Scalarized, VF.isVector())) 7380 continue; 7381 CostCtx.SkipCostComputation.insert(Scalarized); 7382 LLVM_DEBUG({ 7383 dbgs() << "Cost of " << ScalarCost << " for VF " << VF 7384 << ": profitable to scalarize " << *Scalarized << "\n"; 7385 }); 7386 Cost += ScalarCost; 7387 } 7388 7389 return Cost; 7390 } 7391 7392 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, 7393 ElementCount VF) const { 7394 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, 7395 CM.CostKind); 7396 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); 7397 7398 // Now compute and add the VPlan-based cost. 7399 Cost += Plan.cost(VF, CostCtx); 7400 #ifndef NDEBUG 7401 unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF); 7402 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost 7403 << " (Estimated cost per lane: "); 7404 if (Cost.isValid()) { 7405 double CostPerLane = double(*Cost.getValue()) / EstimatedWidth; 7406 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane)); 7407 } else /* No point dividing an invalid cost - it will still be invalid */ 7408 LLVM_DEBUG(dbgs() << "Invalid"); 7409 LLVM_DEBUG(dbgs() << ")\n"); 7410 #endif 7411 return Cost; 7412 } 7413 7414 #ifndef NDEBUG 7415 /// Return true if the original loop \ TheLoop contains any instructions that do 7416 /// not have corresponding recipes in \p Plan and are not marked to be ignored 7417 /// in \p CostCtx. This means the VPlan contains simplification that the legacy 7418 /// cost-model did not account for. 7419 static bool planContainsAdditionalSimplifications(VPlan &Plan, 7420 VPCostContext &CostCtx, 7421 Loop *TheLoop) { 7422 // First collect all instructions for the recipes in Plan. 7423 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * { 7424 if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) 7425 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); 7426 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) 7427 return &WidenMem->getIngredient(); 7428 return nullptr; 7429 }; 7430 7431 DenseSet<Instruction *> SeenInstrs; 7432 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()); 7433 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 7434 for (VPRecipeBase &R : *VPBB) { 7435 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) { 7436 auto *IG = IR->getInterleaveGroup(); 7437 unsigned NumMembers = IG->getNumMembers(); 7438 for (unsigned I = 0; I != NumMembers; ++I) { 7439 if (Instruction *M = IG->getMember(I)) 7440 SeenInstrs.insert(M); 7441 } 7442 continue; 7443 } 7444 // The VPlan-based cost model is more accurate for partial reduction and 7445 // comparing against the legacy cost isn't desirable. 7446 if (isa<VPPartialReductionRecipe>(&R)) 7447 return true; 7448 if (Instruction *UI = GetInstructionForCost(&R)) 7449 SeenInstrs.insert(UI); 7450 } 7451 } 7452 7453 // Return true if the loop contains any instructions that are not also part of 7454 // the VPlan or are skipped for VPlan-based cost computations. This indicates 7455 // that the VPlan contains extra simplifications. 7456 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx, 7457 TheLoop](BasicBlock *BB) { 7458 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) { 7459 if (isa<PHINode>(&I) && BB == TheLoop->getHeader()) 7460 return false; 7461 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true); 7462 }); 7463 }); 7464 } 7465 #endif 7466 7467 VectorizationFactor LoopVectorizationPlanner::computeBestVF() { 7468 if (VPlans.empty()) 7469 return VectorizationFactor::Disabled(); 7470 // If there is a single VPlan with a single VF, return it directly. 7471 VPlan &FirstPlan = *VPlans[0]; 7472 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1) 7473 return {*FirstPlan.vectorFactors().begin(), 0, 0}; 7474 7475 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: " 7476 << (CM.CostKind == TTI::TCK_RecipThroughput 7477 ? "Reciprocal Throughput\n" 7478 : CM.CostKind == TTI::TCK_Latency 7479 ? "Instruction Latency\n" 7480 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n" 7481 : CM.CostKind == TTI::TCK_SizeAndLatency 7482 ? "Code Size and Latency\n" 7483 : "Unknown\n")); 7484 7485 ElementCount ScalarVF = ElementCount::getFixed(1); 7486 assert(hasPlanWithVF(ScalarVF) && 7487 "More than a single plan/VF w/o any plan having scalar VF"); 7488 7489 // TODO: Compute scalar cost using VPlan-based cost model. 7490 InstructionCost ScalarCost = CM.expectedCost(ScalarVF); 7491 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n"); 7492 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost); 7493 VectorizationFactor BestFactor = ScalarFactor; 7494 7495 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 7496 if (ForceVectorization) { 7497 // Ignore scalar width, because the user explicitly wants vectorization. 7498 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 7499 // evaluation. 7500 BestFactor.Cost = InstructionCost::getMax(); 7501 } 7502 7503 for (auto &P : VPlans) { 7504 for (ElementCount VF : P->vectorFactors()) { 7505 if (VF.isScalar()) 7506 continue; 7507 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { 7508 LLVM_DEBUG( 7509 dbgs() 7510 << "LV: Not considering vector loop of width " << VF 7511 << " because it will not generate any vector instructions.\n"); 7512 continue; 7513 } 7514 7515 InstructionCost Cost = cost(*P, VF); 7516 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); 7517 if (isMoreProfitable(CurrentFactor, BestFactor)) 7518 BestFactor = CurrentFactor; 7519 7520 // If profitable add it to ProfitableVF list. 7521 if (isMoreProfitable(CurrentFactor, ScalarFactor)) 7522 ProfitableVFs.push_back(CurrentFactor); 7523 } 7524 } 7525 7526 #ifndef NDEBUG 7527 // Select the optimal vectorization factor according to the legacy cost-model. 7528 // This is now only used to verify the decisions by the new VPlan-based 7529 // cost-model and will be retired once the VPlan-based cost-model is 7530 // stabilized. 7531 VectorizationFactor LegacyVF = selectVectorizationFactor(); 7532 VPlan &BestPlan = getPlanFor(BestFactor.Width); 7533 7534 // Pre-compute the cost and use it to check if BestPlan contains any 7535 // simplifications not accounted for in the legacy cost model. If that's the 7536 // case, don't trigger the assertion, as the extra simplifications may cause a 7537 // different VF to be picked by the VPlan-based cost model. 7538 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, 7539 CM.CostKind); 7540 precomputeCosts(BestPlan, BestFactor.Width, CostCtx); 7541 assert((BestFactor.Width == LegacyVF.Width || 7542 planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), 7543 CostCtx, OrigLoop) || 7544 planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width), 7545 CostCtx, OrigLoop)) && 7546 " VPlan cost model and legacy cost model disagreed"); 7547 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && 7548 "when vectorizing, the scalar cost must be computed."); 7549 #endif 7550 7551 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n"); 7552 return BestFactor; 7553 } 7554 7555 static void addRuntimeUnrollDisableMetaData(Loop *L) { 7556 SmallVector<Metadata *, 4> MDs; 7557 // Reserve first location for self reference to the LoopID metadata node. 7558 MDs.push_back(nullptr); 7559 bool IsUnrollMetadata = false; 7560 MDNode *LoopID = L->getLoopID(); 7561 if (LoopID) { 7562 // First find existing loop unrolling disable metadata. 7563 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { 7564 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I)); 7565 if (MD) { 7566 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7567 IsUnrollMetadata = 7568 S && S->getString().starts_with("llvm.loop.unroll.disable"); 7569 } 7570 MDs.push_back(LoopID->getOperand(I)); 7571 } 7572 } 7573 7574 if (!IsUnrollMetadata) { 7575 // Add runtime unroll disable metadata. 7576 LLVMContext &Context = L->getHeader()->getContext(); 7577 SmallVector<Metadata *, 1> DisableOperands; 7578 DisableOperands.push_back( 7579 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7580 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7581 MDs.push_back(DisableNode); 7582 MDNode *NewLoopID = MDNode::get(Context, MDs); 7583 // Set operand 0 to refer to the loop id itself. 7584 NewLoopID->replaceOperandWith(0, NewLoopID); 7585 L->setLoopID(NewLoopID); 7586 } 7587 } 7588 7589 // If \p R is a ComputeReductionResult when vectorizing the epilog loop, 7590 // fix the reduction's scalar PHI node by adding the incoming value from the 7591 // main vector loop. 7592 static void fixReductionScalarResumeWhenVectorizingEpilog( 7593 VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, 7594 BasicBlock *BypassBlock) { 7595 auto *EpiRedResult = dyn_cast<VPInstruction>(R); 7596 if (!EpiRedResult || 7597 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult) 7598 return; 7599 7600 auto *EpiRedHeaderPhi = 7601 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0)); 7602 const RecurrenceDescriptor &RdxDesc = 7603 EpiRedHeaderPhi->getRecurrenceDescriptor(); 7604 Value *MainResumeValue = 7605 EpiRedHeaderPhi->getStartValue()->getUnderlyingValue(); 7606 if (RecurrenceDescriptor::isAnyOfRecurrenceKind( 7607 RdxDesc.getRecurrenceKind())) { 7608 auto *Cmp = cast<ICmpInst>(MainResumeValue); 7609 assert(Cmp->getPredicate() == CmpInst::ICMP_NE && 7610 "AnyOf expected to start with ICMP_NE"); 7611 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() && 7612 "AnyOf expected to start by comparing main resume value to original " 7613 "start value"); 7614 MainResumeValue = Cmp->getOperand(0); 7615 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( 7616 RdxDesc.getRecurrenceKind())) { 7617 using namespace llvm::PatternMatch; 7618 Value *Cmp, *OrigResumeV; 7619 bool IsExpectedPattern = 7620 match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), 7621 m_Specific(RdxDesc.getSentinelValue()), 7622 m_Value(OrigResumeV))) && 7623 match(Cmp, 7624 m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), 7625 m_Specific(RdxDesc.getRecurrenceStartValue()))); 7626 assert(IsExpectedPattern && "Unexpected reduction resume pattern"); 7627 (void)IsExpectedPattern; 7628 MainResumeValue = OrigResumeV; 7629 } 7630 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue); 7631 7632 // When fixing reductions in the epilogue loop we should already have 7633 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry 7634 // over the incoming values correctly. 7635 using namespace VPlanPatternMatch; 7636 auto IsResumePhi = [](VPUser *U) { 7637 return match( 7638 U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue())); 7639 }; 7640 assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 && 7641 "ResumePhi must have a single user"); 7642 auto *EpiResumePhiVPI = 7643 cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi)); 7644 auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true)); 7645 EpiResumePhi->setIncomingValueForBlock( 7646 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock)); 7647 } 7648 7649 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( 7650 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7651 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue, 7652 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7653 assert(BestVPlan.hasVF(BestVF) && 7654 "Trying to execute plan with unsupported VF"); 7655 assert(BestVPlan.hasUF(BestUF) && 7656 "Trying to execute plan with unsupported UF"); 7657 assert( 7658 ((VectorizingEpilogue && ExpandedSCEVs) || 7659 (!VectorizingEpilogue && !ExpandedSCEVs)) && 7660 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7661 7662 // TODO: Move to VPlan transform stage once the transition to the VPlan-based 7663 // cost model is complete for better cost estimates. 7664 VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, 7665 OrigLoop->getHeader()->getContext()); 7666 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7667 VPlanTransforms::convertToConcreteRecipes(BestVPlan); 7668 7669 // Perform the actual loop transformation. 7670 VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV, 7671 &BestVPlan, OrigLoop->getParentLoop(), 7672 Legal->getWidestInductionType()); 7673 7674 #ifdef EXPENSIVE_CHECKS 7675 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 7676 #endif 7677 7678 // 0. Generate SCEV-dependent code in the entry, including TripCount, before 7679 // making any changes to the CFG. 7680 if (!BestVPlan.getEntry()->empty()) 7681 BestVPlan.getEntry()->execute(&State); 7682 7683 if (!ILV.getTripCount()) 7684 ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0))); 7685 else 7686 assert(VectorizingEpilogue && "should only re-use the existing trip " 7687 "count during epilogue vectorization"); 7688 7689 // 1. Set up the skeleton for vectorization, including vector pre-header and 7690 // middle block. The vector loop is created during VPlan execution. 7691 VPBasicBlock *VectorPH = 7692 cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor()); 7693 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton( 7694 ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs); 7695 if (VectorizingEpilogue) 7696 VPlanTransforms::removeDeadRecipes(BestVPlan); 7697 7698 // Only use noalias metadata when using memory checks guaranteeing no overlap 7699 // across all iterations. 7700 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7701 std::unique_ptr<LoopVersioning> LVer = nullptr; 7702 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7703 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7704 7705 // We currently don't use LoopVersioning for the actual loop cloning but we 7706 // still use it to add the noalias metadata. 7707 // TODO: Find a better way to re-use LoopVersioning functionality to add 7708 // metadata. 7709 LVer = std::make_unique<LoopVersioning>( 7710 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7711 PSE.getSE()); 7712 State.LVer = &*LVer; 7713 State.LVer->prepareNoAliasMetadata(); 7714 } 7715 7716 ILV.printDebugTracesAtStart(); 7717 7718 //===------------------------------------------------===// 7719 // 7720 // Notice: any optimization or new instruction that go 7721 // into the code below should also be implemented in 7722 // the cost-model. 7723 // 7724 //===------------------------------------------------===// 7725 7726 // 2. Copy and widen instructions from the old loop into the new loop. 7727 BestVPlan.prepareToExecute( 7728 ILV.getTripCount(), 7729 ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State); 7730 replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB); 7731 7732 BestVPlan.execute(&State); 7733 7734 auto *MiddleVPBB = BestVPlan.getMiddleBlock(); 7735 // 2.5 When vectorizing the epilogue, fix reduction and induction resume 7736 // values from the additional bypass block. 7737 if (VectorizingEpilogue) { 7738 assert(!ILV.Legal->hasUncountableEarlyExit() && 7739 "Epilogue vectorisation not yet supported with early exits"); 7740 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); 7741 for (VPRecipeBase &R : *MiddleVPBB) { 7742 fixReductionScalarResumeWhenVectorizingEpilog( 7743 &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock); 7744 } 7745 BasicBlock *PH = OrigLoop->getLoopPreheader(); 7746 for (const auto &[IVPhi, _] : Legal->getInductionVars()) { 7747 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); 7748 Value *V = ILV.getInductionAdditionalBypassValue(IVPhi); 7749 Inc->setIncomingValueForBlock(BypassBlock, V); 7750 } 7751 } 7752 7753 // 2.6. Maintain Loop Hints 7754 // Keep all loop hints from the original loop on the vector loop (we'll 7755 // replace the vectorizer-specific hints below). 7756 if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) { 7757 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7758 7759 std::optional<MDNode *> VectorizedLoopID = 7760 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7761 LLVMLoopVectorizeFollowupVectorized}); 7762 7763 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); 7764 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7765 if (VectorizedLoopID) { 7766 L->setLoopID(*VectorizedLoopID); 7767 } else { 7768 // Keep all loop hints from the original loop on the vector loop (we'll 7769 // replace the vectorizer-specific hints below). 7770 if (MDNode *LID = OrigLoop->getLoopID()) 7771 L->setLoopID(LID); 7772 7773 LoopVectorizeHints Hints(L, true, *ORE); 7774 Hints.setAlreadyVectorized(); 7775 } 7776 TargetTransformInfo::UnrollingPreferences UP; 7777 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7778 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) 7779 addRuntimeUnrollDisableMetaData(L); 7780 } 7781 7782 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7783 // predication, updating analyses. 7784 ILV.fixVectorizedLoop(State); 7785 7786 ILV.printDebugTracesAtEnd(); 7787 7788 // 4. Adjust branch weight of the branch in the middle block. 7789 if (BestVPlan.getVectorLoopRegion()) { 7790 auto *MiddleVPBB = BestVPlan.getMiddleBlock(); 7791 auto *MiddleTerm = 7792 cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); 7793 if (MiddleTerm->isConditional() && 7794 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 7795 // Assume that `Count % VectorTripCount` is equally distributed. 7796 unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); 7797 assert(TripCount > 0 && "trip count should not be zero"); 7798 const uint32_t Weights[] = {1, TripCount - 1}; 7799 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); 7800 } 7801 } 7802 7803 return State.ExpandedSCEVs; 7804 } 7805 7806 //===--------------------------------------------------------------------===// 7807 // EpilogueVectorizerMainLoop 7808 //===--------------------------------------------------------------------===// 7809 7810 /// This function is partially responsible for generating the control flow 7811 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7812 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7813 const SCEV2ValueTy &ExpandedSCEVs) { 7814 createVectorLoopSkeleton(""); 7815 7816 // Generate the code to check the minimum iteration count of the vector 7817 // epilogue (see below). 7818 EPI.EpilogueIterationCountCheck = 7819 emitIterationCountCheck(LoopScalarPreHeader, true); 7820 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7821 7822 // Generate the code to check any assumptions that we've made for SCEV 7823 // expressions. 7824 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7825 7826 // Generate the code that checks at runtime if arrays overlap. We put the 7827 // checks into a separate block to make the more common case of few elements 7828 // faster. 7829 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7830 7831 // Generate the iteration count check for the main loop, *after* the check 7832 // for the epilogue loop, so that the path-length is shorter for the case 7833 // that goes directly through the vector epilogue. The longer-path length for 7834 // the main loop is compensated for, by the gain from vectorizing the larger 7835 // trip count. Note: the branch will get updated later on when we vectorize 7836 // the epilogue. 7837 EPI.MainLoopIterationCountCheck = 7838 emitIterationCountCheck(LoopScalarPreHeader, false); 7839 7840 // Generate the induction variable. 7841 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7842 7843 return LoopVectorPreHeader; 7844 } 7845 7846 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7847 LLVM_DEBUG({ 7848 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7849 << "Main Loop VF:" << EPI.MainLoopVF 7850 << ", Main Loop UF:" << EPI.MainLoopUF 7851 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7852 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7853 }); 7854 } 7855 7856 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7857 DEBUG_WITH_TYPE(VerboseDebug, { 7858 dbgs() << "intermediate fn:\n" 7859 << *OrigLoop->getHeader()->getParent() << "\n"; 7860 }); 7861 } 7862 7863 BasicBlock * 7864 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7865 bool ForEpilogue) { 7866 assert(Bypass && "Expected valid bypass basic block."); 7867 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7868 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7869 Value *Count = getTripCount(); 7870 // Reuse existing vector loop preheader for TC checks. 7871 // Note that new preheader block is generated for vector loop. 7872 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7873 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7874 7875 // Generate code to check if the loop's trip count is less than VF * UF of the 7876 // main vector loop. 7877 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7878 : VF.isVector()) 7879 ? ICmpInst::ICMP_ULE 7880 : ICmpInst::ICMP_ULT; 7881 7882 Value *CheckMinIters = Builder.CreateICmp( 7883 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7884 "min.iters.check"); 7885 7886 if (!ForEpilogue) 7887 TCCheckBlock->setName("vector.main.loop.iter.check"); 7888 7889 // Create new preheader for vector loop. 7890 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7891 DT, LI, nullptr, "vector.ph"); 7892 7893 if (ForEpilogue) { 7894 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7895 DT->getNode(Bypass)->getIDom()) && 7896 "TC check is expected to dominate Bypass"); 7897 7898 LoopBypassBlocks.push_back(TCCheckBlock); 7899 7900 // Save the trip count so we don't have to regenerate it in the 7901 // vec.epilog.iter.check. This is safe to do because the trip count 7902 // generated here dominates the vector epilog iter check. 7903 EPI.TripCount = Count; 7904 } 7905 7906 BranchInst &BI = 7907 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7908 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 7909 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); 7910 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 7911 7912 introduceCheckBlockInVPlan(TCCheckBlock); 7913 return TCCheckBlock; 7914 } 7915 7916 //===--------------------------------------------------------------------===// 7917 // EpilogueVectorizerEpilogueLoop 7918 //===--------------------------------------------------------------------===// 7919 7920 /// This function is partially responsible for generating the control flow 7921 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7922 BasicBlock * 7923 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( 7924 const SCEV2ValueTy &ExpandedSCEVs) { 7925 createVectorLoopSkeleton("vec.epilog."); 7926 7927 // Now, compare the remaining count and if there aren't enough iterations to 7928 // execute the vectorized epilogue skip to the scalar part. 7929 LoopVectorPreHeader->setName("vec.epilog.ph"); 7930 BasicBlock *VecEpilogueIterationCountCheck = 7931 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, 7932 nullptr, "vec.epilog.iter.check", true); 7933 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7934 VecEpilogueIterationCountCheck); 7935 AdditionalBypassBlock = VecEpilogueIterationCountCheck; 7936 7937 // Adjust the control flow taking the state info from the main loop 7938 // vectorization into account. 7939 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7940 "expected this to be saved from the previous pass."); 7941 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7942 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7943 7944 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7945 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7946 7947 if (EPI.SCEVSafetyCheck) 7948 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7949 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7950 if (EPI.MemSafetyCheck) 7951 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7952 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7953 7954 DT->changeImmediateDominator(LoopScalarPreHeader, 7955 EPI.EpilogueIterationCountCheck); 7956 // Keep track of bypass blocks, as they feed start values to the induction and 7957 // reduction phis in the scalar loop preheader. 7958 if (EPI.SCEVSafetyCheck) 7959 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7960 if (EPI.MemSafetyCheck) 7961 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7962 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7963 7964 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 7965 // reductions which merge control-flow from the latch block and the middle 7966 // block. Update the incoming values here and move the Phi into the preheader. 7967 SmallVector<PHINode *, 4> PhisInBlock; 7968 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7969 PhisInBlock.push_back(&Phi); 7970 7971 for (PHINode *Phi : PhisInBlock) { 7972 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt()); 7973 Phi->replaceIncomingBlockWith( 7974 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7975 VecEpilogueIterationCountCheck); 7976 7977 // If the phi doesn't have an incoming value from the 7978 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 7979 // value and also those from other check blocks. This is needed for 7980 // reduction phis only. 7981 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { 7982 return EPI.EpilogueIterationCountCheck == IncB; 7983 })) 7984 continue; 7985 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7986 if (EPI.SCEVSafetyCheck) 7987 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7988 if (EPI.MemSafetyCheck) 7989 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7990 } 7991 7992 // Generate bypass values from the additional bypass block. Note that when the 7993 // vectorized epilogue is skipped due to iteration count check, then the 7994 // resume value for the induction variable comes from the trip count of the 7995 // main vector loop, passed as the second argument. 7996 createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount); 7997 return LoopVectorPreHeader; 7998 } 7999 8000 BasicBlock * 8001 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8002 BasicBlock *Bypass, BasicBlock *Insert) { 8003 8004 assert(EPI.TripCount && 8005 "Expected trip count to have been saved in the first pass."); 8006 assert( 8007 (!isa<Instruction>(EPI.TripCount) || 8008 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8009 "saved trip count does not dominate insertion point."); 8010 Value *TC = EPI.TripCount; 8011 IRBuilder<> Builder(Insert->getTerminator()); 8012 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8013 8014 // Generate code to check if the loop's trip count is less than VF * UF of the 8015 // vector epilogue loop. 8016 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) 8017 ? ICmpInst::ICMP_ULE 8018 : ICmpInst::ICMP_ULT; 8019 8020 Value *CheckMinIters = 8021 Builder.CreateICmp(P, Count, 8022 createStepForVF(Builder, Count->getType(), 8023 EPI.EpilogueVF, EPI.EpilogueUF), 8024 "min.epilog.iters.check"); 8025 8026 BranchInst &BI = 8027 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 8028 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 8029 unsigned MainLoopStep = UF * VF.getKnownMinValue(); 8030 unsigned EpilogueLoopStep = 8031 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); 8032 // We assume the remaining `Count` is equally distributed in 8033 // [0, MainLoopStep) 8034 // So the probability for `Count < EpilogueLoopStep` should be 8035 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep 8036 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); 8037 const uint32_t Weights[] = {EstimatedSkipCount, 8038 MainLoopStep - EstimatedSkipCount}; 8039 setBranchWeights(BI, Weights, /*IsExpected=*/false); 8040 } 8041 ReplaceInstWithInst(Insert->getTerminator(), &BI); 8042 LoopBypassBlocks.push_back(Insert); 8043 8044 // A new entry block has been created for the epilogue VPlan. Hook it in, as 8045 // otherwise we would try to modify the entry to the main vector loop. 8046 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert); 8047 VPBasicBlock *OldEntry = Plan.getEntry(); 8048 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry); 8049 Plan.setEntry(NewEntry); 8050 // OldEntry is now dead and will be cleaned up when the plan gets destroyed. 8051 8052 introduceCheckBlockInVPlan(Insert); 8053 return Insert; 8054 } 8055 8056 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8057 LLVM_DEBUG({ 8058 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8059 << "Epilogue Loop VF:" << EPI.EpilogueVF 8060 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8061 }); 8062 } 8063 8064 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8065 DEBUG_WITH_TYPE(VerboseDebug, { 8066 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8067 }); 8068 } 8069 8070 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>> 8071 VPRecipeBuilder::mapToVPValues(User::op_range Operands) { 8072 std::function<VPValue *(Value *)> Fn = [this](Value *Op) { 8073 return getVPValueOrAddLiveIn(Op); 8074 }; 8075 return map_range(Operands, Fn); 8076 } 8077 8078 void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) { 8079 BasicBlock *Src = SI->getParent(); 8080 assert(!OrigLoop->isLoopExiting(Src) && 8081 all_of(successors(Src), 8082 [this](BasicBlock *Succ) { 8083 return OrigLoop->getHeader() != Succ; 8084 }) && 8085 "unsupported switch either exiting loop or continuing to header"); 8086 // Create masks where the terminator in Src is a switch. We create mask for 8087 // all edges at the same time. This is more efficient, as we can create and 8088 // collect compares for all cases once. 8089 VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition()); 8090 BasicBlock *DefaultDst = SI->getDefaultDest(); 8091 MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares; 8092 for (auto &C : SI->cases()) { 8093 BasicBlock *Dst = C.getCaseSuccessor(); 8094 assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created"); 8095 // Cases whose destination is the same as default are redundant and can be 8096 // ignored - they will get there anyhow. 8097 if (Dst == DefaultDst) 8098 continue; 8099 auto &Compares = Dst2Compares[Dst]; 8100 VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue()); 8101 Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V)); 8102 } 8103 8104 // We need to handle 2 separate cases below for all entries in Dst2Compares, 8105 // which excludes destinations matching the default destination. 8106 VPValue *SrcMask = getBlockInMask(Src); 8107 VPValue *DefaultMask = nullptr; 8108 for (const auto &[Dst, Conds] : Dst2Compares) { 8109 // 1. Dst is not the default destination. Dst is reached if any of the cases 8110 // with destination == Dst are taken. Join the conditions for each case 8111 // whose destination == Dst using an OR. 8112 VPValue *Mask = Conds[0]; 8113 for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front()) 8114 Mask = Builder.createOr(Mask, V); 8115 if (SrcMask) 8116 Mask = Builder.createLogicalAnd(SrcMask, Mask); 8117 EdgeMaskCache[{Src, Dst}] = Mask; 8118 8119 // 2. Create the mask for the default destination, which is reached if none 8120 // of the cases with destination != default destination are taken. Join the 8121 // conditions for each case where the destination is != Dst using an OR and 8122 // negate it. 8123 DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask; 8124 } 8125 8126 if (DefaultMask) { 8127 DefaultMask = Builder.createNot(DefaultMask); 8128 if (SrcMask) 8129 DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask); 8130 } 8131 EdgeMaskCache[{Src, DefaultDst}] = DefaultMask; 8132 } 8133 8134 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 8135 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8136 8137 // Look for cached value. 8138 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8139 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8140 if (ECEntryIt != EdgeMaskCache.end()) 8141 return ECEntryIt->second; 8142 8143 if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) { 8144 createSwitchEdgeMasks(SI); 8145 assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?"); 8146 return EdgeMaskCache[Edge]; 8147 } 8148 8149 VPValue *SrcMask = getBlockInMask(Src); 8150 8151 // The terminator has to be a branch inst! 8152 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8153 assert(BI && "Unexpected terminator found"); 8154 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8155 return EdgeMaskCache[Edge] = SrcMask; 8156 8157 // If source is an exiting block, we know the exit edge is dynamically dead 8158 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8159 // adding uses of an otherwise potentially dead instruction unless we are 8160 // vectorizing a loop with uncountable exits. In that case, we always 8161 // materialize the mask. 8162 if (OrigLoop->isLoopExiting(Src) && 8163 Src != Legal->getUncountableEarlyExitingBlock()) 8164 return EdgeMaskCache[Edge] = SrcMask; 8165 8166 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); 8167 assert(EdgeMask && "No Edge Mask found for condition"); 8168 8169 if (BI->getSuccessor(0) != Dst) 8170 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8171 8172 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8173 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask 8174 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' 8175 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8176 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc()); 8177 } 8178 8179 return EdgeMaskCache[Edge] = EdgeMask; 8180 } 8181 8182 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { 8183 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8184 8185 // Look for cached value. 8186 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8187 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge); 8188 assert(ECEntryIt != EdgeMaskCache.end() && 8189 "looking up mask for edge which has not been created"); 8190 return ECEntryIt->second; 8191 } 8192 8193 void VPRecipeBuilder::createHeaderMask() { 8194 BasicBlock *Header = OrigLoop->getHeader(); 8195 8196 // When not folding the tail, use nullptr to model all-true mask. 8197 if (!CM.foldTailByMasking()) { 8198 BlockMaskCache[Header] = nullptr; 8199 return; 8200 } 8201 8202 // Introduce the early-exit compare IV <= BTC to form header block mask. 8203 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8204 // constructing the desired canonical IV in the header block as its first 8205 // non-phi instructions. 8206 8207 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); 8208 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8209 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); 8210 HeaderVPBB->insert(IV, NewInsertionPoint); 8211 8212 VPBuilder::InsertPointGuard Guard(Builder); 8213 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8214 VPValue *BlockMask = nullptr; 8215 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 8216 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); 8217 BlockMaskCache[Header] = BlockMask; 8218 } 8219 8220 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { 8221 // Return the cached value. 8222 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); 8223 assert(BCEntryIt != BlockMaskCache.end() && 8224 "Trying to access mask for block without one."); 8225 return BCEntryIt->second; 8226 } 8227 8228 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { 8229 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8230 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); 8231 assert(OrigLoop->getHeader() != BB && 8232 "Loop header must have cached block mask"); 8233 8234 // All-one mask is modelled as no-mask following the convention for masked 8235 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8236 VPValue *BlockMask = nullptr; 8237 // This is the block mask. We OR all unique incoming edges. 8238 for (auto *Predecessor : 8239 SetVector<BasicBlock *>(pred_begin(BB), pred_end(BB))) { 8240 VPValue *EdgeMask = createEdgeMask(Predecessor, BB); 8241 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. 8242 BlockMaskCache[BB] = EdgeMask; 8243 return; 8244 } 8245 8246 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8247 BlockMask = EdgeMask; 8248 continue; 8249 } 8250 8251 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8252 } 8253 8254 BlockMaskCache[BB] = BlockMask; 8255 } 8256 8257 VPWidenMemoryRecipe * 8258 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, 8259 VFRange &Range) { 8260 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8261 "Must be called with either a load or store"); 8262 8263 auto WillWiden = [&](ElementCount VF) -> bool { 8264 LoopVectorizationCostModel::InstWidening Decision = 8265 CM.getWideningDecision(I, VF); 8266 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8267 "CM decision should be taken at this point."); 8268 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8269 return true; 8270 if (CM.isScalarAfterVectorization(I, VF) || 8271 CM.isProfitableToScalarize(I, VF)) 8272 return false; 8273 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8274 }; 8275 8276 if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range)) 8277 return nullptr; 8278 8279 VPValue *Mask = nullptr; 8280 if (Legal->isMaskRequired(I)) 8281 Mask = getBlockInMask(I->getParent()); 8282 8283 // Determine if the pointer operand of the access is either consecutive or 8284 // reverse consecutive. 8285 LoopVectorizationCostModel::InstWidening Decision = 8286 CM.getWideningDecision(I, Range.Start); 8287 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8288 bool Consecutive = 8289 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8290 8291 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; 8292 if (Consecutive) { 8293 auto *GEP = dyn_cast<GetElementPtrInst>( 8294 Ptr->getUnderlyingValue()->stripPointerCasts()); 8295 VPSingleDefRecipe *VectorPtr; 8296 if (Reverse) { 8297 // When folding the tail, we may compute an address that we don't in the 8298 // original scalar loop and it may not be inbounds. Drop Inbounds in that 8299 // case. 8300 GEPNoWrapFlags Flags = 8301 (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) 8302 ? GEPNoWrapFlags::none() 8303 : GEPNoWrapFlags::inBounds(); 8304 VectorPtr = new VPReverseVectorPointerRecipe( 8305 Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc()); 8306 } else { 8307 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), 8308 GEP ? GEP->getNoWrapFlags() 8309 : GEPNoWrapFlags::none(), 8310 I->getDebugLoc()); 8311 } 8312 Builder.getInsertBlock()->appendRecipe(VectorPtr); 8313 Ptr = VectorPtr; 8314 } 8315 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8316 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, 8317 I->getDebugLoc()); 8318 8319 StoreInst *Store = cast<StoreInst>(I); 8320 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, 8321 Reverse, I->getDebugLoc()); 8322 } 8323 8324 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also 8325 /// insert a recipe to expand the step for the induction recipe. 8326 static VPWidenIntOrFpInductionRecipe * 8327 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, 8328 VPValue *Start, const InductionDescriptor &IndDesc, 8329 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) { 8330 assert(IndDesc.getStartValue() == 8331 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8332 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8333 "step must be loop invariant"); 8334 8335 VPValue *Step = 8336 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8337 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8338 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), 8339 IndDesc, TruncI, 8340 TruncI->getDebugLoc()); 8341 } 8342 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8343 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), 8344 IndDesc, Phi->getDebugLoc()); 8345 } 8346 8347 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( 8348 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) { 8349 8350 // Check if this is an integer or fp induction. If so, build the recipe that 8351 // produces its scalar and vector values. 8352 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8353 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, 8354 *PSE.getSE(), *OrigLoop); 8355 8356 // Check if this is pointer induction. If so, build the recipe for it. 8357 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8358 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8359 *PSE.getSE()); 8360 return new VPWidenPointerInductionRecipe( 8361 Phi, Operands[0], Step, *II, 8362 LoopVectorizationPlanner::getDecisionAndClampRange( 8363 [&](ElementCount VF) { 8364 return CM.isScalarAfterVectorization(Phi, VF); 8365 }, 8366 Range), 8367 Phi->getDebugLoc()); 8368 } 8369 return nullptr; 8370 } 8371 8372 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8373 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { 8374 // Optimize the special case where the source is a constant integer 8375 // induction variable. Notice that we can only optimize the 'trunc' case 8376 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8377 // (c) other casts depend on pointer size. 8378 8379 // Determine whether \p K is a truncation based on an induction variable that 8380 // can be optimized. 8381 auto IsOptimizableIVTruncate = 8382 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8383 return [=](ElementCount VF) -> bool { 8384 return CM.isOptimizableIVTruncate(K, VF); 8385 }; 8386 }; 8387 8388 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8389 IsOptimizableIVTruncate(I), Range)) { 8390 8391 auto *Phi = cast<PHINode>(I->getOperand(0)); 8392 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8393 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue()); 8394 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), 8395 *OrigLoop); 8396 } 8397 return nullptr; 8398 } 8399 8400 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, 8401 ArrayRef<VPValue *> Operands) { 8402 unsigned NumIncoming = Phi->getNumIncomingValues(); 8403 8404 // We know that all PHIs in non-header blocks are converted into selects, so 8405 // we don't have to worry about the insertion order and we can just use the 8406 // builder. At this point we generate the predication tree. There may be 8407 // duplications since this is a simple recursive scan, but future 8408 // optimizations will clean it up. 8409 SmallVector<VPValue *, 2> OperandsWithMask; 8410 8411 for (unsigned In = 0; In < NumIncoming; In++) { 8412 OperandsWithMask.push_back(Operands[In]); 8413 VPValue *EdgeMask = 8414 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent()); 8415 if (!EdgeMask) { 8416 assert(In == 0 && "Both null and non-null edge masks found"); 8417 assert(all_equal(Operands) && 8418 "Distinct incoming values with one having a full mask"); 8419 break; 8420 } 8421 OperandsWithMask.push_back(EdgeMask); 8422 } 8423 return new VPBlendRecipe(Phi, OperandsWithMask); 8424 } 8425 8426 VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8427 ArrayRef<VPValue *> Operands, 8428 VFRange &Range) { 8429 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8430 [this, CI](ElementCount VF) { 8431 return CM.isScalarWithPredication(CI, VF); 8432 }, 8433 Range); 8434 8435 if (IsPredicated) 8436 return nullptr; 8437 8438 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8439 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8440 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8441 ID == Intrinsic::pseudoprobe || 8442 ID == Intrinsic::experimental_noalias_scope_decl)) 8443 return nullptr; 8444 8445 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); 8446 8447 // Is it beneficial to perform intrinsic call compared to lib call? 8448 bool ShouldUseVectorIntrinsic = 8449 ID && LoopVectorizationPlanner::getDecisionAndClampRange( 8450 [&](ElementCount VF) -> bool { 8451 return CM.getCallWideningDecision(CI, VF).Kind == 8452 LoopVectorizationCostModel::CM_IntrinsicCall; 8453 }, 8454 Range); 8455 if (ShouldUseVectorIntrinsic) 8456 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), 8457 CI->getDebugLoc()); 8458 8459 Function *Variant = nullptr; 8460 std::optional<unsigned> MaskPos; 8461 // Is better to call a vectorized version of the function than to to scalarize 8462 // the call? 8463 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( 8464 [&](ElementCount VF) -> bool { 8465 // The following case may be scalarized depending on the VF. 8466 // The flag shows whether we can use a usual Call for vectorized 8467 // version of the instruction. 8468 8469 // If we've found a variant at a previous VF, then stop looking. A 8470 // vectorized variant of a function expects input in a certain shape 8471 // -- basically the number of input registers, the number of lanes 8472 // per register, and whether there's a mask required. 8473 // We store a pointer to the variant in the VPWidenCallRecipe, so 8474 // once we have an appropriate variant it's only valid for that VF. 8475 // This will force a different vplan to be generated for each VF that 8476 // finds a valid variant. 8477 if (Variant) 8478 return false; 8479 LoopVectorizationCostModel::CallWideningDecision Decision = 8480 CM.getCallWideningDecision(CI, VF); 8481 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) { 8482 Variant = Decision.Variant; 8483 MaskPos = Decision.MaskPos; 8484 return true; 8485 } 8486 8487 return false; 8488 }, 8489 Range); 8490 if (ShouldUseVectorCall) { 8491 if (MaskPos.has_value()) { 8492 // We have 2 cases that would require a mask: 8493 // 1) The block needs to be predicated, either due to a conditional 8494 // in the scalar loop or use of an active lane mask with 8495 // tail-folding, and we use the appropriate mask for the block. 8496 // 2) No mask is required for the block, but the only available 8497 // vector variant at this VF requires a mask, so we synthesize an 8498 // all-true mask. 8499 VPValue *Mask = nullptr; 8500 if (Legal->isMaskRequired(CI)) 8501 Mask = getBlockInMask(CI->getParent()); 8502 else 8503 Mask = Plan.getOrAddLiveIn( 8504 ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext()))); 8505 8506 Ops.insert(Ops.begin() + *MaskPos, Mask); 8507 } 8508 8509 Ops.push_back(Operands.back()); 8510 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc()); 8511 } 8512 8513 return nullptr; 8514 } 8515 8516 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8517 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8518 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8519 // Instruction should be widened, unless it is scalar after vectorization, 8520 // scalarization is profitable or it is predicated. 8521 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8522 return CM.isScalarAfterVectorization(I, VF) || 8523 CM.isProfitableToScalarize(I, VF) || 8524 CM.isScalarWithPredication(I, VF); 8525 }; 8526 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8527 Range); 8528 } 8529 8530 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8531 ArrayRef<VPValue *> Operands, 8532 VPBasicBlock *VPBB) { 8533 switch (I->getOpcode()) { 8534 default: 8535 return nullptr; 8536 case Instruction::SDiv: 8537 case Instruction::UDiv: 8538 case Instruction::SRem: 8539 case Instruction::URem: { 8540 // If not provably safe, use a select to form a safe divisor before widening the 8541 // div/rem operation itself. Otherwise fall through to general handling below. 8542 if (CM.isPredicatedInst(I)) { 8543 SmallVector<VPValue *> Ops(Operands); 8544 VPValue *Mask = getBlockInMask(I->getParent()); 8545 VPValue *One = 8546 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); 8547 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); 8548 Ops[1] = SafeRHS; 8549 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); 8550 } 8551 [[fallthrough]]; 8552 } 8553 case Instruction::Add: 8554 case Instruction::And: 8555 case Instruction::AShr: 8556 case Instruction::FAdd: 8557 case Instruction::FCmp: 8558 case Instruction::FDiv: 8559 case Instruction::FMul: 8560 case Instruction::FNeg: 8561 case Instruction::FRem: 8562 case Instruction::FSub: 8563 case Instruction::ICmp: 8564 case Instruction::LShr: 8565 case Instruction::Mul: 8566 case Instruction::Or: 8567 case Instruction::Select: 8568 case Instruction::Shl: 8569 case Instruction::Sub: 8570 case Instruction::Xor: 8571 case Instruction::Freeze: 8572 SmallVector<VPValue *> NewOps(Operands); 8573 if (Instruction::isBinaryOp(I->getOpcode())) { 8574 // The legacy cost model uses SCEV to check if some of the operands are 8575 // constants. To match the legacy cost model's behavior, use SCEV to try 8576 // to replace operands with constants. 8577 ScalarEvolution &SE = *PSE.getSE(); 8578 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) { 8579 Value *V = Op->getUnderlyingValue(); 8580 if (isa<Constant>(V) || !SE.isSCEVable(V->getType())) 8581 return Op; 8582 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V)); 8583 if (!C) 8584 return Op; 8585 return Plan.getOrAddLiveIn(C->getValue()); 8586 }; 8587 // For Mul, the legacy cost model checks both operands. 8588 if (I->getOpcode() == Instruction::Mul) 8589 NewOps[0] = GetConstantViaSCEV(NewOps[0]); 8590 // For other binops, the legacy cost model only checks the second operand. 8591 NewOps[1] = GetConstantViaSCEV(NewOps[1]); 8592 } 8593 return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end())); 8594 }; 8595 } 8596 8597 VPHistogramRecipe * 8598 VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, 8599 ArrayRef<VPValue *> Operands) { 8600 // FIXME: Support other operations. 8601 unsigned Opcode = HI->Update->getOpcode(); 8602 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) && 8603 "Histogram update operation must be an Add or Sub"); 8604 8605 SmallVector<VPValue *, 3> HGramOps; 8606 // Bucket address. 8607 HGramOps.push_back(Operands[1]); 8608 // Increment value. 8609 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1))); 8610 8611 // In case of predicated execution (due to tail-folding, or conditional 8612 // execution, or both), pass the relevant mask. 8613 if (Legal->isMaskRequired(HI->Store)) 8614 HGramOps.push_back(getBlockInMask(HI->Store->getParent())); 8615 8616 return new VPHistogramRecipe(Opcode, 8617 make_range(HGramOps.begin(), HGramOps.end()), 8618 HI->Store->getDebugLoc()); 8619 } 8620 8621 void VPRecipeBuilder::fixHeaderPhis() { 8622 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8623 for (VPHeaderPHIRecipe *R : PhisToFix) { 8624 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8625 VPRecipeBase *IncR = 8626 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8627 R->addOperand(IncR->getVPSingleValue()); 8628 } 8629 } 8630 8631 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, 8632 VFRange &Range) { 8633 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8634 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8635 Range); 8636 8637 bool IsPredicated = CM.isPredicatedInst(I); 8638 8639 // Even if the instruction is not marked as uniform, there are certain 8640 // intrinsic calls that can be effectively treated as such, so we check for 8641 // them here. Conservatively, we only do this for scalable vectors, since 8642 // for fixed-width VFs we can always fall back on full scalarization. 8643 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8644 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8645 case Intrinsic::assume: 8646 case Intrinsic::lifetime_start: 8647 case Intrinsic::lifetime_end: 8648 // For scalable vectors if one of the operands is variant then we still 8649 // want to mark as uniform, which will generate one instruction for just 8650 // the first lane of the vector. We can't scalarize the call in the same 8651 // way as for fixed-width vectors because we don't know how many lanes 8652 // there are. 8653 // 8654 // The reasons for doing it this way for scalable vectors are: 8655 // 1. For the assume intrinsic generating the instruction for the first 8656 // lane is still be better than not generating any at all. For 8657 // example, the input may be a splat across all lanes. 8658 // 2. For the lifetime start/end intrinsics the pointer operand only 8659 // does anything useful when the input comes from a stack object, 8660 // which suggests it should always be uniform. For non-stack objects 8661 // the effect is to poison the object, which still allows us to 8662 // remove the call. 8663 IsUniform = true; 8664 break; 8665 default: 8666 break; 8667 } 8668 } 8669 VPValue *BlockInMask = nullptr; 8670 if (!IsPredicated) { 8671 // Finalize the recipe for Instr, first if it is not predicated. 8672 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8673 } else { 8674 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8675 // Instructions marked for predication are replicated and a mask operand is 8676 // added initially. Masked replicate recipes will later be placed under an 8677 // if-then construct to prevent side-effects. Generate recipes to compute 8678 // the block mask for this region. 8679 BlockInMask = getBlockInMask(I->getParent()); 8680 } 8681 8682 // Note that there is some custom logic to mark some intrinsics as uniform 8683 // manually above for scalable vectors, which this assert needs to account for 8684 // as well. 8685 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || 8686 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && 8687 "Should not predicate a uniform recipe"); 8688 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()), 8689 IsUniform, BlockInMask); 8690 return Recipe; 8691 } 8692 8693 /// Find all possible partial reductions in the loop and track all of those that 8694 /// are valid so recipes can be formed later. 8695 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { 8696 // Find all possible partial reductions. 8697 SmallVector<std::pair<PartialReductionChain, unsigned>> 8698 PartialReductionChains; 8699 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { 8700 getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range, 8701 PartialReductionChains); 8702 } 8703 8704 // A partial reduction is invalid if any of its extends are used by 8705 // something that isn't another partial reduction. This is because the 8706 // extends are intended to be lowered along with the reduction itself. 8707 8708 // Build up a set of partial reduction bin ops for efficient use checking. 8709 SmallSet<User *, 4> PartialReductionBinOps; 8710 for (const auto &[PartialRdx, _] : PartialReductionChains) 8711 PartialReductionBinOps.insert(PartialRdx.BinOp); 8712 8713 auto ExtendIsOnlyUsedByPartialReductions = 8714 [&PartialReductionBinOps](Instruction *Extend) { 8715 return all_of(Extend->users(), [&](const User *U) { 8716 return PartialReductionBinOps.contains(U); 8717 }); 8718 }; 8719 8720 // Check if each use of a chain's two extends is a partial reduction 8721 // and only add those that don't have non-partial reduction users. 8722 for (auto Pair : PartialReductionChains) { 8723 PartialReductionChain Chain = Pair.first; 8724 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && 8725 ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)) 8726 ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second)); 8727 } 8728 } 8729 8730 bool VPRecipeBuilder::getScaledReductions( 8731 Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range, 8732 SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) { 8733 8734 if (!CM.TheLoop->contains(RdxExitInstr)) 8735 return false; 8736 8737 // TODO: Allow scaling reductions when predicating. The select at 8738 // the end of the loop chooses between the phi value and most recent 8739 // reduction result, both of which have different VFs to the active lane 8740 // mask when scaling. 8741 if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent())) 8742 return false; 8743 8744 auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr); 8745 if (!Update) 8746 return false; 8747 8748 Value *Op = Update->getOperand(0); 8749 Value *PhiOp = Update->getOperand(1); 8750 if (Op == PHI) 8751 std::swap(Op, PhiOp); 8752 8753 // Try and get a scaled reduction from the first non-phi operand. 8754 // If one is found, we use the discovered reduction instruction in 8755 // place of the accumulator for costing. 8756 if (auto *OpInst = dyn_cast<Instruction>(Op)) { 8757 if (getScaledReductions(PHI, OpInst, Range, Chains)) { 8758 PHI = Chains.rbegin()->first.Reduction; 8759 8760 Op = Update->getOperand(0); 8761 PhiOp = Update->getOperand(1); 8762 if (Op == PHI) 8763 std::swap(Op, PhiOp); 8764 } 8765 } 8766 if (PhiOp != PHI) 8767 return false; 8768 8769 auto *BinOp = dyn_cast<BinaryOperator>(Op); 8770 if (!BinOp || !BinOp->hasOneUse()) 8771 return false; 8772 8773 using namespace llvm::PatternMatch; 8774 Value *A, *B; 8775 if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || 8776 !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) 8777 return false; 8778 8779 Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0)); 8780 Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1)); 8781 8782 TTI::PartialReductionExtendKind OpAExtend = 8783 TargetTransformInfo::getPartialReductionExtendKind(ExtA); 8784 TTI::PartialReductionExtendKind OpBExtend = 8785 TargetTransformInfo::getPartialReductionExtendKind(ExtB); 8786 8787 PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp); 8788 8789 unsigned TargetScaleFactor = 8790 PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( 8791 A->getType()->getPrimitiveSizeInBits()); 8792 8793 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8794 [&](ElementCount VF) { 8795 InstructionCost Cost = TTI->getPartialReductionCost( 8796 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(), 8797 VF, OpAExtend, OpBExtend, 8798 std::make_optional(BinOp->getOpcode())); 8799 return Cost.isValid(); 8800 }, 8801 Range)) { 8802 Chains.push_back(std::make_pair(Chain, TargetScaleFactor)); 8803 return true; 8804 } 8805 8806 return false; 8807 } 8808 8809 VPRecipeBase * 8810 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8811 ArrayRef<VPValue *> Operands, 8812 VFRange &Range, VPBasicBlock *VPBB) { 8813 // First, check for specific widening recipes that deal with inductions, Phi 8814 // nodes, calls and memory operations. 8815 VPRecipeBase *Recipe; 8816 if (auto *Phi = dyn_cast<PHINode>(Instr)) { 8817 if (Phi->getParent() != OrigLoop->getHeader()) 8818 return tryToBlend(Phi, Operands); 8819 8820 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8821 return Recipe; 8822 8823 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8824 assert((Legal->isReductionVariable(Phi) || 8825 Legal->isFixedOrderRecurrence(Phi)) && 8826 "can only widen reductions and fixed-order recurrences here"); 8827 VPValue *StartV = Operands[0]; 8828 if (Legal->isReductionVariable(Phi)) { 8829 const RecurrenceDescriptor &RdxDesc = 8830 Legal->getReductionVars().find(Phi)->second; 8831 assert(RdxDesc.getRecurrenceStartValue() == 8832 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8833 8834 // If the PHI is used by a partial reduction, set the scale factor. 8835 unsigned ScaleFactor = 8836 getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1); 8837 PhiRecipe = new VPReductionPHIRecipe( 8838 Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), 8839 CM.useOrderedReductions(RdxDesc), ScaleFactor); 8840 } else { 8841 // TODO: Currently fixed-order recurrences are modeled as chains of 8842 // first-order recurrences. If there are no users of the intermediate 8843 // recurrences in the chain, the fixed order recurrence should be modeled 8844 // directly, enabling more efficient codegen. 8845 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8846 } 8847 8848 PhisToFix.push_back(PhiRecipe); 8849 return PhiRecipe; 8850 } 8851 8852 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( 8853 cast<TruncInst>(Instr), Operands, Range))) 8854 return Recipe; 8855 8856 // All widen recipes below deal only with VF > 1. 8857 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8858 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8859 return nullptr; 8860 8861 if (auto *CI = dyn_cast<CallInst>(Instr)) 8862 return tryToWidenCall(CI, Operands, Range); 8863 8864 if (StoreInst *SI = dyn_cast<StoreInst>(Instr)) 8865 if (auto HistInfo = Legal->getHistogramInfo(SI)) 8866 return tryToWidenHistogram(*HistInfo, Operands); 8867 8868 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8869 return tryToWidenMemory(Instr, Operands, Range); 8870 8871 if (getScalingForReduction(Instr)) 8872 return tryToCreatePartialReduction(Instr, Operands); 8873 8874 if (!shouldWiden(Instr, Range)) 8875 return nullptr; 8876 8877 if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr)) 8878 return new VPWidenGEPRecipe(GEP, 8879 make_range(Operands.begin(), Operands.end())); 8880 8881 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8882 return new VPWidenSelectRecipe( 8883 *SI, make_range(Operands.begin(), Operands.end())); 8884 } 8885 8886 if (auto *CI = dyn_cast<CastInst>(Instr)) { 8887 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), 8888 *CI); 8889 } 8890 8891 return tryToWiden(Instr, Operands, VPBB); 8892 } 8893 8894 VPRecipeBase * 8895 VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, 8896 ArrayRef<VPValue *> Operands) { 8897 assert(Operands.size() == 2 && 8898 "Unexpected number of operands for partial reduction"); 8899 8900 VPValue *BinOp = Operands[0]; 8901 VPValue *Accumulator = Operands[1]; 8902 VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); 8903 if (isa<VPReductionPHIRecipe>(BinOpRecipe) || 8904 isa<VPPartialReductionRecipe>(BinOpRecipe)) 8905 std::swap(BinOp, Accumulator); 8906 8907 return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, 8908 Accumulator, Reduction); 8909 } 8910 8911 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8912 ElementCount MaxVF) { 8913 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8914 8915 auto MaxVFTimes2 = MaxVF * 2; 8916 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8917 VFRange SubRange = {VF, MaxVFTimes2}; 8918 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { 8919 // Now optimize the initial VPlan. 8920 if (!Plan->hasVF(ElementCount::getFixed(1))) 8921 VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths, 8922 *Plan, CM.getMinimalBitwidths()); 8923 VPlanTransforms::optimize(*Plan); 8924 // TODO: try to put it close to addActiveLaneMask(). 8925 // Discard the plan if it is not EVL-compatible 8926 if (CM.foldTailWithEVL() && 8927 !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength, 8928 *Plan, CM.getMaxSafeElements())) 8929 break; 8930 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 8931 VPlans.push_back(std::move(Plan)); 8932 } 8933 VF = SubRange.End; 8934 } 8935 } 8936 8937 // Add the necessary canonical IV and branch recipes required to control the 8938 // loop. 8939 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, 8940 DebugLoc DL) { 8941 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8942 auto *StartV = Plan.getOrAddLiveIn(StartIdx); 8943 8944 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8945 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8946 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8947 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8948 Header->insert(CanonicalIVPHI, Header->begin()); 8949 8950 VPBuilder Builder(TopRegion->getExitingBasicBlock()); 8951 // Add a VPInstruction to increment the scalar canonical IV by VF * UF. 8952 auto *CanonicalIVIncrement = Builder.createOverflowingOp( 8953 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, 8954 "index.next"); 8955 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8956 8957 // Add the BranchOnCount VPInstruction to the latch. 8958 Builder.createNaryOp(VPInstruction::BranchOnCount, 8959 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8960 } 8961 8962 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the 8963 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute 8964 /// the end value of the induction. 8965 static VPInstruction *addResumePhiRecipeForInduction( 8966 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, 8967 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) { 8968 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV); 8969 // Truncated wide inductions resume from the last lane of their vector value 8970 // in the last vector iteration which is handled elsewhere. 8971 if (WideIntOrFp && WideIntOrFp->getTruncInst()) 8972 return nullptr; 8973 8974 VPValue *Start = WideIV->getStartValue(); 8975 VPValue *Step = WideIV->getStepValue(); 8976 const InductionDescriptor &ID = WideIV->getInductionDescriptor(); 8977 VPValue *EndValue = VectorTC; 8978 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) { 8979 EndValue = VectorPHBuilder.createDerivedIV( 8980 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), 8981 Start, VectorTC, Step); 8982 } 8983 8984 // EndValue is derived from the vector trip count (which has the same type as 8985 // the widest induction) and thus may be wider than the induction here. 8986 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV); 8987 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) { 8988 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue, 8989 ScalarTypeOfWideIV, 8990 WideIV->getDebugLoc()); 8991 } 8992 8993 auto *ResumePhiRecipe = 8994 ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start}, 8995 WideIV->getDebugLoc(), "bc.resume.val"); 8996 return ResumePhiRecipe; 8997 } 8998 8999 /// Create resume phis in the scalar preheader for first-order recurrences, 9000 /// reductions and inductions, and update the VPIRInstructions wrapping the 9001 /// original phis in the scalar header. End values for inductions are added to 9002 /// \p IVEndValues. 9003 static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, 9004 DenseMap<VPValue *, VPValue *> &IVEndValues) { 9005 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); 9006 auto *ScalarPH = Plan.getScalarPreheader(); 9007 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor()); 9008 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); 9009 VPBuilder VectorPHBuilder( 9010 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor())); 9011 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9012 VPBuilder ScalarPHBuilder(ScalarPH); 9013 VPValue *OneVPV = Plan.getOrAddLiveIn( 9014 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); 9015 for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) { 9016 auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR); 9017 auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction()); 9018 if (!ScalarPhiI) 9019 break; 9020 9021 // TODO: Extract final value from induction recipe initially, optimize to 9022 // pre-computed end value together in optimizeInductionExitUsers. 9023 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI)); 9024 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) { 9025 if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction( 9026 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, 9027 &Plan.getVectorTripCount())) { 9028 assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi && 9029 "Expected a ResumePhi"); 9030 IVEndValues[WideIVR] = ResumePhi->getOperand(0); 9031 ScalarPhiIRI->addOperand(ResumePhi); 9032 continue; 9033 } 9034 // TODO: Also handle truncated inductions here. Computing end-values 9035 // separately should be done as VPlan-to-VPlan optimization, after 9036 // legalizing all resume values to use the last lane from the loop. 9037 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() && 9038 "should only skip truncated wide inductions"); 9039 continue; 9040 } 9041 9042 // The backedge value provides the value to resume coming out of a loop, 9043 // which for FORs is a vector whose last element needs to be extracted. The 9044 // start value provides the value if the loop is bypassed. 9045 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR); 9046 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue(); 9047 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && 9048 "Cannot handle loops with uncountable early exits"); 9049 if (IsFOR) 9050 ResumeFromVectorLoop = MiddleBuilder.createNaryOp( 9051 VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {}, 9052 "vector.recur.extract"); 9053 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx"; 9054 auto *ResumePhiR = ScalarPHBuilder.createNaryOp( 9055 VPInstruction::ResumePhi, 9056 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name); 9057 ScalarPhiIRI->addOperand(ResumePhiR); 9058 } 9059 } 9060 9061 // Collect VPIRInstructions for phis in the exit blocks that are modeled 9062 // in VPlan and add the exiting VPValue as operand. 9063 static SetVector<VPIRInstruction *> 9064 collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, 9065 VPlan &Plan) { 9066 SetVector<VPIRInstruction *> ExitUsersToFix; 9067 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { 9068 for (VPRecipeBase &R : *ExitVPBB) { 9069 auto *ExitIRI = dyn_cast<VPIRInstruction>(&R); 9070 if (!ExitIRI) 9071 continue; 9072 auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction()); 9073 if (!ExitPhi) 9074 break; 9075 if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock()) { 9076 assert(ExitIRI->getNumOperands() == 9077 ExitVPBB->getPredecessors().size() && 9078 "early-exit must update exit values on construction"); 9079 continue; 9080 } 9081 BasicBlock *ExitingBB = OrigLoop->getLoopLatch(); 9082 Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); 9083 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); 9084 ExitIRI->addOperand(V); 9085 if (V->isLiveIn()) 9086 continue; 9087 assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() && 9088 "Only recipes defined inside a region should need fixing."); 9089 ExitUsersToFix.insert(ExitIRI); 9090 } 9091 } 9092 return ExitUsersToFix; 9093 } 9094 9095 // Add exit values to \p Plan. Extracts are added for each entry in \p 9096 // ExitUsersToFix if needed and their operands are updated. 9097 static void 9098 addUsersInExitBlocks(VPlan &Plan, 9099 const SetVector<VPIRInstruction *> &ExitUsersToFix) { 9100 if (ExitUsersToFix.empty()) 9101 return; 9102 9103 auto *MiddleVPBB = Plan.getMiddleBlock(); 9104 VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9105 9106 // Introduce extract for exiting values and update the VPIRInstructions 9107 // modeling the corresponding LCSSA phis. 9108 for (VPIRInstruction *ExitIRI : ExitUsersToFix) { 9109 assert(ExitIRI->getNumOperands() == 1 && 9110 ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB && 9111 "exit values from early exits must be fixed when branch to " 9112 "early-exit is added"); 9113 ExitIRI->extractLastLaneOfOperand(B); 9114 } 9115 } 9116 9117 /// Handle users in the exit block for first order reductions in the original 9118 /// exit block. The penultimate value of recurrences is fed to their LCSSA phi 9119 /// users in the original exit block using the VPIRInstruction wrapping to the 9120 /// LCSSA phi. 9121 static void addExitUsersForFirstOrderRecurrences( 9122 VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) { 9123 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); 9124 auto *ScalarPHVPBB = Plan.getScalarPreheader(); 9125 auto *MiddleVPBB = Plan.getMiddleBlock(); 9126 VPBuilder ScalarPHBuilder(ScalarPHVPBB); 9127 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); 9128 VPValue *TwoVPV = Plan.getOrAddLiveIn( 9129 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2)); 9130 9131 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) { 9132 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi); 9133 if (!FOR) 9134 continue; 9135 9136 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && 9137 "Cannot handle loops with uncountable early exits"); 9138 9139 // This is the second phase of vectorizing first-order recurrences, creating 9140 // extract for users outside the loop. An overview of the transformation is 9141 // described below. Suppose we have the following loop with some use after 9142 // the loop of the last a[i-1], 9143 // 9144 // for (int i = 0; i < n; ++i) { 9145 // t = a[i - 1]; 9146 // b[i] = a[i] - t; 9147 // } 9148 // use t; 9149 // 9150 // There is a first-order recurrence on "a". For this loop, the shorthand 9151 // scalar IR looks like: 9152 // 9153 // scalar.ph: 9154 // s.init = a[-1] 9155 // br scalar.body 9156 // 9157 // scalar.body: 9158 // i = phi [0, scalar.ph], [i+1, scalar.body] 9159 // s1 = phi [s.init, scalar.ph], [s2, scalar.body] 9160 // s2 = a[i] 9161 // b[i] = s2 - s1 9162 // br cond, scalar.body, exit.block 9163 // 9164 // exit.block: 9165 // use = lcssa.phi [s1, scalar.body] 9166 // 9167 // In this example, s1 is a recurrence because it's value depends on the 9168 // previous iteration. In the first phase of vectorization, we created a 9169 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts 9170 // for users in the scalar preheader and exit block. 9171 // 9172 // vector.ph: 9173 // v_init = vector(..., ..., ..., a[-1]) 9174 // br vector.body 9175 // 9176 // vector.body 9177 // i = phi [0, vector.ph], [i+4, vector.body] 9178 // v1 = phi [v_init, vector.ph], [v2, vector.body] 9179 // v2 = a[i, i+1, i+2, i+3] 9180 // b[i] = v2 - v1 9181 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) 9182 // b[i, i+1, i+2, i+3] = v2 - v1 9183 // br cond, vector.body, middle.block 9184 // 9185 // middle.block: 9186 // vector.recur.extract.for.phi = v2(2) 9187 // vector.recur.extract = v2(3) 9188 // br cond, scalar.ph, exit.block 9189 // 9190 // scalar.ph: 9191 // scalar.recur.init = phi [vector.recur.extract, middle.block], 9192 // [s.init, otherwise] 9193 // br scalar.body 9194 // 9195 // scalar.body: 9196 // i = phi [0, scalar.ph], [i+1, scalar.body] 9197 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body] 9198 // s2 = a[i] 9199 // b[i] = s2 - s1 9200 // br cond, scalar.body, exit.block 9201 // 9202 // exit.block: 9203 // lo = lcssa.phi [s1, scalar.body], 9204 // [vector.recur.extract.for.phi, middle.block] 9205 // 9206 // Now update VPIRInstructions modeling LCSSA phis in the exit block. 9207 // Extract the penultimate value of the recurrence and use it as operand for 9208 // the VPIRInstruction modeling the phi. 9209 for (VPIRInstruction *ExitIRI : ExitUsersToFix) { 9210 if (ExitIRI->getOperand(0) != FOR) 9211 continue; 9212 VPValue *PenultimateElement = MiddleBuilder.createNaryOp( 9213 VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, 9214 "vector.recur.extract.for.phi"); 9215 ExitIRI->setOperand(0, PenultimateElement); 9216 ExitUsersToFix.remove(ExitIRI); 9217 } 9218 } 9219 } 9220 9221 VPlanPtr 9222 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { 9223 9224 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9225 9226 // --------------------------------------------------------------------------- 9227 // Build initial VPlan: Scan the body of the loop in a topological order to 9228 // visit each basic block after having visited its predecessor basic blocks. 9229 // --------------------------------------------------------------------------- 9230 9231 // Create initial VPlan skeleton, having a basic block for the pre-header 9232 // which contains SCEV expansions that need to happen before the CFG is 9233 // modified; a basic block for the vector pre-header, followed by a region for 9234 // the vector loop, followed by the middle basic block. The skeleton vector 9235 // loop region contains a header and latch basic blocks. 9236 9237 bool RequiresScalarEpilogueCheck = 9238 LoopVectorizationPlanner::getDecisionAndClampRange( 9239 [this](ElementCount VF) { 9240 return !CM.requiresScalarEpilogue(VF.isVector()); 9241 }, 9242 Range); 9243 VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), 9244 PSE, RequiresScalarEpilogueCheck, 9245 CM.foldTailByMasking(), OrigLoop); 9246 9247 // Don't use getDecisionAndClampRange here, because we don't know the UF 9248 // so this function is better to be conservative, rather than to split 9249 // it up into different VPlans. 9250 // TODO: Consider using getDecisionAndClampRange here to split up VPlans. 9251 bool IVUpdateMayOverflow = false; 9252 for (ElementCount VF : Range) 9253 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); 9254 9255 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 9256 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); 9257 // Use NUW for the induction increment if we proved that it won't overflow in 9258 // the vector loop or when not folding the tail. In the later case, we know 9259 // that the canonical induction increment will not overflow as the vector trip 9260 // count is >= increment and a multiple of the increment. 9261 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; 9262 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); 9263 9264 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, 9265 Builder); 9266 9267 // --------------------------------------------------------------------------- 9268 // Pre-construction: record ingredients whose recipes we'll need to further 9269 // process after constructing the initial VPlan. 9270 // --------------------------------------------------------------------------- 9271 9272 // For each interleave group which is relevant for this (possibly trimmed) 9273 // Range, add it to the set of groups to be later applied to the VPlan and add 9274 // placeholders for its members' Recipes which we'll be replacing with a 9275 // single VPInterleaveRecipe. 9276 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9277 auto ApplyIG = [IG, this](ElementCount VF) -> bool { 9278 bool Result = (VF.isVector() && // Query is illegal for VF == 1 9279 CM.getWideningDecision(IG->getInsertPos(), VF) == 9280 LoopVectorizationCostModel::CM_Interleave); 9281 // For scalable vectors, the only interleave factor currently supported 9282 // is 2 since we require the (de)interleave2 intrinsics instead of 9283 // shufflevectors. 9284 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && 9285 "Unsupported interleave factor for scalable vectors"); 9286 return Result; 9287 }; 9288 if (!getDecisionAndClampRange(ApplyIG, Range)) 9289 continue; 9290 InterleaveGroups.insert(IG); 9291 } 9292 9293 // --------------------------------------------------------------------------- 9294 // Construct recipes for the instructions in the loop 9295 // --------------------------------------------------------------------------- 9296 9297 // Scan the body of the loop in a topological order to visit each basic block 9298 // after having visited its predecessor basic blocks. 9299 LoopBlocksDFS DFS(OrigLoop); 9300 DFS.perform(LI); 9301 9302 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); 9303 VPBasicBlock *VPBB = HeaderVPBB; 9304 BasicBlock *HeaderBB = OrigLoop->getHeader(); 9305 bool NeedsMasks = 9306 CM.foldTailByMasking() || 9307 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) { 9308 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); 9309 return Legal->blockNeedsPredication(BB) || NeedsBlends; 9310 }); 9311 9312 RecipeBuilder.collectScaledReductions(Range); 9313 9314 auto *MiddleVPBB = Plan->getMiddleBlock(); 9315 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); 9316 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9317 // Relevant instructions from basic block BB will be grouped into VPRecipe 9318 // ingredients and fill a new VPBasicBlock. 9319 if (VPBB != HeaderVPBB) 9320 VPBB->setName(BB->getName()); 9321 Builder.setInsertPoint(VPBB); 9322 9323 if (VPBB == HeaderVPBB) 9324 RecipeBuilder.createHeaderMask(); 9325 else if (NeedsMasks) 9326 RecipeBuilder.createBlockInMask(BB); 9327 9328 // Introduce each ingredient into VPlan. 9329 // TODO: Model and preserve debug intrinsics in VPlan. 9330 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { 9331 Instruction *Instr = &I; 9332 SmallVector<VPValue *, 4> Operands; 9333 auto *Phi = dyn_cast<PHINode>(Instr); 9334 if (Phi && Phi->getParent() == HeaderBB) { 9335 Operands.push_back(Plan->getOrAddLiveIn( 9336 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9337 } else { 9338 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands()); 9339 Operands = {OpRange.begin(), OpRange.end()}; 9340 } 9341 9342 // The stores with invariant address inside the loop will be deleted, and 9343 // in the exit block, a uniform store recipe will be created for the final 9344 // invariant store of the reduction. 9345 StoreInst *SI; 9346 if ((SI = dyn_cast<StoreInst>(&I)) && 9347 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { 9348 // Only create recipe for the final invariant store of the reduction. 9349 if (!Legal->isInvariantStoreOfReduction(SI)) 9350 continue; 9351 auto *Recipe = new VPReplicateRecipe( 9352 SI, RecipeBuilder.mapToVPValues(Instr->operands()), 9353 true /* IsUniform */); 9354 Recipe->insertBefore(*MiddleVPBB, MBIP); 9355 continue; 9356 } 9357 9358 VPRecipeBase *Recipe = 9359 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB); 9360 if (!Recipe) 9361 Recipe = RecipeBuilder.handleReplication(Instr, Range); 9362 9363 RecipeBuilder.setRecipe(Instr, Recipe); 9364 if (isa<VPHeaderPHIRecipe>(Recipe)) { 9365 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In 9366 // the following cases, VPHeaderPHIRecipes may be created after non-phi 9367 // recipes and need to be moved to the phi section of HeaderVPBB: 9368 // * tail-folding (non-phi recipes computing the header mask are 9369 // introduced earlier than regular header phi recipes, and should appear 9370 // after them) 9371 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 9372 9373 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || 9374 CM.foldTailByMasking() || isa<TruncInst>(Instr)) && 9375 "unexpected recipe needs moving"); 9376 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9377 } else 9378 VPBB->appendRecipe(Recipe); 9379 } 9380 9381 VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB); 9382 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9383 } 9384 9385 // After here, VPBB should not be used. 9386 VPBB = nullptr; 9387 9388 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 9389 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 9390 "entry block must be set to a VPRegionBlock having a non-empty entry " 9391 "VPBasicBlock"); 9392 RecipeBuilder.fixHeaderPhis(); 9393 9394 // Update wide induction increments to use the same step as the corresponding 9395 // wide induction. This enables detecting induction increments directly in 9396 // VPlan and removes redundant splats. 9397 for (const auto &[Phi, ID] : Legal->getInductionVars()) { 9398 auto *IVInc = cast<Instruction>( 9399 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 9400 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add) 9401 continue; 9402 VPWidenInductionRecipe *WideIV = 9403 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi)); 9404 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc); 9405 R->setOperand(1, WideIV->getStepValue()); 9406 } 9407 9408 if (auto *UncountableExitingBlock = 9409 Legal->getUncountableEarlyExitingBlock()) { 9410 if (!VPlanTransforms::handleUncountableEarlyExit( 9411 *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, 9412 RecipeBuilder)) { 9413 reportVectorizationFailure( 9414 "Some exit values in loop with uncountable exit not supported yet", 9415 "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop); 9416 return nullptr; 9417 } 9418 } 9419 DenseMap<VPValue *, VPValue *> IVEndValues; 9420 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); 9421 SetVector<VPIRInstruction *> ExitUsersToFix = 9422 collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan); 9423 addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); 9424 addUsersInExitBlocks(*Plan, ExitUsersToFix); 9425 9426 // --------------------------------------------------------------------------- 9427 // Transform initial VPlan: Apply previously taken decisions, in order, to 9428 // bring the VPlan to its final state. 9429 // --------------------------------------------------------------------------- 9430 9431 // Adjust the recipes for any inloop reductions. 9432 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); 9433 9434 // Interleave memory: for each Interleave Group we marked earlier as relevant 9435 // for this VPlan, replace the Recipes widening its memory instructions with a 9436 // single VPInterleaveRecipe at its insertion point. 9437 VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan, 9438 InterleaveGroups, RecipeBuilder, 9439 CM.isScalarEpilogueAllowed()); 9440 9441 for (ElementCount VF : Range) 9442 Plan->addVF(VF); 9443 Plan->setName("Initial VPlan"); 9444 9445 // Replace VPValues for known constant strides guaranteed by predicate scalar 9446 // evolution. 9447 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { 9448 auto *R = cast<VPRecipeBase>(&U); 9449 return R->getParent()->getParent() || 9450 R->getParent() == 9451 Plan->getVectorLoopRegion()->getSinglePredecessor(); 9452 }; 9453 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 9454 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 9455 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 9456 // Only handle constant strides for now. 9457 if (!ScevStride) 9458 continue; 9459 9460 auto *CI = Plan->getOrAddLiveIn( 9461 ConstantInt::get(Stride->getType(), ScevStride->getAPInt())); 9462 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV)) 9463 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); 9464 9465 // The versioned value may not be used in the loop directly but through a 9466 // sext/zext. Add new live-ins in those cases. 9467 for (Value *U : StrideV->users()) { 9468 if (!isa<SExtInst, ZExtInst>(U)) 9469 continue; 9470 VPValue *StrideVPV = Plan->getLiveIn(U); 9471 if (!StrideVPV) 9472 continue; 9473 unsigned BW = U->getType()->getScalarSizeInBits(); 9474 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW) 9475 : ScevStride->getAPInt().zext(BW); 9476 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C)); 9477 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); 9478 } 9479 } 9480 9481 auto BlockNeedsPredication = [this](BasicBlock *BB) { 9482 return Legal->blockNeedsPredication(BB); 9483 }; 9484 VPlanTransforms::runPass(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan, 9485 BlockNeedsPredication); 9486 9487 // Sink users of fixed-order recurrence past the recipe defining the previous 9488 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 9489 if (!VPlanTransforms::runPass(VPlanTransforms::adjustFixedOrderRecurrences, 9490 *Plan, Builder)) 9491 return nullptr; 9492 9493 if (useActiveLaneMask(Style)) { 9494 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once 9495 // TailFoldingStyle is visible there. 9496 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); 9497 bool WithoutRuntimeCheck = 9498 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 9499 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, 9500 WithoutRuntimeCheck); 9501 } 9502 VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues); 9503 9504 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 9505 return Plan; 9506 } 9507 9508 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9509 // Outer loop handling: They may require CFG and instruction level 9510 // transformations before even evaluating whether vectorization is profitable. 9511 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9512 // the vectorization pipeline. 9513 assert(!OrigLoop->isInnermost()); 9514 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9515 9516 // Create new empty VPlan 9517 auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE, 9518 true, false, OrigLoop); 9519 9520 // Build hierarchical CFG 9521 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9522 HCFGBuilder.buildHierarchicalCFG(); 9523 9524 for (ElementCount VF : Range) 9525 Plan->addVF(VF); 9526 9527 VPlanTransforms::VPInstructionsToVPRecipes( 9528 Plan, 9529 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9530 *PSE.getSE(), *TLI); 9531 9532 // Tail folding is not supported for outer loops, so the induction increment 9533 // is guaranteed to not wrap. 9534 bool HasNUW = true; 9535 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, 9536 DebugLoc()); 9537 9538 // Collect mapping of IR header phis to header phi recipes, to be used in 9539 // addScalarResumePhis. 9540 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, 9541 Builder); 9542 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9543 if (isa<VPCanonicalIVPHIRecipe>(&R)) 9544 continue; 9545 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R); 9546 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR); 9547 } 9548 DenseMap<VPValue *, VPValue *> IVEndValues; 9549 // TODO: IVEndValues are not used yet in the native path, to optimize exit 9550 // values. 9551 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); 9552 9553 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 9554 return Plan; 9555 } 9556 9557 // Adjust the recipes for reductions. For in-loop reductions the chain of 9558 // instructions leading from the loop exit instr to the phi need to be converted 9559 // to reductions, with one operand being vector and the other being the scalar 9560 // reduction chain. For other reductions, a select is introduced between the phi 9561 // and users outside the vector region when folding the tail. 9562 // 9563 // A ComputeReductionResult recipe is added to the middle block, also for 9564 // in-loop reductions which compute their result in-loop, because generating 9565 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 9566 // 9567 // Adjust AnyOf reductions; replace the reduction phi for the selected value 9568 // with a boolean reduction phi node to check if the condition is true in any 9569 // iteration. The final value is selected by the final ComputeReductionResult. 9570 void LoopVectorizationPlanner::adjustRecipesForReductions( 9571 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { 9572 using namespace VPlanPatternMatch; 9573 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); 9574 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); 9575 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock(); 9576 SmallVector<VPRecipeBase *> ToDelete; 9577 9578 for (VPRecipeBase &R : Header->phis()) { 9579 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9580 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) 9581 continue; 9582 9583 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9584 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9585 assert( 9586 !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 9587 !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) && 9588 "AnyOf and FindLast reductions are not allowed for in-loop reductions"); 9589 9590 // Collect the chain of "link" recipes for the reduction starting at PhiR. 9591 SetVector<VPSingleDefRecipe *> Worklist; 9592 Worklist.insert(PhiR); 9593 for (unsigned I = 0; I != Worklist.size(); ++I) { 9594 VPSingleDefRecipe *Cur = Worklist[I]; 9595 for (VPUser *U : Cur->users()) { 9596 auto *UserRecipe = cast<VPSingleDefRecipe>(U); 9597 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { 9598 assert((UserRecipe->getParent() == MiddleVPBB || 9599 UserRecipe->getParent() == Plan->getScalarPreheader()) && 9600 "U must be either in the loop region, the middle block or the " 9601 "scalar preheader."); 9602 continue; 9603 } 9604 Worklist.insert(UserRecipe); 9605 } 9606 } 9607 9608 // Visit operation "Links" along the reduction chain top-down starting from 9609 // the phi until LoopExitValue. We keep track of the previous item 9610 // (PreviousLink) to tell which of the two operands of a Link will remain 9611 // scalar and which will be reduced. For minmax by select(cmp), Link will be 9612 // the select instructions. Blend recipes of in-loop reduction phi's will 9613 // get folded to their non-phi operand, as the reduction recipe handles the 9614 // condition directly. 9615 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. 9616 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { 9617 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); 9618 9619 // Index of the first operand which holds a non-mask vector operand. 9620 unsigned IndexOfFirstOperand; 9621 // Recognize a call to the llvm.fmuladd intrinsic. 9622 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9623 VPValue *VecOp; 9624 VPBasicBlock *LinkVPBB = CurrentLink->getParent(); 9625 if (IsFMulAdd) { 9626 assert( 9627 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && 9628 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9629 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || 9630 isa<VPWidenIntrinsicRecipe>(CurrentLink)) && 9631 CurrentLink->getOperand(2) == PreviousLink && 9632 "expected a call where the previous link is the added operand"); 9633 9634 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9635 // need to create an fmul recipe (multiplying the first two operands of 9636 // the fmuladd together) to use as the vector operand for the fadd 9637 // reduction. 9638 VPInstruction *FMulRecipe = new VPInstruction( 9639 Instruction::FMul, 9640 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, 9641 CurrentLinkI->getFastMathFlags()); 9642 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); 9643 VecOp = FMulRecipe; 9644 } else { 9645 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink); 9646 if (PhiR->isInLoop() && Blend) { 9647 assert(Blend->getNumIncomingValues() == 2 && 9648 "Blend must have 2 incoming values"); 9649 if (Blend->getIncomingValue(0) == PhiR) 9650 Blend->replaceAllUsesWith(Blend->getIncomingValue(1)); 9651 else { 9652 assert(Blend->getIncomingValue(1) == PhiR && 9653 "PhiR must be an operand of the blend"); 9654 Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); 9655 } 9656 continue; 9657 } 9658 9659 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9660 if (isa<VPWidenRecipe>(CurrentLink)) { 9661 assert(isa<CmpInst>(CurrentLinkI) && 9662 "need to have the compare of the select"); 9663 continue; 9664 } 9665 assert(isa<VPWidenSelectRecipe>(CurrentLink) && 9666 "must be a select recipe"); 9667 IndexOfFirstOperand = 1; 9668 } else { 9669 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && 9670 "Expected to replace a VPWidenSC"); 9671 IndexOfFirstOperand = 0; 9672 } 9673 // Note that for non-commutable operands (cmp-selects), the semantics of 9674 // the cmp-select are captured in the recurrence kind. 9675 unsigned VecOpId = 9676 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink 9677 ? IndexOfFirstOperand + 1 9678 : IndexOfFirstOperand; 9679 VecOp = CurrentLink->getOperand(VecOpId); 9680 assert(VecOp != PreviousLink && 9681 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - 9682 (VecOpId - IndexOfFirstOperand)) == 9683 PreviousLink && 9684 "PreviousLink must be the operand other than VecOp"); 9685 } 9686 9687 BasicBlock *BB = CurrentLinkI->getParent(); 9688 VPValue *CondOp = nullptr; 9689 if (CM.blockNeedsPredicationForAnyReason(BB)) 9690 CondOp = RecipeBuilder.getBlockInMask(BB); 9691 9692 auto *RedRecipe = new VPReductionRecipe( 9693 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp, 9694 CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc()); 9695 // Append the recipe to the end of the VPBasicBlock because we need to 9696 // ensure that it comes after all of it's inputs, including CondOp. 9697 // Delete CurrentLink as it will be invalid if its operand is replaced 9698 // with a reduction defined at the bottom of the block in the next link. 9699 LinkVPBB->appendRecipe(RedRecipe); 9700 CurrentLink->replaceAllUsesWith(RedRecipe); 9701 ToDelete.push_back(CurrentLink); 9702 PreviousLink = RedRecipe; 9703 } 9704 } 9705 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); 9706 Builder.setInsertPoint(&*LatchVPBB->begin()); 9707 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); 9708 for (VPRecipeBase &R : 9709 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9710 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9711 if (!PhiR) 9712 continue; 9713 9714 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9715 // If tail is folded by masking, introduce selects between the phi 9716 // and the users outside the vector region of each reduction, at the 9717 // beginning of the dedicated latch block. 9718 auto *OrigExitingVPV = PhiR->getBackedgeValue(); 9719 auto *NewExitingVPV = PhiR->getBackedgeValue(); 9720 if (!PhiR->isInLoop() && CM.foldTailByMasking()) { 9721 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader()); 9722 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && 9723 "reduction recipe must be defined before latch"); 9724 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); 9725 std::optional<FastMathFlags> FMFs = 9726 PhiTy->isFloatingPointTy() 9727 ? std::make_optional(RdxDesc.getFastMathFlags()) 9728 : std::nullopt; 9729 NewExitingVPV = 9730 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); 9731 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { 9732 return isa<VPInstruction>(&U) && 9733 cast<VPInstruction>(&U)->getOpcode() == 9734 VPInstruction::ComputeReductionResult; 9735 }); 9736 if (CM.usePredicatedReductionSelect( 9737 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy)) 9738 PhiR->setOperand(1, NewExitingVPV); 9739 } 9740 9741 // If the vector reduction can be performed in a smaller type, we truncate 9742 // then extend the loop exit value to enable InstCombine to evaluate the 9743 // entire expression in the smaller type. 9744 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); 9745 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() && 9746 !RecurrenceDescriptor::isAnyOfRecurrenceKind( 9747 RdxDesc.getRecurrenceKind())) { 9748 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 9749 Type *RdxTy = RdxDesc.getRecurrenceType(); 9750 auto *Trunc = 9751 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); 9752 auto *Extnd = 9753 RdxDesc.isSigned() 9754 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) 9755 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); 9756 9757 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); 9758 Extnd->insertAfter(Trunc); 9759 if (PhiR->getOperand(1) == NewExitingVPV) 9760 PhiR->setOperand(1, Extnd->getVPSingleValue()); 9761 NewExitingVPV = Extnd; 9762 } 9763 9764 // We want code in the middle block to appear to execute on the location of 9765 // the scalar loop's latch terminator because: (a) it is all compiler 9766 // generated, (b) these instructions are always executed after evaluating 9767 // the latch conditional branch, and (c) other passes may add new 9768 // predecessors which terminate on this line. This is the easiest way to 9769 // ensure we don't accidentally cause an extra step back into the loop while 9770 // debugging. 9771 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); 9772 9773 // TODO: At the moment ComputeReductionResult also drives creation of the 9774 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here 9775 // even for in-loop reductions, until the reduction resume value handling is 9776 // also modeled in VPlan. 9777 auto *FinalReductionResult = new VPInstruction( 9778 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); 9779 // Update all users outside the vector region. 9780 OrigExitingVPV->replaceUsesWithIf( 9781 FinalReductionResult, [](VPUser &User, unsigned) { 9782 auto *Parent = cast<VPRecipeBase>(&User)->getParent(); 9783 return Parent && !Parent->getParent(); 9784 }); 9785 FinalReductionResult->insertBefore(*MiddleVPBB, IP); 9786 9787 // Adjust AnyOf reductions; replace the reduction phi for the selected value 9788 // with a boolean reduction phi node to check if the condition is true in 9789 // any iteration. The final value is selected by the final 9790 // ComputeReductionResult. 9791 if (RecurrenceDescriptor::isAnyOfRecurrenceKind( 9792 RdxDesc.getRecurrenceKind())) { 9793 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) { 9794 return isa<VPWidenSelectRecipe>(U) || 9795 (isa<VPReplicateRecipe>(U) && 9796 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() == 9797 Instruction::Select); 9798 })); 9799 VPValue *Cmp = Select->getOperand(0); 9800 // If the compare is checking the reduction PHI node, adjust it to check 9801 // the start value. 9802 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) { 9803 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I) 9804 if (CmpR->getOperand(I) == PhiR) 9805 CmpR->setOperand(I, PhiR->getStartValue()); 9806 } 9807 VPBuilder::InsertPointGuard Guard(Builder); 9808 Builder.setInsertPoint(Select); 9809 9810 // If the true value of the select is the reduction phi, the new value is 9811 // selected if the negated condition is true in any iteration. 9812 if (Select->getOperand(1) == PhiR) 9813 Cmp = Builder.createNot(Cmp); 9814 VPValue *Or = Builder.createOr(PhiR, Cmp); 9815 Select->getVPSingleValue()->replaceAllUsesWith(Or); 9816 // Delete Select now that it has invalid types. 9817 ToDelete.push_back(Select); 9818 9819 // Convert the reduction phi to operate on bools. 9820 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( 9821 OrigLoop->getHeader()->getContext()))); 9822 continue; 9823 } 9824 9825 if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( 9826 RdxDesc.getRecurrenceKind())) { 9827 // Adjust the start value for FindLastIV recurrences to use the sentinel 9828 // value after generating the ResumePhi recipe, which uses the original 9829 // start value. 9830 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue())); 9831 } 9832 } 9833 for (VPRecipeBase *R : ToDelete) 9834 R->eraseFromParent(); 9835 9836 VPlanTransforms::runPass(VPlanTransforms::clearReductionWrapFlags, *Plan); 9837 } 9838 9839 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9840 assert(!State.Lane && "VPDerivedIVRecipe being replicated."); 9841 9842 // Fast-math-flags propagate from the original induction instruction. 9843 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9844 if (FPBinOp) 9845 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); 9846 9847 Value *Step = State.get(getStepValue(), VPLane(0)); 9848 Value *Index = State.get(getOperand(1), VPLane(0)); 9849 Value *DerivedIV = emitTransformedIndex( 9850 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind, 9851 cast_if_present<BinaryOperator>(FPBinOp)); 9852 DerivedIV->setName(Name); 9853 // If index is the vector trip count, the concrete value will only be set in 9854 // prepareToExecute, leading to missed simplifications, e.g. if it is 0. 9855 // TODO: Remove the special case for the vector trip count once it is computed 9856 // in VPlan and can be used during VPlan simplification. 9857 assert((DerivedIV != Index || 9858 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && 9859 "IV didn't need transforming?"); 9860 State.set(this, DerivedIV, VPLane(0)); 9861 } 9862 9863 void VPReplicateRecipe::execute(VPTransformState &State) { 9864 Instruction *UI = getUnderlyingInstr(); 9865 if (State.Lane) { // Generate a single instance. 9866 assert((State.VF.isScalar() || !isUniform()) && 9867 "uniform recipe shouldn't be predicated"); 9868 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9869 State.ILV->scalarizeInstruction(UI, this, *State.Lane, State); 9870 // Insert scalar instance packing it into a vector. 9871 if (State.VF.isVector() && shouldPack()) { 9872 // If we're constructing lane 0, initialize to start from poison. 9873 if (State.Lane->isFirstLane()) { 9874 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9875 Value *Poison = PoisonValue::get( 9876 VectorType::get(UI->getType(), State.VF)); 9877 State.set(this, Poison); 9878 } 9879 State.packScalarIntoVectorValue(this, *State.Lane); 9880 } 9881 return; 9882 } 9883 9884 if (IsUniform) { 9885 // Uniform within VL means we need to generate lane 0. 9886 State.ILV->scalarizeInstruction(UI, this, VPLane(0), State); 9887 return; 9888 } 9889 9890 // A store of a loop varying value to a uniform address only needs the last 9891 // copy of the store. 9892 if (isa<StoreInst>(UI) && 9893 vputils::isUniformAfterVectorization(getOperand(1))) { 9894 auto Lane = VPLane::getLastLaneForVF(State.VF); 9895 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State); 9896 return; 9897 } 9898 9899 // Generate scalar instances for all VF lanes. 9900 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9901 const unsigned EndLane = State.VF.getKnownMinValue(); 9902 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9903 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State); 9904 } 9905 9906 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9907 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9908 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9909 // for predication. 9910 static ScalarEpilogueLowering getScalarEpilogueLowering( 9911 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9912 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9913 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9914 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9915 // don't look at hints or options, and don't request a scalar epilogue. 9916 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9917 // LoopAccessInfo (due to code dependency and not being able to reliably get 9918 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9919 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9920 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9921 // back to the old way and vectorize with versioning when forced. See D81345.) 9922 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9923 PGSOQueryType::IRPass) && 9924 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9925 return CM_ScalarEpilogueNotAllowedOptSize; 9926 9927 // 2) If set, obey the directives 9928 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9929 switch (PreferPredicateOverEpilogue) { 9930 case PreferPredicateTy::ScalarEpilogue: 9931 return CM_ScalarEpilogueAllowed; 9932 case PreferPredicateTy::PredicateElseScalarEpilogue: 9933 return CM_ScalarEpilogueNotNeededUsePredicate; 9934 case PreferPredicateTy::PredicateOrDontVectorize: 9935 return CM_ScalarEpilogueNotAllowedUsePredicate; 9936 }; 9937 } 9938 9939 // 3) If set, obey the hints 9940 switch (Hints.getPredicate()) { 9941 case LoopVectorizeHints::FK_Enabled: 9942 return CM_ScalarEpilogueNotNeededUsePredicate; 9943 case LoopVectorizeHints::FK_Disabled: 9944 return CM_ScalarEpilogueAllowed; 9945 }; 9946 9947 // 4) if the TTI hook indicates this is profitable, request predication. 9948 TailFoldingInfo TFI(TLI, &LVL, IAI); 9949 if (TTI->preferPredicateOverEpilogue(&TFI)) 9950 return CM_ScalarEpilogueNotNeededUsePredicate; 9951 9952 return CM_ScalarEpilogueAllowed; 9953 } 9954 9955 // Process the loop in the VPlan-native vectorization path. This path builds 9956 // VPlan upfront in the vectorization pipeline, which allows to apply 9957 // VPlan-to-VPlan transformations from the very beginning without modifying the 9958 // input LLVM IR. 9959 static bool processLoopInVPlanNativePath( 9960 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9961 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9962 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9963 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9964 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9965 LoopVectorizationRequirements &Requirements) { 9966 9967 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9968 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9969 return false; 9970 } 9971 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9972 Function *F = L->getHeader()->getParent(); 9973 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9974 9975 ScalarEpilogueLowering SEL = 9976 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 9977 9978 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9979 &Hints, IAI); 9980 // Use the planner for outer loop vectorization. 9981 // TODO: CM is not used at this point inside the planner. Turn CM into an 9982 // optional argument if we don't need it in the future. 9983 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, 9984 ORE); 9985 9986 // Get user vectorization factor. 9987 ElementCount UserVF = Hints.getWidth(); 9988 9989 CM.collectElementTypesForWidening(); 9990 9991 // Plan how to best vectorize, return the best VF and its cost. 9992 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9993 9994 // If we are stress testing VPlan builds, do not attempt to generate vector 9995 // code. Masked vector code generation support will follow soon. 9996 // Also, do not attempt to vectorize if no vector code will be produced. 9997 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 9998 return false; 9999 10000 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 10001 10002 { 10003 bool AddBranchWeights = 10004 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 10005 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), 10006 AddBranchWeights, CM.CostKind); 10007 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10008 VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan); 10009 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10010 << L->getHeader()->getParent()->getName() << "\"\n"); 10011 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 10012 } 10013 10014 reportVectorization(ORE, L, VF, 1); 10015 10016 // Mark the loop as already vectorized to avoid vectorizing again. 10017 Hints.setAlreadyVectorized(); 10018 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10019 return true; 10020 } 10021 10022 // Emit a remark if there are stores to floats that required a floating point 10023 // extension. If the vectorized loop was generated with floating point there 10024 // will be a performance penalty from the conversion overhead and the change in 10025 // the vector width. 10026 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10027 SmallVector<Instruction *, 4> Worklist; 10028 for (BasicBlock *BB : L->getBlocks()) { 10029 for (Instruction &Inst : *BB) { 10030 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10031 if (S->getValueOperand()->getType()->isFloatTy()) 10032 Worklist.push_back(S); 10033 } 10034 } 10035 } 10036 10037 // Traverse the floating point stores upwards searching, for floating point 10038 // conversions. 10039 SmallPtrSet<const Instruction *, 4> Visited; 10040 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10041 while (!Worklist.empty()) { 10042 auto *I = Worklist.pop_back_val(); 10043 if (!L->contains(I)) 10044 continue; 10045 if (!Visited.insert(I).second) 10046 continue; 10047 10048 // Emit a remark if the floating point store required a floating 10049 // point conversion. 10050 // TODO: More work could be done to identify the root cause such as a 10051 // constant or a function return type and point the user to it. 10052 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10053 ORE->emit([&]() { 10054 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10055 I->getDebugLoc(), L->getHeader()) 10056 << "floating point conversion changes vector width. " 10057 << "Mixed floating point precision requires an up/down " 10058 << "cast that will negatively impact performance."; 10059 }); 10060 10061 for (Use &Op : I->operands()) 10062 if (auto *OpI = dyn_cast<Instruction>(Op)) 10063 Worklist.push_back(OpI); 10064 } 10065 } 10066 10067 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 10068 VectorizationFactor &VF, Loop *L, 10069 const TargetTransformInfo &TTI, 10070 PredicatedScalarEvolution &PSE, 10071 ScalarEpilogueLowering SEL) { 10072 InstructionCost CheckCost = Checks.getCost(); 10073 if (!CheckCost.isValid()) 10074 return false; 10075 10076 // When interleaving only scalar and vector cost will be equal, which in turn 10077 // would lead to a divide by 0. Fall back to hard threshold. 10078 if (VF.Width.isScalar()) { 10079 if (CheckCost > VectorizeMemoryCheckThreshold) { 10080 LLVM_DEBUG( 10081 dbgs() 10082 << "LV: Interleaving only is not profitable due to runtime checks\n"); 10083 return false; 10084 } 10085 return true; 10086 } 10087 10088 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 10089 uint64_t ScalarC = *VF.ScalarCost.getValue(); 10090 if (ScalarC == 0) 10091 return true; 10092 10093 // First, compute the minimum iteration count required so that the vector 10094 // loop outperforms the scalar loop. 10095 // The total cost of the scalar loop is 10096 // ScalarC * TC 10097 // where 10098 // * TC is the actual trip count of the loop. 10099 // * ScalarC is the cost of a single scalar iteration. 10100 // 10101 // The total cost of the vector loop is 10102 // RtC + VecC * (TC / VF) + EpiC 10103 // where 10104 // * RtC is the cost of the generated runtime checks 10105 // * VecC is the cost of a single vector iteration. 10106 // * TC is the actual trip count of the loop 10107 // * VF is the vectorization factor 10108 // * EpiCost is the cost of the generated epilogue, including the cost 10109 // of the remaining scalar operations. 10110 // 10111 // Vectorization is profitable once the total vector cost is less than the 10112 // total scalar cost: 10113 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 10114 // 10115 // Now we can compute the minimum required trip count TC as 10116 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC 10117 // 10118 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 10119 // the computations are performed on doubles, not integers and the result 10120 // is rounded up, hence we get an upper estimate of the TC. 10121 unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width); 10122 uint64_t RtC = *CheckCost.getValue(); 10123 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); 10124 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); 10125 10126 // Second, compute a minimum iteration count so that the cost of the 10127 // runtime checks is only a fraction of the total scalar loop cost. This 10128 // adds a loop-dependent bound on the overhead incurred if the runtime 10129 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 10130 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 10131 // cost, compute 10132 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 10133 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC); 10134 10135 // Now pick the larger minimum. If it is not a multiple of VF and a scalar 10136 // epilogue is allowed, choose the next closest multiple of VF. This should 10137 // partly compensate for ignoring the epilogue cost. 10138 uint64_t MinTC = std::max(MinTC1, MinTC2); 10139 if (SEL == CM_ScalarEpilogueAllowed) 10140 MinTC = alignTo(MinTC, IntVF); 10141 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); 10142 10143 LLVM_DEBUG( 10144 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 10145 << VF.MinProfitableTripCount << "\n"); 10146 10147 // Skip vectorization if the expected trip count is less than the minimum 10148 // required trip count. 10149 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) { 10150 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10151 VF.MinProfitableTripCount)) { 10152 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10153 "trip count < minimum profitable VF (" 10154 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10155 << ")\n"); 10156 10157 return false; 10158 } 10159 } 10160 return true; 10161 } 10162 10163 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10164 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10165 !EnableLoopInterleaving), 10166 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10167 !EnableLoopVectorization) {} 10168 10169 /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue 10170 /// vectorization. Remove ResumePhis from \p MainPlan for inductions that 10171 /// don't have a corresponding wide induction in \p EpiPlan. 10172 static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { 10173 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those 10174 // will need their resume-values computed in the main vector loop. Others 10175 // can be removed from the main VPlan. 10176 SmallPtrSet<PHINode *, 2> EpiWidenedPhis; 10177 for (VPRecipeBase &R : 10178 EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 10179 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10180 continue; 10181 EpiWidenedPhis.insert( 10182 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue())); 10183 } 10184 for (VPRecipeBase &R : make_early_inc_range( 10185 *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) { 10186 auto *VPIRInst = cast<VPIRInstruction>(&R); 10187 auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction()); 10188 if (!IRI) 10189 break; 10190 if (EpiWidenedPhis.contains(IRI)) 10191 continue; 10192 // There is no corresponding wide induction in the epilogue plan that would 10193 // need a resume value. Remove the VPIRInst wrapping the scalar header phi 10194 // together with the corresponding ResumePhi. The resume values for the 10195 // scalar loop will be created during execution of EpiPlan. 10196 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe(); 10197 VPIRInst->eraseFromParent(); 10198 ResumePhi->eraseFromParent(); 10199 } 10200 VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, MainPlan); 10201 10202 using namespace VPlanPatternMatch; 10203 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader(); 10204 VPValue *VectorTC = &MainPlan.getVectorTripCount(); 10205 // If there is a suitable resume value for the canonical induction in the 10206 // scalar (which will become vector) epilogue loop we are done. Otherwise 10207 // create it below. 10208 if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) { 10209 return match(&R, m_VPInstruction<VPInstruction::ResumePhi>( 10210 m_Specific(VectorTC), m_SpecificInt(0))); 10211 })) 10212 return; 10213 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin()); 10214 ScalarPHBuilder.createNaryOp( 10215 VPInstruction::ResumePhi, 10216 {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {}, 10217 "vec.epilog.resume.val"); 10218 } 10219 10220 /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded 10221 /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. 10222 static void 10223 preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, 10224 const SCEV2ValueTy &ExpandedSCEVs, 10225 const EpilogueLoopVectorizationInfo &EPI) { 10226 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); 10227 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10228 Header->setName("vec.epilog.vector.body"); 10229 10230 // Re-use the trip count and steps expanded for the main loop, as 10231 // skeleton creation needs it as a value that dominates both the scalar 10232 // and vector epilogue loops 10233 // TODO: This is a workaround needed for epilogue vectorization and it 10234 // should be removed once induction resume value creation is done 10235 // directly in VPlan. 10236 for (auto &R : make_early_inc_range(*Plan.getEntry())) { 10237 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R); 10238 if (!ExpandR) 10239 continue; 10240 auto *ExpandedVal = 10241 Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second); 10242 ExpandR->replaceAllUsesWith(ExpandedVal); 10243 if (Plan.getTripCount() == ExpandR) 10244 Plan.resetTripCount(ExpandedVal); 10245 ExpandR->eraseFromParent(); 10246 } 10247 10248 // Ensure that the start values for all header phi recipes are updated before 10249 // vectorizing the epilogue loop. 10250 for (VPRecipeBase &R : Header->phis()) { 10251 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) { 10252 // When vectorizing the epilogue loop, the canonical induction start 10253 // value needs to be changed from zero to the value after the main 10254 // vector loop. Find the resume value created during execution of the main 10255 // VPlan. 10256 // FIXME: Improve modeling for canonical IV start values in the epilogue 10257 // loop. 10258 BasicBlock *MainMiddle = find_singleton<BasicBlock>( 10259 predecessors(L->getLoopPreheader()), 10260 [&EPI](BasicBlock *BB, bool) -> BasicBlock * { 10261 if (BB != EPI.MainLoopIterationCountCheck && 10262 BB != EPI.EpilogueIterationCountCheck && 10263 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck) 10264 return BB; 10265 return nullptr; 10266 }); 10267 using namespace llvm::PatternMatch; 10268 Type *IdxTy = IV->getScalarType(); 10269 PHINode *EPResumeVal = find_singleton<PHINode>( 10270 L->getLoopPreheader()->phis(), 10271 [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * { 10272 if (P.getType() == IdxTy && 10273 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount && 10274 match( 10275 P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck), 10276 m_SpecificInt(0))) 10277 return &P; 10278 return nullptr; 10279 }); 10280 assert(EPResumeVal && "must have a resume value for the canonical IV"); 10281 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); 10282 assert(all_of(IV->users(), 10283 [](const VPUser *U) { 10284 return isa<VPScalarIVStepsRecipe>(U) || 10285 isa<VPScalarCastRecipe>(U) || 10286 isa<VPDerivedIVRecipe>(U) || 10287 cast<VPInstruction>(U)->getOpcode() == 10288 Instruction::Add; 10289 }) && 10290 "the canonical IV should only be used by its increment or " 10291 "ScalarIVSteps when resetting the start value"); 10292 IV->setOperand(0, VPV); 10293 continue; 10294 } 10295 10296 Value *ResumeV = nullptr; 10297 // TODO: Move setting of resume values to prepareToExecute. 10298 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10299 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr()) 10300 ->getIncomingValueForBlock(L->getLoopPreheader()); 10301 const RecurrenceDescriptor &RdxDesc = 10302 ReductionPhi->getRecurrenceDescriptor(); 10303 RecurKind RK = RdxDesc.getRecurrenceKind(); 10304 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 10305 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as 10306 // start value; compare the final value from the main vector loop 10307 // to the start value. 10308 BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent(); 10309 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt()); 10310 ResumeV = 10311 Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); 10312 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { 10313 // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment 10314 // to the resume value. The resume value is adjusted to the sentinel 10315 // value when the final value from the main vector loop equals the start 10316 // value. This ensures correctness when the start value might not be 10317 // less than the minimum value of a monotonically increasing induction 10318 // variable. 10319 BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent(); 10320 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt()); 10321 Value *Cmp = 10322 Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue()); 10323 ResumeV = 10324 Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV); 10325 } 10326 } else { 10327 // Retrieve the induction resume values for wide inductions from 10328 // their original phi nodes in the scalar loop. 10329 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode(); 10330 // Hook up to the PHINode generated by a ResumePhi recipe of main 10331 // loop VPlan, which feeds the scalar loop. 10332 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader()); 10333 } 10334 assert(ResumeV && "Must have a resume value"); 10335 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV); 10336 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10337 } 10338 } 10339 10340 bool LoopVectorizePass::processLoop(Loop *L) { 10341 assert((EnableVPlanNativePath || L->isInnermost()) && 10342 "VPlan-native path is not enabled. Only process inner loops."); 10343 10344 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10345 << L->getHeader()->getParent()->getName() << "' from " 10346 << L->getLocStr() << "\n"); 10347 10348 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10349 10350 LLVM_DEBUG( 10351 dbgs() << "LV: Loop hints:" 10352 << " force=" 10353 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10354 ? "disabled" 10355 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10356 ? "enabled" 10357 : "?")) 10358 << " width=" << Hints.getWidth() 10359 << " interleave=" << Hints.getInterleave() << "\n"); 10360 10361 // Function containing loop 10362 Function *F = L->getHeader()->getParent(); 10363 10364 // Looking at the diagnostic output is the only way to determine if a loop 10365 // was vectorized (other than looking at the IR or machine code), so it 10366 // is important to generate an optimization remark for each loop. Most of 10367 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10368 // generated as OptimizationRemark and OptimizationRemarkMissed are 10369 // less verbose reporting vectorized loops and unvectorized loops that may 10370 // benefit from vectorization, respectively. 10371 10372 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10373 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10374 return false; 10375 } 10376 10377 PredicatedScalarEvolution PSE(*SE, *L); 10378 10379 // Check if it is legal to vectorize the loop. 10380 LoopVectorizationRequirements Requirements; 10381 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 10382 &Requirements, &Hints, DB, AC, BFI, PSI); 10383 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10384 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10385 Hints.emitRemarkWithHints(); 10386 return false; 10387 } 10388 10389 if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { 10390 reportVectorizationFailure("Auto-vectorization of loops with uncountable " 10391 "early exit is not enabled", 10392 "UncountableEarlyExitLoopsDisabled", ORE, L); 10393 return false; 10394 } 10395 10396 if (LVL.hasStructVectorCall()) { 10397 reportVectorizationFailure("Auto-vectorization of calls that return struct " 10398 "types is not yet supported", 10399 "StructCallVectorizationUnsupported", ORE, L); 10400 return false; 10401 } 10402 10403 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10404 // here. They may require CFG and instruction level transformations before 10405 // even evaluating whether vectorization is profitable. Since we cannot modify 10406 // the incoming IR, we need to build VPlan upfront in the vectorization 10407 // pipeline. 10408 if (!L->isInnermost()) 10409 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10410 ORE, BFI, PSI, Hints, Requirements); 10411 10412 assert(L->isInnermost() && "Inner loop expected."); 10413 10414 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10415 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10416 10417 // If an override option has been passed in for interleaved accesses, use it. 10418 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10419 UseInterleaved = EnableInterleavedMemAccesses; 10420 10421 // Analyze interleaved memory accesses. 10422 if (UseInterleaved) 10423 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10424 10425 if (LVL.hasUncountableEarlyExit()) { 10426 BasicBlock *LoopLatch = L->getLoopLatch(); 10427 if (IAI.requiresScalarEpilogue() || 10428 any_of(LVL.getCountableExitingBlocks(), 10429 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) { 10430 reportVectorizationFailure("Auto-vectorization of early exit loops " 10431 "requiring a scalar epilogue is unsupported", 10432 "UncountableEarlyExitUnsupported", ORE, L); 10433 return false; 10434 } 10435 } 10436 10437 // Check the function attributes and profiles to find out if this function 10438 // should be optimized for size. 10439 ScalarEpilogueLowering SEL = 10440 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); 10441 10442 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10443 // count by optimizing for size, to minimize overheads. 10444 auto ExpectedTC = getSmallBestKnownTC(PSE, L); 10445 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10446 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10447 << "This loop is worth vectorizing only if no scalar " 10448 << "iteration overheads are incurred."); 10449 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10450 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10451 else { 10452 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { 10453 LLVM_DEBUG(dbgs() << "\n"); 10454 // Predicate tail-folded loops are efficient even when the loop 10455 // iteration count is low. However, setting the epilogue policy to 10456 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops 10457 // with runtime checks. It's more effective to let 10458 // `areRuntimeChecksProfitable` determine if vectorization is beneficial 10459 // for the loop. 10460 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) 10461 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10462 } else { 10463 LLVM_DEBUG(dbgs() << " But the target considers the trip count too " 10464 "small to consider vectorizing.\n"); 10465 reportVectorizationFailure( 10466 "The trip count is below the minial threshold value.", 10467 "loop trip count is too low, avoiding vectorization", 10468 "LowTripCount", ORE, L); 10469 Hints.emitRemarkWithHints(); 10470 return false; 10471 } 10472 } 10473 } 10474 10475 // Check the function attributes to see if implicit floats or vectors are 10476 // allowed. 10477 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10478 reportVectorizationFailure( 10479 "Can't vectorize when the NoImplicitFloat attribute is used", 10480 "loop not vectorized due to NoImplicitFloat attribute", 10481 "NoImplicitFloat", ORE, L); 10482 Hints.emitRemarkWithHints(); 10483 return false; 10484 } 10485 10486 // Check if the target supports potentially unsafe FP vectorization. 10487 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10488 // for the target we're vectorizing for, to make sure none of the 10489 // additional fp-math flags can help. 10490 if (Hints.isPotentiallyUnsafe() && 10491 TTI->isFPVectorizationPotentiallyUnsafe()) { 10492 reportVectorizationFailure( 10493 "Potentially unsafe FP op prevents vectorization", 10494 "loop not vectorized due to unsafe FP support.", 10495 "UnsafeFP", ORE, L); 10496 Hints.emitRemarkWithHints(); 10497 return false; 10498 } 10499 10500 bool AllowOrderedReductions; 10501 // If the flag is set, use that instead and override the TTI behaviour. 10502 if (ForceOrderedReductions.getNumOccurrences() > 0) 10503 AllowOrderedReductions = ForceOrderedReductions; 10504 else 10505 AllowOrderedReductions = TTI->enableOrderedReductions(); 10506 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10507 ORE->emit([&]() { 10508 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10509 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10510 ExactFPMathInst->getDebugLoc(), 10511 ExactFPMathInst->getParent()) 10512 << "loop not vectorized: cannot prove it is safe to reorder " 10513 "floating-point operations"; 10514 }); 10515 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10516 "reorder floating-point operations\n"); 10517 Hints.emitRemarkWithHints(); 10518 return false; 10519 } 10520 10521 // Use the cost model. 10522 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10523 F, &Hints, IAI); 10524 // Use the planner for vectorization. 10525 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, 10526 ORE); 10527 10528 // Get user vectorization factor and interleave count. 10529 ElementCount UserVF = Hints.getWidth(); 10530 unsigned UserIC = Hints.getInterleave(); 10531 10532 // Plan how to best vectorize. 10533 LVP.plan(UserVF, UserIC); 10534 VectorizationFactor VF = LVP.computeBestVF(); 10535 unsigned IC = 1; 10536 10537 if (ORE->allowExtraAnalysis(LV_NAME)) 10538 LVP.emitInvalidCostRemarks(ORE); 10539 10540 bool AddBranchWeights = 10541 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 10542 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), 10543 AddBranchWeights, CM.CostKind); 10544 if (LVP.hasPlanWithVF(VF.Width)) { 10545 // Select the interleave count. 10546 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 10547 10548 unsigned SelectedIC = std::max(IC, UserIC); 10549 // Optimistically generate runtime checks if they are needed. Drop them if 10550 // they turn out to not be profitable. 10551 if (VF.Width.isVector() || SelectedIC > 1) 10552 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10553 10554 // Check if it is profitable to vectorize with runtime checks. 10555 bool ForceVectorization = 10556 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10557 if (!ForceVectorization && 10558 !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) { 10559 ORE->emit([&]() { 10560 return OptimizationRemarkAnalysisAliasing( 10561 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10562 L->getHeader()) 10563 << "loop not vectorized: cannot prove it is safe to reorder " 10564 "memory operations"; 10565 }); 10566 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10567 Hints.emitRemarkWithHints(); 10568 return false; 10569 } 10570 } 10571 10572 // Identify the diagnostic messages that should be produced. 10573 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10574 bool VectorizeLoop = true, InterleaveLoop = true; 10575 if (VF.Width.isScalar()) { 10576 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10577 VecDiagMsg = std::make_pair( 10578 "VectorizationNotBeneficial", 10579 "the cost-model indicates that vectorization is not beneficial"); 10580 VectorizeLoop = false; 10581 } 10582 10583 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) { 10584 // Tell the user interleaving was avoided up-front, despite being explicitly 10585 // requested. 10586 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10587 "interleaving should be avoided up front\n"); 10588 IntDiagMsg = std::make_pair( 10589 "InterleavingAvoided", 10590 "Ignoring UserIC, because interleaving was avoided up front"); 10591 InterleaveLoop = false; 10592 } else if (IC == 1 && UserIC <= 1) { 10593 // Tell the user interleaving is not beneficial. 10594 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10595 IntDiagMsg = std::make_pair( 10596 "InterleavingNotBeneficial", 10597 "the cost-model indicates that interleaving is not beneficial"); 10598 InterleaveLoop = false; 10599 if (UserIC == 1) { 10600 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10601 IntDiagMsg.second += 10602 " and is explicitly disabled or interleave count is set to 1"; 10603 } 10604 } else if (IC > 1 && UserIC == 1) { 10605 // Tell the user interleaving is beneficial, but it explicitly disabled. 10606 LLVM_DEBUG( 10607 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10608 IntDiagMsg = std::make_pair( 10609 "InterleavingBeneficialButDisabled", 10610 "the cost-model indicates that interleaving is beneficial " 10611 "but is explicitly disabled or interleave count is set to 1"); 10612 InterleaveLoop = false; 10613 } 10614 10615 // If there is a histogram in the loop, do not just interleave without 10616 // vectorizing. The order of operations will be incorrect without the 10617 // histogram intrinsics, which are only used for recipes with VF > 1. 10618 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) { 10619 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due " 10620 << "to histogram operations.\n"); 10621 IntDiagMsg = std::make_pair( 10622 "HistogramPreventsScalarInterleaving", 10623 "Unable to interleave without vectorization due to constraints on " 10624 "the order of histogram operations"); 10625 InterleaveLoop = false; 10626 } 10627 10628 // Override IC if user provided an interleave count. 10629 IC = UserIC > 0 ? UserIC : IC; 10630 10631 // Emit diagnostic messages, if any. 10632 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10633 if (!VectorizeLoop && !InterleaveLoop) { 10634 // Do not vectorize or interleaving the loop. 10635 ORE->emit([&]() { 10636 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10637 L->getStartLoc(), L->getHeader()) 10638 << VecDiagMsg.second; 10639 }); 10640 ORE->emit([&]() { 10641 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10642 L->getStartLoc(), L->getHeader()) 10643 << IntDiagMsg.second; 10644 }); 10645 return false; 10646 } 10647 10648 if (!VectorizeLoop && InterleaveLoop) { 10649 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10650 ORE->emit([&]() { 10651 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10652 L->getStartLoc(), L->getHeader()) 10653 << VecDiagMsg.second; 10654 }); 10655 } else if (VectorizeLoop && !InterleaveLoop) { 10656 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10657 << ") in " << L->getLocStr() << '\n'); 10658 ORE->emit([&]() { 10659 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10660 L->getStartLoc(), L->getHeader()) 10661 << IntDiagMsg.second; 10662 }); 10663 } else if (VectorizeLoop && InterleaveLoop) { 10664 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10665 << ") in " << L->getLocStr() << '\n'); 10666 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10667 } 10668 10669 bool DisableRuntimeUnroll = false; 10670 MDNode *OrigLoopID = L->getLoopID(); 10671 { 10672 using namespace ore; 10673 if (!VectorizeLoop) { 10674 assert(IC > 1 && "interleave count should not be 1 or 0"); 10675 // If we decided that it is not legal to vectorize the loop, then 10676 // interleave it. 10677 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 10678 InnerLoopVectorizer Unroller( 10679 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), 10680 ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan); 10681 10682 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10683 10684 ORE->emit([&]() { 10685 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10686 L->getHeader()) 10687 << "interleaved loop (interleaved count: " 10688 << NV("InterleaveCount", IC) << ")"; 10689 }); 10690 } else { 10691 // If we decided that it is *legal* to vectorize the loop, then do it. 10692 10693 VPlan &BestPlan = LVP.getPlanFor(VF.Width); 10694 // Consider vectorizing the epilogue too if it's profitable. 10695 VectorizationFactor EpilogueVF = 10696 LVP.selectEpilogueVectorizationFactor(VF.Width, IC); 10697 if (EpilogueVF.Width.isVector()) { 10698 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate()); 10699 10700 // The first pass vectorizes the main loop and creates a scalar epilogue 10701 // to be vectorized by executing the plan (potentially with a different 10702 // factor) again shortly afterwards. 10703 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); 10704 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); 10705 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, 10706 BestEpiPlan); 10707 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10708 EPI, &LVL, &CM, BFI, PSI, Checks, 10709 *BestMainPlan); 10710 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, 10711 *BestMainPlan, MainILV, DT, false); 10712 ++LoopsVectorized; 10713 10714 // Second pass vectorizes the epilogue and adjusts the control flow 10715 // edges from the first pass. 10716 EPI.MainLoopVF = EPI.EpilogueVF; 10717 EPI.MainLoopUF = EPI.EpilogueUF; 10718 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10719 ORE, EPI, &LVL, &CM, BFI, PSI, 10720 Checks, BestEpiPlan); 10721 EpilogILV.setTripCount(MainILV.getTripCount()); 10722 preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); 10723 10724 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10725 DT, true, &ExpandedSCEVs); 10726 ++LoopsEpilogueVectorized; 10727 10728 if (!MainILV.areSafetyChecksAdded()) 10729 DisableRuntimeUnroll = true; 10730 } else { 10731 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10732 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10733 PSI, Checks, BestPlan); 10734 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10735 ++LoopsVectorized; 10736 10737 // Add metadata to disable runtime unrolling a scalar loop when there 10738 // are no runtime checks about strides and memory. A scalar loop that is 10739 // rarely used is not worth unrolling. 10740 if (!LB.areSafetyChecksAdded()) 10741 DisableRuntimeUnroll = true; 10742 } 10743 // Report the vectorization decision. 10744 reportVectorization(ORE, L, VF, IC); 10745 } 10746 10747 if (ORE->allowExtraAnalysis(LV_NAME)) 10748 checkMixedPrecision(L, ORE); 10749 } 10750 10751 assert(DT->verify(DominatorTree::VerificationLevel::Fast) && 10752 "DT not preserved correctly"); 10753 10754 std::optional<MDNode *> RemainderLoopID = 10755 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10756 LLVMLoopVectorizeFollowupEpilogue}); 10757 if (RemainderLoopID) { 10758 L->setLoopID(*RemainderLoopID); 10759 } else { 10760 if (DisableRuntimeUnroll) 10761 addRuntimeUnrollDisableMetaData(L); 10762 10763 // Mark the loop as already vectorized to avoid vectorizing again. 10764 Hints.setAlreadyVectorized(); 10765 } 10766 10767 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10768 return true; 10769 } 10770 10771 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) { 10772 10773 // Don't attempt if 10774 // 1. the target claims to have no vector registers, and 10775 // 2. interleaving won't help ILP. 10776 // 10777 // The second condition is necessary because, even if the target has no 10778 // vector registers, loop vectorization may still enable scalar 10779 // interleaving. 10780 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10781 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2) 10782 return LoopVectorizeResult(false, false); 10783 10784 bool Changed = false, CFGChanged = false; 10785 10786 // The vectorizer requires loops to be in simplified form. 10787 // Since simplification may add new inner loops, it has to run before the 10788 // legality and profitability checks. This means running the loop vectorizer 10789 // will simplify all loops, regardless of whether anything end up being 10790 // vectorized. 10791 for (const auto &L : *LI) 10792 Changed |= CFGChanged |= 10793 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10794 10795 // Build up a worklist of inner-loops to vectorize. This is necessary as 10796 // the act of vectorizing or partially unrolling a loop creates new loops 10797 // and can invalidate iterators across the loops. 10798 SmallVector<Loop *, 8> Worklist; 10799 10800 for (Loop *L : *LI) 10801 collectSupportedLoops(*L, LI, ORE, Worklist); 10802 10803 LoopsAnalyzed += Worklist.size(); 10804 10805 // Now walk the identified inner loops. 10806 while (!Worklist.empty()) { 10807 Loop *L = Worklist.pop_back_val(); 10808 10809 // For the inner loops we actually process, form LCSSA to simplify the 10810 // transform. 10811 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10812 10813 Changed |= CFGChanged |= processLoop(L); 10814 10815 if (Changed) { 10816 LAIs->clear(); 10817 10818 #ifndef NDEBUG 10819 if (VerifySCEV) 10820 SE->verify(); 10821 #endif 10822 } 10823 } 10824 10825 // Process each loop nest in the function. 10826 return LoopVectorizeResult(Changed, CFGChanged); 10827 } 10828 10829 PreservedAnalyses LoopVectorizePass::run(Function &F, 10830 FunctionAnalysisManager &AM) { 10831 LI = &AM.getResult<LoopAnalysis>(F); 10832 // There are no loops in the function. Return before computing other 10833 // expensive analyses. 10834 if (LI->empty()) 10835 return PreservedAnalyses::all(); 10836 SE = &AM.getResult<ScalarEvolutionAnalysis>(F); 10837 TTI = &AM.getResult<TargetIRAnalysis>(F); 10838 DT = &AM.getResult<DominatorTreeAnalysis>(F); 10839 TLI = &AM.getResult<TargetLibraryAnalysis>(F); 10840 AC = &AM.getResult<AssumptionAnalysis>(F); 10841 DB = &AM.getResult<DemandedBitsAnalysis>(F); 10842 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10843 LAIs = &AM.getResult<LoopAccessAnalysis>(F); 10844 10845 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10846 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10847 BFI = nullptr; 10848 if (PSI && PSI->hasProfileSummary()) 10849 BFI = &AM.getResult<BlockFrequencyAnalysis>(F); 10850 LoopVectorizeResult Result = runImpl(F); 10851 if (!Result.MadeAnyChange) 10852 return PreservedAnalyses::all(); 10853 PreservedAnalyses PA; 10854 10855 if (isAssignmentTrackingEnabled(*F.getParent())) { 10856 for (auto &BB : F) 10857 RemoveRedundantDbgInstrs(&BB); 10858 } 10859 10860 PA.preserve<LoopAnalysis>(); 10861 PA.preserve<DominatorTreeAnalysis>(); 10862 PA.preserve<ScalarEvolutionAnalysis>(); 10863 PA.preserve<LoopAccessAnalysis>(); 10864 10865 if (Result.MadeCFGChange) { 10866 // Making CFG changes likely means a loop got vectorized. Indicate that 10867 // extra simplification passes should be run. 10868 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only 10869 // be run if runtime checks have been added. 10870 AM.getResult<ShouldRunExtraVectorPasses>(F); 10871 PA.preserve<ShouldRunExtraVectorPasses>(); 10872 } else { 10873 PA.preserveSet<CFGAnalyses>(); 10874 } 10875 return PA; 10876 } 10877 10878 void LoopVectorizePass::printPipeline( 10879 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10880 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10881 OS, MapClassName2PassName); 10882 10883 OS << '<'; 10884 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10885 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10886 OS << '>'; 10887 } 10888