//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one (see the sketch further down in
// this header).
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
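//
// For illustration only (an informal sketch, not part of the original design
// notes): with a vectorization factor (VF) of 4, the transformation
// conceptually rewrites
//
//   for (i = 0; i < n; i++)
//     C[i] = A[i] + B[i];
//
// into a wide loop that processes four elements per iteration, followed by a
// scalar remainder (epilogue) loop for the leftover iterations:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     C[i:i+3] = A[i:i+3] + B[i:i+3];   // one SIMD iteration
//   for (; i < n; i++)                  // scalar epilogue
//     C[i] = A[i] + B[i];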
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
"llvm/Transforms/Utils/ScalarEvolutionExpander.h" 144 #include "llvm/Transforms/Utils/SizeOpts.h" 145 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 146 #include <algorithm> 147 #include <cassert> 148 #include <cmath> 149 #include <cstdint> 150 #include <functional> 151 #include <iterator> 152 #include <limits> 153 #include <map> 154 #include <memory> 155 #include <string> 156 #include <tuple> 157 #include <utility> 158 159 using namespace llvm; 160 161 #define LV_NAME "loop-vectorize" 162 #define DEBUG_TYPE LV_NAME 163 164 #ifndef NDEBUG 165 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 166 #endif 167 168 /// @{ 169 /// Metadata attribute names 170 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 171 const char LLVMLoopVectorizeFollowupVectorized[] = 172 "llvm.loop.vectorize.followup_vectorized"; 173 const char LLVMLoopVectorizeFollowupEpilogue[] = 174 "llvm.loop.vectorize.followup_epilogue"; 175 /// @} 176 177 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 178 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 179 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 180 181 static cl::opt<bool> EnableEpilogueVectorization( 182 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 183 cl::desc("Enable vectorization of epilogue loops.")); 184 185 static cl::opt<unsigned> EpilogueVectorizationForceVF( 186 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 187 cl::desc("When epilogue vectorization is enabled, and a value greater than " 188 "1 is specified, forces the given VF for all applicable epilogue " 189 "loops.")); 190 191 static cl::opt<unsigned> EpilogueVectorizationMinVF( 192 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 193 cl::desc("Only loops with vectorization factor equal to or larger than " 194 "the specified value are considered for epilogue vectorization.")); 195 196 /// Loops with a known constant trip count below this number are vectorized only 197 /// if no scalar iteration overheads are incurred. 198 static cl::opt<unsigned> TinyTripCountVectorThreshold( 199 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 200 cl::desc("Loops with a constant trip count that is smaller than this " 201 "value are vectorized only if no scalar iteration overheads " 202 "are incurred.")); 203 204 static cl::opt<unsigned> VectorizeMemoryCheckThreshold( 205 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 206 cl::desc("The maximum allowed number of runtime memory checks")); 207 208 static cl::opt<bool> UseLegacyCostModel( 209 "vectorize-use-legacy-cost-model", cl::init(true), cl::Hidden, 210 cl::desc("Use the legacy cost model instead of the VPlan-based cost model. " 211 "This option will be removed in the future.")); 212 213 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 214 // that predication is preferred, and this lists all options. I.e., the 215 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 216 // and predicate the instructions accordingly. 
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
                   "data-and-control-without-rt-check",
                   "Similar to data-and-control, but remove the runtime check"),
        clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
                   "Use predicated EVL instructions for tail folding. If EVL "
                   "is unsupported, fall back to data-without-lane-mask.")));
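
// For illustration only (an informal sketch, not tied to a particular target):
// with tail folding and VF = 4, the vector body runs ceil(n / 4) iterations
// and every memory access is predicated with a lane mask, conceptually
//
//   mask = (i + <0,1,2,3>) < n                // e.g. via active.lane.mask
//   vec  = masked.load(&A[i], mask) + masked.load(&B[i], mask)
//   masked.store(vec, &C[i], mask)
//
// so the leftover iterations are handled inside the vector loop and no scalar
// epilogue is needed.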

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
} // namespace llvm

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
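
// For illustration only (assumes a typical x86-64 data layout): x86_fp80 has a
// type size of 80 bits but an alloc size of 128 bits, so an array of it is
// padded and hasIrregularType() returns true; i32, with matching 32-bit size
// and alloc size, is regular.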

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
                                                   Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return std::nullopt;
}

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops. \p ExpandedSCEVs is used to
  /// look up SCEV expansions for expressions needed during skeleton creation.
  virtual std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance,
                            VPTransformState &State);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Create a new phi node for the induction variable \p OrigPhi to resume
  /// iteration count in the scalar epilogue, from where the vectorized loop
  /// left off. \p Step is the SCEV-expanded induction step to use. In cases
  /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
  /// and the resume values can come from an additional bypass block, the \p
  /// AdditionalBypass pair provides information about the bypass block and the
  /// end value on the edge from bypass to this loop.
  PHINode *createInductionResumeValue(
      PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
      ArrayRef<BasicBlock *> BypassBlocks,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan, VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle block between the vector and the scalar loop.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
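
// For illustration only (a rough sketch, not an exact CFG): with both the main
// and the epilogue loop vectorized, execution proceeds approximately as
//
//   iteration-count and runtime checks -> main vector loop
//       -> epilogue iteration-count check -> epilogue vector loop
//       -> scalar remainder loop -> exit
//
// where each check can branch past the stage it guards when too few iterations
// remain or a runtime check fails.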

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
      const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return DebugLoc();

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I->getDebugLoc();

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst->getDebugLoc();
  }

  return I->getDebugLoc();
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}
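
// For illustration only: with a fixed VF of 4 and Step = 2, createStepForVF
// materializes the constant 8; with a scalable VF of <vscale x 4> it emits the
// runtime value vscale * 8, i.e. (vscale * 4) * 2, via CreateElementCount.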

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                Loop *OrigLoop) {
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  ScalarEvolution &SE = *PSE.getSE();
  return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}

} // end namespace llvm

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

using InstructionVFPair = std::pair<Instruction *, ElementCount>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Set up cost-based decisions for the user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being undercounted.
    if (isa<PseudoProbeInst>(I))
      return false;

    if (VF.isScalar())
      return true;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    if (VF.isScalar())
      return true;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.contains(I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize,
    CM_VectorCall,
    CM_IntrinsicCall
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  struct CallWideningDecision {
    InstWidening Kind;
    Function *Variant;
    Intrinsic::ID IID;
    std::optional<unsigned> MaskPos;
    InstructionCost Cost;
  };

  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
                                                     MaskPos, Cost};
  }

  CallWideningDecision getCallWideningDecision(CallInst *CI,
                                               ElementCount VF) const {
    assert(!VF.isScalar() && "Expected vector VF");
    return CallWideningDecisions.at(std::make_pair(CI, VF));
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.contains(VF))
      return;
    setCostBasedWideningDecision(VF);
    setVectorizedCallDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
1306 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1307 return Legal->isConsecutivePtr(DataType, Ptr) && 1308 TTI.isLegalMaskedStore(DataType, Alignment); 1309 } 1310 1311 /// Returns true if the target machine supports masked load operation 1312 /// for the given \p DataType and kind of access to \p Ptr. 1313 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1314 return Legal->isConsecutivePtr(DataType, Ptr) && 1315 TTI.isLegalMaskedLoad(DataType, Alignment); 1316 } 1317 1318 /// Returns true if the target machine can represent \p V as a masked gather 1319 /// or scatter operation. 1320 bool isLegalGatherOrScatter(Value *V, ElementCount VF) { 1321 bool LI = isa<LoadInst>(V); 1322 bool SI = isa<StoreInst>(V); 1323 if (!LI && !SI) 1324 return false; 1325 auto *Ty = getLoadStoreType(V); 1326 Align Align = getLoadStoreAlignment(V); 1327 if (VF.isVector()) 1328 Ty = VectorType::get(Ty, VF); 1329 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1330 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1331 } 1332 1333 /// Returns true if the target machine supports all of the reduction 1334 /// variables found for the given VF. 1335 bool canVectorizeReductions(ElementCount VF) const { 1336 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1337 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1338 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1339 })); 1340 } 1341 1342 /// Given costs for both strategies, return true if the scalar predication 1343 /// lowering should be used for div/rem. This incorporates an override 1344 /// option so it is not simply a cost comparison. 1345 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1346 InstructionCost SafeDivisorCost) const { 1347 switch (ForceSafeDivisor) { 1348 case cl::BOU_UNSET: 1349 return ScalarCost < SafeDivisorCost; 1350 case cl::BOU_TRUE: 1351 return false; 1352 case cl::BOU_FALSE: 1353 return true; 1354 }; 1355 llvm_unreachable("impossible case value"); 1356 } 1357 1358 /// Returns true if \p I is an instruction which requires predication and 1359 /// for which our chosen predication strategy is scalarization (i.e. we 1360 /// don't have an alternate strategy such as masking available). 1361 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1362 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1363 1364 /// Returns true if \p I is an instruction that needs to be predicated 1365 /// at runtime. The result is independent of the predication mechanism. 1366 /// Superset of instructions that return true for isScalarWithPredication. 1367 bool isPredicatedInst(Instruction *I) const; 1368 1369 /// Return the costs for our two available strategies for lowering a 1370 /// div/rem operation which requires speculating at least one lane. 1371 /// First result is for scalarization (will be invalid for scalable 1372 /// vectors); second is for the safe-divisor strategy. 1373 std::pair<InstructionCost, InstructionCost> 1374 getDivRemSpeculationCost(Instruction *I, 1375 ElementCount VF) const; 1376 1377 /// Returns true if \p I is a memory instruction with consecutive memory 1378 /// access that can be widened. 1379 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1380 1381 /// Returns true if \p I is a memory instruction in an interleaved-group 1382 /// of memory accesses that can be vectorized with wide vector loads/stores 1383 /// and shuffles. 
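  /// As an illustration (not code from this pass), the two accesses below form
  /// an interleave group with factor 2 that can be widened into a single wide
  /// load per vector iteration followed by shuffles separating the even and
  /// odd elements:
  /// \code
  ///   for (int i = 0; i < n; ++i) {
  ///     sum0 += a[2 * i];     // even elements
  ///     sum1 += a[2 * i + 1]; // odd elements
  ///   }
  /// \endcode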
1384 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const; 1385 1386 /// Check if \p Instr belongs to any interleaved access group. 1387 bool isAccessInterleaved(Instruction *Instr) const { 1388 return InterleaveInfo.isInterleaved(Instr); 1389 } 1390 1391 /// Get the interleaved access group that \p Instr belongs to. 1392 const InterleaveGroup<Instruction> * 1393 getInterleavedAccessGroup(Instruction *Instr) const { 1394 return InterleaveInfo.getInterleaveGroup(Instr); 1395 } 1396 1397 /// Returns true if we're required to use a scalar epilogue for at least 1398 /// the final iteration of the original loop. 1399 bool requiresScalarEpilogue(bool IsVectorizing) const { 1400 if (!isScalarEpilogueAllowed()) { 1401 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); 1402 return false; 1403 } 1404 // If we might exit from anywhere but the latch, must run the exiting 1405 // iteration in scalar form. 1406 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 1407 LLVM_DEBUG( 1408 dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); 1409 return true; 1410 } 1411 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { 1412 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " 1413 "interleaved group requires scalar epilogue\n"); 1414 return true; 1415 } 1416 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); 1417 return false; 1418 } 1419 1420 /// Returns true if we're required to use a scalar epilogue for at least 1421 /// the final iteration of the original loop for all VFs in \p Range. 1422 /// A scalar epilogue must either be required for all VFs in \p Range or for 1423 /// none. 1424 bool requiresScalarEpilogue(VFRange Range) const { 1425 auto RequiresScalarEpilogue = [this](ElementCount VF) { 1426 return requiresScalarEpilogue(VF.isVector()); 1427 }; 1428 bool IsRequired = all_of(Range, RequiresScalarEpilogue); 1429 assert( 1430 (IsRequired || none_of(Range, RequiresScalarEpilogue)) && 1431 "all VFs in range must agree on whether a scalar epilogue is required"); 1432 return IsRequired; 1433 } 1434 1435 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1436 /// loop hint annotation. 1437 bool isScalarEpilogueAllowed() const { 1438 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1439 } 1440 1441 /// Returns the TailFoldingStyle that is best for the current loop. 1442 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { 1443 if (!ChosenTailFoldingStyle) 1444 return TailFoldingStyle::None; 1445 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first 1446 : ChosenTailFoldingStyle->second; 1447 } 1448 1449 /// Selects and saves TailFoldingStyle for 2 options - if IV update may 1450 /// overflow or not. 1451 /// \param IsScalableVF true if scalable vector factors enabled. 1452 /// \param UserIC User specific interleave count. 1453 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) { 1454 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet."); 1455 if (!Legal->canFoldTailByMasking()) { 1456 ChosenTailFoldingStyle = 1457 std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); 1458 return; 1459 } 1460 1461 if (!ForceTailFoldingStyle.getNumOccurrences()) { 1462 ChosenTailFoldingStyle = std::make_pair( 1463 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true), 1464 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)); 1465 return; 1466 } 1467 1468 // Set styles when forced. 
1469 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(), 1470 ForceTailFoldingStyle.getValue()); 1471 if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) 1472 return; 1473 // Override forced styles if needed. 1474 // FIXME: use actual opcode/data type for analysis here. 1475 // FIXME: Investigate opportunity for fixed vector factor. 1476 bool EVLIsLegal = 1477 IsScalableVF && UserIC <= 1 && 1478 TTI.hasActiveVectorLength(0, nullptr, Align()) && 1479 !EnableVPlanNativePath && 1480 // FIXME: implement support for max safe dependency distance. 1481 Legal->isSafeForAnyVectorWidth(); 1482 if (!EVLIsLegal) { 1483 // If for some reason EVL mode is unsupported, fall back to 1484 // DataWithoutLaneMask to try to vectorize the loop with folded tail 1485 // in a generic way. 1486 ChosenTailFoldingStyle = 1487 std::make_pair(TailFoldingStyle::DataWithoutLaneMask, 1488 TailFoldingStyle::DataWithoutLaneMask); 1489 LLVM_DEBUG( 1490 dbgs() 1491 << "LV: Preference for VP intrinsics indicated. Will " 1492 "not try to generate VP Intrinsics " 1493 << (UserIC > 1 1494 ? "since interleave count specified is greater than 1.\n" 1495 : "due to non-interleaving reasons.\n")); 1496 } 1497 } 1498 1499 /// Returns true if all loop blocks should be masked to fold the tail loop. 1500 bool foldTailByMasking() const { 1501 // TODO: check if it is possible to check for None style independent of 1502 // IVUpdateMayOverflow flag in getTailFoldingStyle. 1503 return getTailFoldingStyle() != TailFoldingStyle::None; 1504 } 1505 1506 /// Returns true if the instructions in this block require predication 1507 /// for any reason, e.g. because tail folding now requires a predicate 1508 /// or because the block in the original loop was predicated. 1509 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1510 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1511 } 1512 1513 /// Returns true if VP intrinsics with explicit vector length support should 1514 /// be generated in the tail folded loop. 1515 bool foldTailWithEVL() const { 1516 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL; 1517 } 1518 1519 /// Returns true if the Phi is part of an inloop reduction. 1520 bool isInLoopReduction(PHINode *Phi) const { 1521 return InLoopReductions.contains(Phi); 1522 } 1523 1524 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1525 /// with factor VF. Return the cost of the instruction, including 1526 /// scalarization overhead if it's needed. 1527 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1528 1529 /// Estimate cost of a call instruction CI if it were vectorized with factor 1530 /// VF. Return the cost of the instruction, including scalarization overhead 1531 /// if it's needed. 1532 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; 1533 1534 /// Invalidates decisions already taken by the cost model. 1535 void invalidateCostModelingDecisions() { 1536 WideningDecisions.clear(); 1537 CallWideningDecisions.clear(); 1538 Uniforms.clear(); 1539 Scalars.clear(); 1540 } 1541 1542 /// Returns the expected execution cost. The unit of the cost does 1543 /// not matter because we use the 'cost' units to compare different 1544 /// vector widths. The cost that is returned is *not* normalized by 1545 /// the factor width. If \p Invalid is not nullptr, this function 1546 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1547 /// each instruction that has an Invalid cost for the given VF.
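  /// A minimal calling sketch (illustrative; reportInvalidCosts below is a
  /// hypothetical helper, and the real caller also compares the result across
  /// candidate VFs):
  /// \code
  ///   SmallVector<InstructionVFPair> InvalidCosts;
  ///   InstructionCost C = expectedCost(VF, &InvalidCosts);
  ///   if (!C.isValid())
  ///     reportInvalidCosts(InvalidCosts); // hypothetical reporting helper
  /// \endcode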
1548 InstructionCost 1549 expectedCost(ElementCount VF, 1550 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1551 1552 bool hasPredStores() const { return NumPredStores > 0; } 1553 1554 /// Returns true if epilogue vectorization is considered profitable, and 1555 /// false otherwise. 1556 /// \p VF is the vectorization factor chosen for the original loop. 1557 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1558 1559 /// Returns the execution time cost of an instruction for a given vector 1560 /// width. Vector width of one means scalar. 1561 InstructionCost getInstructionCost(Instruction *I, ElementCount VF); 1562 1563 /// Return the cost of instructions in an inloop reduction pattern, if I is 1564 /// part of that pattern. 1565 std::optional<InstructionCost> 1566 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1567 TTI::TargetCostKind CostKind) const; 1568 1569 private: 1570 unsigned NumPredStores = 0; 1571 1572 /// \return An upper bound for the vectorization factors for both 1573 /// fixed and scalable vectorization, where the minimum-known number of 1574 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1575 /// disabled or unsupported, then the scalable part will be equal to 1576 /// ElementCount::getScalable(0). 1577 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, 1578 ElementCount UserVF, 1579 bool FoldTailByMasking); 1580 1581 /// \return the maximized element count based on the targets vector 1582 /// registers and the loop trip-count, but limited to a maximum safe VF. 1583 /// This is a helper function of computeFeasibleMaxVF. 1584 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, 1585 unsigned SmallestType, 1586 unsigned WidestType, 1587 ElementCount MaxSafeVF, 1588 bool FoldTailByMasking); 1589 1590 /// Checks if scalable vectorization is supported and enabled. Caches the 1591 /// result to avoid repeated debug dumps for repeated queries. 1592 bool isScalableVectorizationAllowed(); 1593 1594 /// \return the maximum legal scalable VF, based on the safe max number 1595 /// of elements. 1596 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1597 1598 /// Calculate vectorization cost of memory instruction \p I. 1599 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1600 1601 /// The cost computation for scalarized memory instruction. 1602 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1603 1604 /// The cost computation for interleaving group of memory instructions. 1605 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1606 1607 /// The cost computation for Gather/Scatter instruction. 1608 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1609 1610 /// The cost computation for widening instruction \p I with consecutive 1611 /// memory access. 1612 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1613 1614 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1615 /// Load: scalar load + broadcast. 1616 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1617 /// element) 1618 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1619 1620 /// Estimate the overhead of scalarizing an instruction. This is a 1621 /// convenience wrapper for the type-based getScalarizationOverhead API. 
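  /// As a rough, target-independent illustration: scalarizing a load at VF = 4
  /// pays for extracting the four addresses from the vector of pointers, for
  /// the four scalar loads themselves, and for inserting the four results back
  /// into a vector; the precise per-operation numbers come from TTI.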
1622 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, 1623 TTI::TargetCostKind CostKind) const; 1624 1625 /// Returns true if an artificially high cost for emulated masked memrefs 1626 /// should be used. 1627 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1628 1629 /// Map of scalar integer values to the smallest bitwidth they can be legally 1630 /// represented as. The vector equivalents of these values should be truncated 1631 /// to this type. 1632 MapVector<Instruction *, uint64_t> MinBWs; 1633 1634 /// A type representing the costs for instructions if they were to be 1635 /// scalarized rather than vectorized. The entries are Instruction-Cost 1636 /// pairs. 1637 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1638 1639 /// A set containing all BasicBlocks that are known to be present after 1640 /// vectorization as predicated blocks. 1641 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> 1642 PredicatedBBsAfterVectorization; 1643 1644 /// Records whether it is allowed to have the original scalar loop execute at 1645 /// least once. This may be needed as a fallback loop in case runtime 1646 /// aliasing/dependence checks fail, or to handle the tail/remainder 1647 /// iterations when the trip count is unknown or doesn't divide by the VF, 1648 /// or as a peel-loop to handle gaps in interleave-groups. 1649 /// Under optsize and when the trip count is very small we don't allow any 1650 /// iterations to execute in the scalar loop. 1651 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1652 1653 /// The finally chosen tail folding style. The first element is used if the 1654 /// IV update may overflow; the second element is used if it does not. 1655 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>> 1656 ChosenTailFoldingStyle; 1657 1658 /// True if scalable vectorization is supported and enabled. 1659 std::optional<bool> IsScalableVectorizationAllowed; 1660 1661 /// A map holding scalar costs for different vectorization factors. The 1662 /// presence of a cost for an instruction in the mapping indicates that the 1663 /// instruction will be scalarized when vectorizing with the associated 1664 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1665 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1666 1667 /// Holds the instructions known to be uniform after vectorization. 1668 /// The data is collected per VF. 1669 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1670 1671 /// Holds the instructions known to be scalar after vectorization. 1672 /// The data is collected per VF. 1673 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1674 1675 /// Holds the instructions (address computations) that are forced to be 1676 /// scalarized. 1677 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1678 1679 /// PHINodes of the reductions that should be expanded in-loop. 1680 SmallPtrSet<PHINode *, 4> InLoopReductions; 1681 1682 /// A map of inloop reduction operations and their immediate chain operand. 1683 /// FIXME: This can be removed once reductions can be costed correctly in 1684 /// VPlan. This was added to allow quick lookup of the inloop operations. 1685 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1686 1687 /// Returns the expected difference in cost from scalarizing the expression 1688 /// feeding a predicated instruction \p PredInst.
The instructions to 1689 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1690 /// non-negative return value implies the expression will be scalarized. 1691 /// Currently, only single-use chains are considered for scalarization. 1692 InstructionCost computePredInstDiscount(Instruction *PredInst, 1693 ScalarCostsTy &ScalarCosts, 1694 ElementCount VF); 1695 1696 /// Collect the instructions that are uniform after vectorization. An 1697 /// instruction is uniform if we represent it with a single scalar value in 1698 /// the vectorized loop corresponding to each vector iteration. Examples of 1699 /// uniform instructions include pointer operands of consecutive or 1700 /// interleaved memory accesses. Note that although uniformity implies an 1701 /// instruction will be scalar, the reverse is not true. In general, a 1702 /// scalarized instruction will be represented by VF scalar values in the 1703 /// vectorized loop, each corresponding to an iteration of the original 1704 /// scalar loop. 1705 void collectLoopUniforms(ElementCount VF); 1706 1707 /// Collect the instructions that are scalar after vectorization. An 1708 /// instruction is scalar if it is known to be uniform or will be scalarized 1709 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1710 /// to the list if they are used by a load/store instruction that is marked as 1711 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1712 /// VF values in the vectorized loop, each corresponding to an iteration of 1713 /// the original scalar loop. 1714 void collectLoopScalars(ElementCount VF); 1715 1716 /// Keeps cost model vectorization decision and cost for instructions. 1717 /// Right now it is used for memory instructions only. 1718 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1719 std::pair<InstWidening, InstructionCost>>; 1720 1721 DecisionList WideningDecisions; 1722 1723 using CallDecisionList = 1724 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; 1725 1726 CallDecisionList CallWideningDecisions; 1727 1728 /// Returns true if \p V is expected to be vectorized and it needs to be 1729 /// extracted. 1730 bool needsExtract(Value *V, ElementCount VF) const { 1731 Instruction *I = dyn_cast<Instruction>(V); 1732 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1733 TheLoop->isLoopInvariant(I)) 1734 return false; 1735 1736 // Assume we can vectorize V (and hence we need extraction) if the 1737 // scalars are not computed yet. This can happen, because it is called 1738 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1739 // the scalars are collected. That should be a safe assumption in most 1740 // cases, because we check if the operands have vectorizable types 1741 // beforehand in LoopVectorizationLegality. 1742 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); 1743 }; 1744 1745 /// Returns a range containing only operands needing to be extracted. 1746 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1747 ElementCount VF) const { 1748 return SmallVector<Value *, 4>(make_filter_range( 1749 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1750 } 1751 1752 public: 1753 /// The loop that we evaluate. 1754 Loop *TheLoop; 1755 1756 /// Predicated scalar evolution analysis. 1757 PredicatedScalarEvolution &PSE; 1758 1759 /// Loop Info analysis. 1760 LoopInfo *LI; 1761 1762 /// Vectorization legality. 
1763 LoopVectorizationLegality *Legal; 1764 1765 /// Vector target information. 1766 const TargetTransformInfo &TTI; 1767 1768 /// Target Library Info. 1769 const TargetLibraryInfo *TLI; 1770 1771 /// Demanded bits analysis. 1772 DemandedBits *DB; 1773 1774 /// Assumption cache. 1775 AssumptionCache *AC; 1776 1777 /// Interface to emit optimization remarks. 1778 OptimizationRemarkEmitter *ORE; 1779 1780 const Function *TheFunction; 1781 1782 /// Loop Vectorize Hint. 1783 const LoopVectorizeHints *Hints; 1784 1785 /// The interleave access information contains groups of interleaved accesses 1786 /// with the same stride and close to each other. 1787 InterleavedAccessInfo &InterleaveInfo; 1788 1789 /// Values to ignore in the cost model. 1790 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1791 1792 /// Values to ignore in the cost model when VF > 1. 1793 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1794 1795 /// All element types found in the loop. 1796 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1797 }; 1798 } // end namespace llvm 1799 1800 namespace { 1801 /// Helper struct to manage generating runtime checks for vectorization. 1802 /// 1803 /// The runtime checks are created up-front in temporary blocks to allow better 1804 /// estimating the cost and un-linked from the existing IR. After deciding to 1805 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1806 /// temporary blocks are completely removed. 1807 class GeneratedRTChecks { 1808 /// Basic block which contains the generated SCEV checks, if any. 1809 BasicBlock *SCEVCheckBlock = nullptr; 1810 1811 /// The value representing the result of the generated SCEV checks. If it is 1812 /// nullptr, either no SCEV checks have been generated or they have been used. 1813 Value *SCEVCheckCond = nullptr; 1814 1815 /// Basic block which contains the generated memory runtime checks, if any. 1816 BasicBlock *MemCheckBlock = nullptr; 1817 1818 /// The value representing the result of the generated memory runtime checks. 1819 /// If it is nullptr, either no memory runtime checks have been generated or 1820 /// they have been used. 1821 Value *MemRuntimeCheckCond = nullptr; 1822 1823 DominatorTree *DT; 1824 LoopInfo *LI; 1825 TargetTransformInfo *TTI; 1826 1827 SCEVExpander SCEVExp; 1828 SCEVExpander MemCheckExp; 1829 1830 bool CostTooHigh = false; 1831 const bool AddBranchWeights; 1832 1833 Loop *OuterLoop = nullptr; 1834 1835 public: 1836 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1837 TargetTransformInfo *TTI, const DataLayout &DL, 1838 bool AddBranchWeights) 1839 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), 1840 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} 1841 1842 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1843 /// accurately estimate the cost of the runtime checks. The blocks are 1844 /// un-linked from the IR and is added back during vector code generation. If 1845 /// there is no vector code generation, the check blocks are removed 1846 /// completely. 1847 void Create(Loop *L, const LoopAccessInfo &LAI, 1848 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1849 1850 // Hard cutoff to limit compile-time increase in case a very large number of 1851 // runtime checks needs to be generated. 1852 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1853 // profile info. 
1854 CostTooHigh = 1855 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1856 if (CostTooHigh) 1857 return; 1858 1859 BasicBlock *LoopHeader = L->getHeader(); 1860 BasicBlock *Preheader = L->getLoopPreheader(); 1861 1862 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1863 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1864 // may be used by SCEVExpander. The blocks will be un-linked from their 1865 // predecessors and removed from LI & DT at the end of the function. 1866 if (!UnionPred.isAlwaysTrue()) { 1867 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1868 nullptr, "vector.scevcheck"); 1869 1870 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1871 &UnionPred, SCEVCheckBlock->getTerminator()); 1872 } 1873 1874 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1875 if (RtPtrChecking.Need) { 1876 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1877 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1878 "vector.memcheck"); 1879 1880 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1881 if (DiffChecks) { 1882 Value *RuntimeVF = nullptr; 1883 MemRuntimeCheckCond = addDiffRuntimeChecks( 1884 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 1885 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 1886 if (!RuntimeVF) 1887 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 1888 return RuntimeVF; 1889 }, 1890 IC); 1891 } else { 1892 MemRuntimeCheckCond = addRuntimeChecks( 1893 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), 1894 MemCheckExp, VectorizerParams::HoistRuntimeChecks); 1895 } 1896 assert(MemRuntimeCheckCond && 1897 "no RT checks generated although RtPtrChecking " 1898 "claimed checks are required"); 1899 } 1900 1901 if (!MemCheckBlock && !SCEVCheckBlock) 1902 return; 1903 1904 // Unhook the temporary block with the checks, update various places 1905 // accordingly. 1906 if (SCEVCheckBlock) 1907 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1908 if (MemCheckBlock) 1909 MemCheckBlock->replaceAllUsesWith(Preheader); 1910 1911 if (SCEVCheckBlock) { 1912 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1913 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1914 Preheader->getTerminator()->eraseFromParent(); 1915 } 1916 if (MemCheckBlock) { 1917 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1918 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1919 Preheader->getTerminator()->eraseFromParent(); 1920 } 1921 1922 DT->changeImmediateDominator(LoopHeader, Preheader); 1923 if (MemCheckBlock) { 1924 DT->eraseNode(MemCheckBlock); 1925 LI->removeBlock(MemCheckBlock); 1926 } 1927 if (SCEVCheckBlock) { 1928 DT->eraseNode(SCEVCheckBlock); 1929 LI->removeBlock(SCEVCheckBlock); 1930 } 1931 1932 // Outer loop is used as part of the later cost calculations. 
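    // (For example, when the generated checks turn out to be invariant in that
    // outer loop, getCost() below divides the memory-check cost by an estimate
    // of the outer trip count; with illustrative numbers, a check cost of 20
    // and an estimated trip count of 10 is charged as max(20 / 10, 1) = 2.)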
1933 OuterLoop = L->getParentLoop(); 1934 } 1935 1936 InstructionCost getCost() { 1937 if (SCEVCheckBlock || MemCheckBlock) 1938 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 1939 1940 if (CostTooHigh) { 1941 InstructionCost Cost; 1942 Cost.setInvalid(); 1943 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 1944 return Cost; 1945 } 1946 1947 InstructionCost RTCheckCost = 0; 1948 if (SCEVCheckBlock) 1949 for (Instruction &I : *SCEVCheckBlock) { 1950 if (SCEVCheckBlock->getTerminator() == &I) 1951 continue; 1952 InstructionCost C = 1953 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1954 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1955 RTCheckCost += C; 1956 } 1957 if (MemCheckBlock) { 1958 InstructionCost MemCheckCost = 0; 1959 for (Instruction &I : *MemCheckBlock) { 1960 if (MemCheckBlock->getTerminator() == &I) 1961 continue; 1962 InstructionCost C = 1963 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 1964 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 1965 MemCheckCost += C; 1966 } 1967 1968 // If the runtime memory checks are being created inside an outer loop 1969 // we should find out if these checks are outer loop invariant. If so, 1970 // the checks will likely be hoisted out and so the effective cost will 1971 // reduce according to the outer loop trip count. 1972 if (OuterLoop) { 1973 ScalarEvolution *SE = MemCheckExp.getSE(); 1974 // TODO: If profitable, we could refine this further by analysing every 1975 // individual memory check, since there could be a mixture of loop 1976 // variant and invariant checks that mean the final condition is 1977 // variant. 1978 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond); 1979 if (SE->isLoopInvariant(Cond, OuterLoop)) { 1980 // It seems reasonable to assume that we can reduce the effective 1981 // cost of the checks even when we know nothing about the trip 1982 // count. Assume that the outer loop executes at least twice. 1983 unsigned BestTripCount = 2; 1984 1985 // If exact trip count is known use that. 1986 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop)) 1987 BestTripCount = SmallTC; 1988 else if (LoopVectorizeWithBlockFrequency) { 1989 // Else use profile data if available. 1990 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop)) 1991 BestTripCount = *EstimatedTC; 1992 } 1993 1994 BestTripCount = std::max(BestTripCount, 1U); 1995 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; 1996 1997 // Let's ensure the cost is always at least 1. 1998 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), 1999 (InstructionCost::CostType)1); 2000 2001 if (BestTripCount > 1) 2002 LLVM_DEBUG(dbgs() 2003 << "We expect runtime memory checks to be hoisted " 2004 << "out of the outer loop. Cost reduced from " 2005 << MemCheckCost << " to " << NewMemCheckCost << '\n'); 2006 2007 MemCheckCost = NewMemCheckCost; 2008 } 2009 } 2010 2011 RTCheckCost += MemCheckCost; 2012 } 2013 2014 if (SCEVCheckBlock || MemCheckBlock) 2015 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2016 << "\n"); 2017 2018 return RTCheckCost; 2019 } 2020 2021 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2022 /// unused. 
2023 ~GeneratedRTChecks() { 2024 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2025 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2026 if (!SCEVCheckCond) 2027 SCEVCleaner.markResultUsed(); 2028 2029 if (!MemRuntimeCheckCond) 2030 MemCheckCleaner.markResultUsed(); 2031 2032 if (MemRuntimeCheckCond) { 2033 auto &SE = *MemCheckExp.getSE(); 2034 // Memory runtime check generation creates compares that use expanded 2035 // values. Remove them before running the SCEVExpanderCleaners. 2036 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2037 if (MemCheckExp.isInsertedInstruction(&I)) 2038 continue; 2039 SE.forgetValue(&I); 2040 I.eraseFromParent(); 2041 } 2042 } 2043 MemCheckCleaner.cleanup(); 2044 SCEVCleaner.cleanup(); 2045 2046 if (SCEVCheckCond) 2047 SCEVCheckBlock->eraseFromParent(); 2048 if (MemRuntimeCheckCond) 2049 MemCheckBlock->eraseFromParent(); 2050 } 2051 2052 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2053 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2054 /// depending on the generated condition. 2055 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2056 BasicBlock *LoopVectorPreHeader, 2057 BasicBlock *LoopExitBlock) { 2058 if (!SCEVCheckCond) 2059 return nullptr; 2060 2061 Value *Cond = SCEVCheckCond; 2062 // Mark the check as used, to prevent it from being removed during cleanup. 2063 SCEVCheckCond = nullptr; 2064 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2065 if (C->isZero()) 2066 return nullptr; 2067 2068 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2069 2070 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2071 // Create new preheader for vector loop. 2072 if (OuterLoop) 2073 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2074 2075 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2076 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2077 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2078 SCEVCheckBlock); 2079 2080 DT->addNewBlock(SCEVCheckBlock, Pred); 2081 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2082 2083 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); 2084 if (AddBranchWeights) 2085 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false); 2086 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); 2087 return SCEVCheckBlock; 2088 } 2089 2090 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2091 /// the branches to branch to the vector preheader or \p Bypass, depending on 2092 /// the generated condition. 2093 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2094 BasicBlock *LoopVectorPreHeader) { 2095 // Check if we generated code that checks in runtime if arrays overlap. 
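    // (Conceptually, MemRuntimeCheckCond is the "checks failed" predicate: for
    // a loop that writes a[i] and reads b[i] it encodes a pairwise overlap
    // test roughly of the form a + n > b && b + n > a, and the branch created
    // below takes the bypass edge to the scalar loop when it is true. This is
    // an illustrative simplification of what the runtime-check emission
    // produces.)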
2096 if (!MemRuntimeCheckCond) 2097 return nullptr; 2098 2099 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2100 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2101 MemCheckBlock); 2102 2103 DT->addNewBlock(MemCheckBlock, Pred); 2104 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2105 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2106 2107 if (OuterLoop) 2108 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); 2109 2110 BranchInst &BI = 2111 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); 2112 if (AddBranchWeights) { 2113 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false); 2114 } 2115 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); 2116 MemCheckBlock->getTerminator()->setDebugLoc( 2117 Pred->getTerminator()->getDebugLoc()); 2118 2119 // Mark the check as used, to prevent it from being removed during cleanup. 2120 MemRuntimeCheckCond = nullptr; 2121 return MemCheckBlock; 2122 } 2123 }; 2124 } // namespace 2125 2126 static bool useActiveLaneMask(TailFoldingStyle Style) { 2127 return Style == TailFoldingStyle::Data || 2128 Style == TailFoldingStyle::DataAndControlFlow || 2129 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2130 } 2131 2132 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2133 return Style == TailFoldingStyle::DataAndControlFlow || 2134 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2135 } 2136 2137 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2138 // vectorization. The loop needs to be annotated with #pragma omp simd 2139 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2140 // vector length information is not provided, vectorization is not considered 2141 // explicit. Interleave hints are not allowed either. These limitations will be 2142 // relaxed in the future. 2143 // Please, note that we are currently forced to abuse the pragma 'clang 2144 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2145 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2146 // provides *explicit vectorization hints* (LV can bypass legal checks and 2147 // assume that vectorization is legal). However, both hints are implemented 2148 // using the same metadata (llvm.loop.vectorize, processed by 2149 // LoopVectorizeHints). This will be fixed in the future when the native IR 2150 // representation for pragma 'omp simd' is introduced. 2151 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2152 OptimizationRemarkEmitter *ORE) { 2153 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2154 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2155 2156 // Only outer loops with an explicit vectorization hint are supported. 2157 // Unannotated outer loops are ignored. 2158 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2159 return false; 2160 2161 Function *Fn = OuterLp->getHeader()->getParent(); 2162 if (!Hints.allowVectorization(Fn, OuterLp, 2163 true /*VectorizeOnlyWhenForced*/)) { 2164 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2165 return false; 2166 } 2167 2168 if (Hints.getInterleave() > 1) { 2169 // TODO: Interleave support is future work. 
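    // (For instance, an outer loop annotated with
    //   #pragma clang loop vectorize(enable) vectorize_width(4) interleave_count(2)
    // reaches this point with getInterleave() == 2 and is rejected here;
    // illustrative example of a currently unsupported combination.)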
2170 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2171 "outer loops.\n"); 2172 Hints.emitRemarkWithHints(); 2173 return false; 2174 } 2175 2176 return true; 2177 } 2178 2179 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2180 OptimizationRemarkEmitter *ORE, 2181 SmallVectorImpl<Loop *> &V) { 2182 // Collect inner loops and outer loops without irreducible control flow. For 2183 // now, only collect outer loops that have explicit vectorization hints. If we 2184 // are stress testing the VPlan H-CFG construction, we collect the outermost 2185 // loop of every loop nest. 2186 if (L.isInnermost() || VPlanBuildStressTest || 2187 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2188 LoopBlocksRPO RPOT(&L); 2189 RPOT.perform(LI); 2190 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2191 V.push_back(&L); 2192 // TODO: Collect inner loops inside marked outer loops in case 2193 // vectorization fails for the outer loop. Do not invoke 2194 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2195 // already known to be reducible. We can use an inherited attribute for 2196 // that. 2197 return; 2198 } 2199 } 2200 for (Loop *InnerL : L) 2201 collectSupportedLoops(*InnerL, LI, ORE, V); 2202 } 2203 2204 //===----------------------------------------------------------------------===// 2205 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2206 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2207 //===----------------------------------------------------------------------===// 2208 2209 /// Compute the transformed value of Index at offset StartValue using step 2210 /// StepValue. 2211 /// For integer induction, returns StartValue + Index * StepValue. 2212 /// For pointer induction, returns StartValue[Index * StepValue]. 2213 /// FIXME: The newly created binary instructions should contain nsw/nuw 2214 /// flags, which can be found from the original scalar operations. 2215 static Value * 2216 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, 2217 Value *Step, 2218 InductionDescriptor::InductionKind InductionKind, 2219 const BinaryOperator *InductionBinOp) { 2220 Type *StepTy = Step->getType(); 2221 Value *CastedIndex = StepTy->isIntegerTy() 2222 ? B.CreateSExtOrTrunc(Index, StepTy) 2223 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2224 if (CastedIndex != Index) { 2225 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2226 Index = CastedIndex; 2227 } 2228 2229 // Note: the IR at this point is broken. We cannot use SE to create any new 2230 // SCEV and then expand it, hoping that SCEV's simplification will give us 2231 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2232 // lead to various SCEV crashes. So all we can do is to use builder and rely 2233 // on InstCombine for future simplifications. Here we handle some trivial 2234 // cases only. 2235 auto CreateAdd = [&B](Value *X, Value *Y) { 2236 assert(X->getType() == Y->getType() && "Types don't match!"); 2237 if (auto *CX = dyn_cast<ConstantInt>(X)) 2238 if (CX->isZero()) 2239 return Y; 2240 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2241 if (CY->isZero()) 2242 return X; 2243 return B.CreateAdd(X, Y); 2244 }; 2245 2246 // We allow X to be a vector type, in which case Y will potentially be 2247 // splatted into a vector with the same element count. 
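  // (For instance, if X has type <4 x i64> and Y has type i64, Y is first
  // broadcast with CreateVectorSplat to <4 x i64> so that the multiply below
  // is well typed; illustrative note.)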
2248 auto CreateMul = [&B](Value *X, Value *Y) { 2249 assert(X->getType()->getScalarType() == Y->getType() && 2250 "Types don't match!"); 2251 if (auto *CX = dyn_cast<ConstantInt>(X)) 2252 if (CX->isOne()) 2253 return Y; 2254 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2255 if (CY->isOne()) 2256 return X; 2257 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2258 if (XVTy && !isa<VectorType>(Y->getType())) 2259 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2260 return B.CreateMul(X, Y); 2261 }; 2262 2263 switch (InductionKind) { 2264 case InductionDescriptor::IK_IntInduction: { 2265 assert(!isa<VectorType>(Index->getType()) && 2266 "Vector indices not supported for integer inductions yet"); 2267 assert(Index->getType() == StartValue->getType() && 2268 "Index type does not match StartValue type"); 2269 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2270 return B.CreateSub(StartValue, Index); 2271 auto *Offset = CreateMul(Index, Step); 2272 return CreateAdd(StartValue, Offset); 2273 } 2274 case InductionDescriptor::IK_PtrInduction: 2275 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step)); 2276 case InductionDescriptor::IK_FpInduction: { 2277 assert(!isa<VectorType>(Index->getType()) && 2278 "Vector indices not supported for FP inductions yet"); 2279 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2280 assert(InductionBinOp && 2281 (InductionBinOp->getOpcode() == Instruction::FAdd || 2282 InductionBinOp->getOpcode() == Instruction::FSub) && 2283 "Original bin op should be defined for FP induction"); 2284 2285 Value *MulExp = B.CreateFMul(Step, Index); 2286 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2287 "induction"); 2288 } 2289 case InductionDescriptor::IK_NoInduction: 2290 return nullptr; 2291 } 2292 llvm_unreachable("invalid enum"); 2293 } 2294 2295 std::optional<unsigned> getMaxVScale(const Function &F, 2296 const TargetTransformInfo &TTI) { 2297 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) 2298 return MaxVScale; 2299 2300 if (F.hasFnAttribute(Attribute::VScaleRange)) 2301 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2302 2303 return std::nullopt; 2304 } 2305 2306 /// For the given VF and UF and maximum trip count computed for the loop, return 2307 /// whether the induction variable might overflow in the vectorized loop. If not, 2308 /// then we know a runtime overflow check always evaluates to false and can be 2309 /// removed. 2310 static bool isIndvarOverflowCheckKnownFalse( 2311 const LoopVectorizationCostModel *Cost, 2312 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2313 // Always be conservative if we don't know the exact unroll factor. 2314 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); 2315 2316 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2317 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2318 2319 // We know the runtime overflow check is known false iff the (max) trip-count 2320 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2321 // the vector loop induction variable. 
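  // (Worked example with illustrative numbers: for an i8 induction type the
  // maximum unsigned value is 255; with a known maximum trip count of 200 and
  // VF * UF = 16, 255 - 200 = 55 > 16, so the overflow check can be dropped.
  // With a maximum trip count of 250 it could not, since 255 - 250 = 5 < 16.)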
2322 if (unsigned TC = 2323 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { 2324 uint64_t MaxVF = VF.getKnownMinValue(); 2325 if (VF.isScalable()) { 2326 std::optional<unsigned> MaxVScale = 2327 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2328 if (!MaxVScale) 2329 return false; 2330 MaxVF *= *MaxVScale; 2331 } 2332 2333 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2334 } 2335 2336 return false; 2337 } 2338 2339 // Return whether we allow using masked interleave-groups (for dealing with 2340 // strided loads/stores that reside in predicated blocks, or for dealing 2341 // with gaps). 2342 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2343 // If an override option has been passed in for interleaved accesses, use it. 2344 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2345 return EnableMaskedInterleavedMemAccesses; 2346 2347 return TTI.enableMaskedInterleavedAccessVectorization(); 2348 } 2349 2350 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2351 VPReplicateRecipe *RepRecipe, 2352 const VPIteration &Instance, 2353 VPTransformState &State) { 2354 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2355 2356 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2357 // the first lane and part. 2358 if (isa<NoAliasScopeDeclInst>(Instr)) 2359 if (!Instance.isFirstIteration()) 2360 return; 2361 2362 // Does this instruction return a value ? 2363 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2364 2365 Instruction *Cloned = Instr->clone(); 2366 if (!IsVoidRetTy) { 2367 Cloned->setName(Instr->getName() + ".cloned"); 2368 #if !defined(NDEBUG) 2369 // Verify that VPlan type inference results agree with the type of the 2370 // generated values. 2371 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && 2372 "inferred type and type from generated instructions do not match"); 2373 #endif 2374 } 2375 2376 RepRecipe->setFlags(Cloned); 2377 2378 if (auto DL = Instr->getDebugLoc()) 2379 State.setDebugLocFrom(DL); 2380 2381 // Replace the operands of the cloned instructions with their scalar 2382 // equivalents in the new loop. 2383 for (const auto &I : enumerate(RepRecipe->operands())) { 2384 auto InputInstance = Instance; 2385 VPValue *Operand = I.value(); 2386 if (vputils::isUniformAfterVectorization(Operand)) 2387 InputInstance.Lane = VPLane::getFirstLane(); 2388 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2389 } 2390 State.addNewMetadata(Cloned, Instr); 2391 2392 // Place the cloned scalar in the new loop. 2393 State.Builder.Insert(Cloned); 2394 2395 State.set(RepRecipe, Cloned, Instance); 2396 2397 // If we just cloned a new assumption, add it the assumption cache. 2398 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2399 AC->registerAssumption(II); 2400 2401 // End if-block. 2402 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); 2403 if (IfPredicateInstr) 2404 PredicatedInstructions.push_back(Cloned); 2405 } 2406 2407 Value * 2408 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2409 if (VectorTripCount) 2410 return VectorTripCount; 2411 2412 Value *TC = getTripCount(); 2413 IRBuilder<> Builder(InsertBlock->getTerminator()); 2414 2415 Type *Ty = TC->getType(); 2416 // This is where we can make the step a runtime constant. 
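  // (For example, with VF = <4 x i32> and UF = 2 the step below is the
  // constant 8; for a scalable VF = <vscale x 4 x i32> it is 8 * vscale,
  // materialized at runtime via vscale. Illustrative values only.)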
2417 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2418 2419 // If the tail is to be folded by masking, round the number of iterations N 2420 // up to a multiple of Step instead of rounding down. This is done by first 2421 // adding Step-1 and then rounding down. Note that it's ok if this addition 2422 // overflows: the vector induction variable will eventually wrap to zero given 2423 // that it starts at zero and its Step is a power of two; the loop will then 2424 // exit, with the last early-exit vector comparison also producing all-true. 2425 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2426 // is accounted for in emitIterationCountCheck that adds an overflow check. 2427 if (Cost->foldTailByMasking()) { 2428 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2429 "VF*UF must be a power of 2 when folding tail by masking"); 2430 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)), 2431 "n.rnd.up"); 2432 } 2433 2434 // Now we need to generate the expression for the part of the loop that the 2435 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2436 // iterations are not required for correctness, or N - Step, otherwise. Step 2437 // is equal to the vectorization factor (number of SIMD elements) times the 2438 // unroll factor (number of SIMD instructions). 2439 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2440 2441 // There are cases where we *must* run at least one iteration in the remainder 2442 // loop. See the cost model for when this can happen. If the step evenly 2443 // divides the trip count, we set the remainder to be equal to the step. If 2444 // the step does not evenly divide the trip count, no adjustment is necessary 2445 // since there will already be scalar iterations. Note that the minimum 2446 // iterations check ensures that N >= Step. 2447 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2448 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2449 R = Builder.CreateSelect(IsZero, Step, R); 2450 } 2451 2452 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2453 2454 return VectorTripCount; 2455 } 2456 2457 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2458 Value *Count = getTripCount(); 2459 // Reuse existing vector loop preheader for TC checks. 2460 // Note that new preheader block is generated for vector loop. 2461 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2462 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2463 2464 // Generate code to check if the loop's trip count is less than VF * UF, or 2465 // equal to it in case a scalar epilogue is required; this implies that the 2466 // vector trip count is zero. This check also covers the case where adding one 2467 // to the backedge-taken count overflowed leading to an incorrect trip count 2468 // of zero. In this case we will also jump to the scalar loop. 2469 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2470 : ICmpInst::ICMP_ULT; 2471 2472 // If tail is to be folded, vector loop takes care of all iterations. 2473 Type *CountTy = Count->getType(); 2474 Value *CheckMinIters = Builder.getFalse(); 2475 auto CreateStep = [&]() -> Value * { 2476 // Create step with max(MinProTripCount, UF * VF). 
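    // (E.g. with UF * VF = 8 but MinProfitableTripCount = 16, the value
    // compared against the trip count below is 16, so loops too short to be
    // profitable still take the bypass to the scalar loop; illustrative
    // numbers.)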
2477 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2478 return createStepForVF(Builder, CountTy, VF, UF); 2479 2480 Value *MinProfTC = 2481 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2482 if (!VF.isScalable()) 2483 return MinProfTC; 2484 return Builder.CreateBinaryIntrinsic( 2485 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2486 }; 2487 2488 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2489 if (Style == TailFoldingStyle::None) 2490 CheckMinIters = 2491 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2492 else if (VF.isScalable() && 2493 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2494 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2495 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2496 // an overflow to zero when updating induction variables and so an 2497 // additional overflow check is required before entering the vector loop. 2498 2499 // Get the maximum unsigned value for the type. 2500 Value *MaxUIntTripCount = 2501 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2502 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2503 2504 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2505 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2506 } 2507 2508 // Create new preheader for vector loop. 2509 LoopVectorPreHeader = 2510 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2511 "vector.ph"); 2512 2513 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2514 DT->getNode(Bypass)->getIDom()) && 2515 "TC check is expected to dominate Bypass"); 2516 2517 // Update dominator for Bypass & LoopExit (if needed). 2518 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2519 BranchInst &BI = 2520 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 2521 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 2522 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); 2523 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 2524 LoopBypassBlocks.push_back(TCCheckBlock); 2525 } 2526 2527 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2528 BasicBlock *const SCEVCheckBlock = 2529 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 2530 if (!SCEVCheckBlock) 2531 return nullptr; 2532 2533 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2534 (OptForSizeBasedOnProfile && 2535 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2536 "Cannot SCEV check stride or overflow when optimizing for size"); 2537 2538 2539 // Update dominator only if this is first RT check. 2540 if (LoopBypassBlocks.empty()) { 2541 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2542 if (!Cost->requiresScalarEpilogue(VF.isVector())) 2543 // If there is an epilogue which must run, there's no edge from the 2544 // middle block to exit blocks and thus no need to update the immediate 2545 // dominator of the exit blocks. 2546 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2547 } 2548 2549 LoopBypassBlocks.push_back(SCEVCheckBlock); 2550 AddedSafetyChecks = true; 2551 return SCEVCheckBlock; 2552 } 2553 2554 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2555 // VPlan-native path does not do any analysis for runtime checks currently. 
2556 if (EnableVPlanNativePath) 2557 return nullptr; 2558 2559 BasicBlock *const MemCheckBlock = 2560 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2561 2562 // Check if we generated code that checks in runtime if arrays overlap. We put 2563 // the checks into a separate block to make the more common case of few 2564 // elements faster. 2565 if (!MemCheckBlock) 2566 return nullptr; 2567 2568 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2569 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2570 "Cannot emit memory checks when optimizing for size, unless forced " 2571 "to vectorize."); 2572 ORE->emit([&]() { 2573 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2574 OrigLoop->getStartLoc(), 2575 OrigLoop->getHeader()) 2576 << "Code-size may be reduced by not forcing " 2577 "vectorization, or by source-code modifications " 2578 "eliminating the need for runtime checks " 2579 "(e.g., adding 'restrict')."; 2580 }); 2581 } 2582 2583 LoopBypassBlocks.push_back(MemCheckBlock); 2584 2585 AddedSafetyChecks = true; 2586 2587 return MemCheckBlock; 2588 } 2589 2590 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 2591 LoopScalarBody = OrigLoop->getHeader(); 2592 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2593 assert(LoopVectorPreHeader && "Invalid loop structure"); 2594 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 2595 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && 2596 "multiple exit loop without required epilogue?"); 2597 2598 LoopMiddleBlock = 2599 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2600 LI, nullptr, Twine(Prefix) + "middle.block"); 2601 LoopScalarPreHeader = 2602 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2603 nullptr, Twine(Prefix) + "scalar.ph"); 2604 } 2605 2606 PHINode *InnerLoopVectorizer::createInductionResumeValue( 2607 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, 2608 ArrayRef<BasicBlock *> BypassBlocks, 2609 std::pair<BasicBlock *, Value *> AdditionalBypass) { 2610 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 2611 assert(VectorTripCount && "Expected valid arguments"); 2612 2613 Instruction *OldInduction = Legal->getPrimaryInduction(); 2614 Value *&EndValue = IVEndValues[OrigPhi]; 2615 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 2616 if (OrigPhi == OldInduction) { 2617 // We know what the end value is. 2618 EndValue = VectorTripCount; 2619 } else { 2620 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 2621 2622 // Fast-math-flags propagate from the original induction instruction. 2623 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 2624 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 2625 2626 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), 2627 Step, II.getKind(), II.getInductionBinOp()); 2628 EndValue->setName("ind.end"); 2629 2630 // Compute the end value for the additional bypass (if applicable). 2631 if (AdditionalBypass.first) { 2632 B.SetInsertPoint(AdditionalBypass.first, 2633 AdditionalBypass.first->getFirstInsertionPt()); 2634 EndValueFromAdditionalBypass = 2635 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(), 2636 Step, II.getKind(), II.getInductionBinOp()); 2637 EndValueFromAdditionalBypass->setName("ind.end"); 2638 } 2639 } 2640 2641 // Create phi nodes to merge from the backedge-taken check block. 
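  // (For example, for a canonical induction starting at 0, EndValue is simply
  // the vector trip count n.vec; the bc.resume.val phi created below then
  // yields n.vec when the scalar loop is entered from the middle block and the
  // original start value 0 when it is entered from a bypass block.
  // Illustrative sketch of the common case.)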
2642 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 2643 LoopScalarPreHeader->getFirstNonPHI()); 2644 // Copy original phi DL over to the new one. 2645 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 2646 2647 // The new PHI merges the original incoming value, in case of a bypass, 2648 // or the value at the end of the vectorized loop. 2649 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 2650 2651 // Fix the scalar body counter (PHI node). 2652 // The old induction's phi node in the scalar body needs the truncated 2653 // value. 2654 for (BasicBlock *BB : BypassBlocks) 2655 BCResumeVal->addIncoming(II.getStartValue(), BB); 2656 2657 if (AdditionalBypass.first) 2658 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 2659 EndValueFromAdditionalBypass); 2660 return BCResumeVal; 2661 } 2662 2663 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 2664 /// expansion results. 2665 static Value *getExpandedStep(const InductionDescriptor &ID, 2666 const SCEV2ValueTy &ExpandedSCEVs) { 2667 const SCEV *Step = ID.getStep(); 2668 if (auto *C = dyn_cast<SCEVConstant>(Step)) 2669 return C->getValue(); 2670 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 2671 return U->getValue(); 2672 auto I = ExpandedSCEVs.find(Step); 2673 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 2674 return I->second; 2675 } 2676 2677 void InnerLoopVectorizer::createInductionResumeValues( 2678 const SCEV2ValueTy &ExpandedSCEVs, 2679 std::pair<BasicBlock *, Value *> AdditionalBypass) { 2680 assert(((AdditionalBypass.first && AdditionalBypass.second) || 2681 (!AdditionalBypass.first && !AdditionalBypass.second)) && 2682 "Inconsistent information about additional bypass."); 2683 // We are going to resume the execution of the scalar loop. 2684 // Go over all of the induction variables that we found and fix the 2685 // PHIs that are left in the scalar version of the loop. 2686 // The starting values of PHI nodes depend on the counter of the last 2687 // iteration in the vectorized loop. 2688 // If we come from a bypass edge then we need to start from the original 2689 // start value. 2690 for (const auto &InductionEntry : Legal->getInductionVars()) { 2691 PHINode *OrigPhi = InductionEntry.first; 2692 const InductionDescriptor &II = InductionEntry.second; 2693 PHINode *BCResumeVal = createInductionResumeValue( 2694 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, 2695 AdditionalBypass); 2696 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 2697 } 2698 } 2699 2700 std::pair<BasicBlock *, Value *> 2701 InnerLoopVectorizer::createVectorizedLoopSkeleton( 2702 const SCEV2ValueTy &ExpandedSCEVs) { 2703 /* 2704 In this function we generate a new loop. The new loop will contain 2705 the vectorized instructions while the old loop will continue to run the 2706 scalar remainder. 2707 2708 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 2709 / | preheader are expanded here. Eventually all required SCEV 2710 / | expansion should happen here. 2711 / v 2712 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2713 | / | 2714 | / v 2715 || [ ] <-- vector pre header. 2716 |/ | 2717 | v 2718 | [ ] \ 2719 | [ ]_| <-- vector loop (created during VPlan execution). 2720 | | 2721 | v 2722 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to 2723 | | successors created during VPlan execution) 2724 \/ | 2725 /\ v 2726 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). 
2727 | | 2728 (opt) v <-- edge from middle to exit iff epilogue is not required. 2729 | [ ] \ 2730 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 2731 \ | 2732 \ v 2733 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) 2734 ... 2735 */ 2736 2737 // Create an empty vector loop, and prepare basic blocks for the runtime 2738 // checks. 2739 createVectorLoopSkeleton(""); 2740 2741 // Now, compare the new count to zero. If it is zero skip the vector loop and 2742 // jump to the scalar loop. This check also covers the case where the 2743 // backedge-taken count is uint##_max: adding one to it will overflow leading 2744 // to an incorrect trip count of zero. In this (rare) case we will also jump 2745 // to the scalar loop. 2746 emitIterationCountCheck(LoopScalarPreHeader); 2747 2748 // Generate the code to check any assumptions that we've made for SCEV 2749 // expressions. 2750 emitSCEVChecks(LoopScalarPreHeader); 2751 2752 // Generate the code that checks in runtime if arrays overlap. We put the 2753 // checks into a separate block to make the more common case of few elements 2754 // faster. 2755 emitMemRuntimeChecks(LoopScalarPreHeader); 2756 2757 // Emit phis for the new starting index of the scalar loop. 2758 createInductionResumeValues(ExpandedSCEVs); 2759 2760 return {LoopVectorPreHeader, nullptr}; 2761 } 2762 2763 // Fix up external users of the induction variable. At this point, we are 2764 // in LCSSA form, with all external PHIs that use the IV having one input value, 2765 // coming from the remainder loop. We need those PHIs to also have a correct 2766 // value for the IV when arriving directly from the middle block. 2767 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 2768 const InductionDescriptor &II, 2769 Value *VectorTripCount, Value *EndValue, 2770 BasicBlock *MiddleBlock, 2771 BasicBlock *VectorHeader, VPlan &Plan, 2772 VPTransformState &State) { 2773 // There are two kinds of external IV usages - those that use the value 2774 // computed in the last iteration (the PHI) and those that use the penultimate 2775 // value (the value that feeds into the phi from the loop latch). 2776 // We allow both, but they, obviously, have different values. 2777 2778 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 2779 2780 DenseMap<Value *, Value *> MissingVals; 2781 2782 // An external user of the last iteration's value should see the value that 2783 // the remainder loop uses to initialize its own IV. 2784 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 2785 for (User *U : PostInc->users()) { 2786 Instruction *UI = cast<Instruction>(U); 2787 if (!OrigLoop->contains(UI)) { 2788 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 2789 MissingVals[UI] = EndValue; 2790 } 2791 } 2792 2793 // An external user of the penultimate value need to see EndValue - Step. 2794 // The simplest way to get this is to recompute it from the constituent SCEVs, 2795 // that is Start + (Step * (CRD - 1)). 2796 for (User *U : OrigPhi->users()) { 2797 auto *UI = cast<Instruction>(U); 2798 if (!OrigLoop->contains(UI)) { 2799 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 2800 IRBuilder<> B(MiddleBlock->getTerminator()); 2801 2802 // Fast-math-flags propagate from the original induction instruction. 
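// (Illustrative note: for a floating-point induction such as
//   %f.next = fadd fast float %f, 1.0
// the escape value computed below is built with the same 'fast' flags, so it
// folds consistently with the original induction update.)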
2803 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 2804 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 2805 2806 Value *CountMinusOne = B.CreateSub( 2807 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 2808 CountMinusOne->setName("cmo"); 2809 2810 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); 2811 assert(StepVPV && "step must have been expanded during VPlan execution"); 2812 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() 2813 : State.get(StepVPV, {0, 0}); 2814 Value *Escape = 2815 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, 2816 II.getKind(), II.getInductionBinOp()); 2817 Escape->setName("ind.escape"); 2818 MissingVals[UI] = Escape; 2819 } 2820 } 2821 2822 for (auto &I : MissingVals) { 2823 PHINode *PHI = cast<PHINode>(I.first); 2824 // One corner case we have to handle is two IVs "chasing" each-other, 2825 // that is %IV2 = phi [...], [ %IV1, %latch ] 2826 // In this case, if IV1 has an external use, we need to avoid adding both 2827 // "last value of IV1" and "penultimate value of IV2". So, verify that we 2828 // don't already have an incoming value for the middle block. 2829 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 2830 PHI->addIncoming(I.second, MiddleBlock); 2831 Plan.removeLiveOut(PHI); 2832 } 2833 } 2834 } 2835 2836 namespace { 2837 2838 struct CSEDenseMapInfo { 2839 static bool canHandle(const Instruction *I) { 2840 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 2841 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 2842 } 2843 2844 static inline Instruction *getEmptyKey() { 2845 return DenseMapInfo<Instruction *>::getEmptyKey(); 2846 } 2847 2848 static inline Instruction *getTombstoneKey() { 2849 return DenseMapInfo<Instruction *>::getTombstoneKey(); 2850 } 2851 2852 static unsigned getHashValue(const Instruction *I) { 2853 assert(canHandle(I) && "Unknown instruction!"); 2854 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 2855 I->value_op_end())); 2856 } 2857 2858 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 2859 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 2860 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 2861 return LHS == RHS; 2862 return LHS->isIdenticalTo(RHS); 2863 } 2864 }; 2865 2866 } // end anonymous namespace 2867 2868 ///Perform cse of induction variable instructions. 2869 static void cse(BasicBlock *BB) { 2870 // Perform simple cse. 2871 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 2872 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 2873 if (!CSEDenseMapInfo::canHandle(&In)) 2874 continue; 2875 2876 // Check if we can replace this instruction with any of the 2877 // visited instructions. 2878 if (Instruction *V = CSEMap.lookup(&In)) { 2879 In.replaceAllUsesWith(V); 2880 In.eraseFromParent(); 2881 continue; 2882 } 2883 2884 CSEMap[&In] = &In; 2885 } 2886 } 2887 2888 InstructionCost 2889 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 2890 ElementCount VF) const { 2891 // We only need to calculate a cost if the VF is scalar; for actual vectors 2892 // we should already have a pre-calculated cost at each VF. 
2893 if (!VF.isScalar()) 2894 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; 2895 2896 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 2897 Type *RetTy = CI->getType(); 2898 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 2899 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) 2900 return *RedCost; 2901 2902 SmallVector<Type *, 4> Tys; 2903 for (auto &ArgOp : CI->args()) 2904 Tys.push_back(ArgOp->getType()); 2905 2906 InstructionCost ScalarCallCost = 2907 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); 2908 2909 // If this is an intrinsic we may have a lower cost for it. 2910 if (getVectorIntrinsicIDForCall(CI, TLI)) { 2911 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 2912 return std::min(ScalarCallCost, IntrinsicCost); 2913 } 2914 return ScalarCallCost; 2915 } 2916 2917 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 2918 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 2919 return Elt; 2920 return VectorType::get(Elt, VF); 2921 } 2922 2923 InstructionCost 2924 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 2925 ElementCount VF) const { 2926 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 2927 assert(ID && "Expected intrinsic call!"); 2928 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 2929 FastMathFlags FMF; 2930 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 2931 FMF = FPMO->getFastMathFlags(); 2932 2933 SmallVector<const Value *> Arguments(CI->args()); 2934 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 2935 SmallVector<Type *> ParamTys; 2936 std::transform(FTy->param_begin(), FTy->param_end(), 2937 std::back_inserter(ParamTys), 2938 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 2939 2940 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 2941 dyn_cast<IntrinsicInst>(CI)); 2942 return TTI.getIntrinsicInstrCost(CostAttrs, 2943 TargetTransformInfo::TCK_RecipThroughput); 2944 } 2945 2946 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 2947 VPlan &Plan) { 2948 // Fix widened non-induction PHIs by setting up the PHI operands. 2949 if (EnableVPlanNativePath) 2950 fixNonInductionPHIs(Plan, State); 2951 2952 // Forget the original basic block. 2953 PSE.getSE()->forgetLoop(OrigLoop); 2954 PSE.getSE()->forgetBlockAndLoopDispositions(); 2955 2956 // After vectorization, the exit blocks of the original loop will have 2957 // additional predecessors. Invalidate SCEVs for the exit phis in case SE 2958 // looked through single-entry phis. 2959 SmallVector<BasicBlock *> ExitBlocks; 2960 OrigLoop->getExitBlocks(ExitBlocks); 2961 for (BasicBlock *Exit : ExitBlocks) 2962 for (PHINode &PN : Exit->phis()) 2963 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); 2964 2965 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); 2966 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); 2967 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 2968 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2969 // No edge from the middle block to the unique exit block has been inserted 2970 // and there is nothing to fix from vector loop; phis should have incoming 2971 // from scalar loop only. 2972 } else { 2973 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking 2974 // the cost model. 
2975 2976 // If we inserted an edge from the middle block to the unique exit block, 2977 // update uses outside the loop (phis) to account for the newly inserted 2978 // edge. 2979 2980 // Fix up external users of the induction variables. 2981 for (const auto &Entry : Legal->getInductionVars()) 2982 fixupIVUsers(Entry.first, Entry.second, 2983 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 2984 IVEndValues[Entry.first], LoopMiddleBlock, 2985 VectorLoop->getHeader(), Plan, State); 2986 } 2987 2988 // Fix live-out phis not already fixed earlier. 2989 for (const auto &KV : Plan.getLiveOuts()) 2990 KV.second->fixPhi(Plan, State); 2991 2992 for (Instruction *PI : PredicatedInstructions) 2993 sinkScalarOperands(&*PI); 2994 2995 // Remove redundant induction instructions. 2996 cse(VectorLoop->getHeader()); 2997 2998 // Set/update profile weights for the vector and remainder loops as the original 2999 // loop's iterations are now distributed among them. Note that the original loop 3000 // represented by LoopScalarBody becomes the remainder loop after vectorization. 3001 // 3002 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 3003 // end up with a slightly less precise result, but that should be OK since 3004 // the profile is not inherently precise anyway. Note also that a possible bypass of 3005 // the vector code caused by legality checks is ignored, assigning all the weight 3006 // to the vector loop, optimistically. 3007 // 3008 // For scalable vectorization we can't know at compile time how many iterations 3009 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3010 // vscale of '1'. 3011 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3012 LI->getLoopFor(LoopScalarBody), 3013 VF.getKnownMinValue() * UF); 3014 } 3015 3016 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3017 // The basic block and loop containing the predicated instruction. 3018 auto *PredBB = PredInst->getParent(); 3019 auto *VectorLoop = LI->getLoopFor(PredBB); 3020 3021 // Initialize a worklist with the operands of the predicated instruction. 3022 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3023 3024 // Holds instructions that we need to analyze again. An instruction may be 3025 // reanalyzed if we don't yet know whether we can sink it. 3026 SmallVector<Instruction *, 8> InstsToReanalyze; 3027 3028 // Returns true if a given use occurs in the predicated block. Phi nodes use 3029 // their operands in their corresponding predecessor blocks. 3030 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3031 auto *I = cast<Instruction>(U.getUser()); 3032 BasicBlock *BB = I->getParent(); 3033 if (auto *Phi = dyn_cast<PHINode>(I)) 3034 BB = Phi->getIncomingBlock( 3035 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3036 return BB == PredBB; 3037 }; 3038 3039 // Iteratively sink the scalarized operands of the predicated instruction 3040 // into the block we created for it. When an instruction is sunk, its 3041 // operands are then added to the worklist. The algorithm ends when a pass 3042 // through the worklist doesn't sink a single instruction. 3043 bool Changed; 3044 do { 3045 // Add the instructions that need to be reanalyzed to the worklist, and 3046 // reset the changed indicator.
3047 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3048 InstsToReanalyze.clear(); 3049 Changed = false; 3050 3051 while (!Worklist.empty()) { 3052 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3053 3054 // We can't sink an instruction if it is a phi node, is not in the loop, 3055 // may have side effects or may read from memory. 3056 // TODO Could dor more granular checking to allow sinking a load past non-store instructions. 3057 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 3058 I->mayHaveSideEffects() || I->mayReadFromMemory()) 3059 continue; 3060 3061 // If the instruction is already in PredBB, check if we can sink its 3062 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 3063 // sinking the scalar instruction I, hence it appears in PredBB; but it 3064 // may have failed to sink I's operands (recursively), which we try 3065 // (again) here. 3066 if (I->getParent() == PredBB) { 3067 Worklist.insert(I->op_begin(), I->op_end()); 3068 continue; 3069 } 3070 3071 // It's legal to sink the instruction if all its uses occur in the 3072 // predicated block. Otherwise, there's nothing to do yet, and we may 3073 // need to reanalyze the instruction. 3074 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 3075 InstsToReanalyze.push_back(I); 3076 continue; 3077 } 3078 3079 // Move the instruction to the beginning of the predicated block, and add 3080 // it's operands to the worklist. 3081 I->moveBefore(&*PredBB->getFirstInsertionPt()); 3082 Worklist.insert(I->op_begin(), I->op_end()); 3083 3084 // The sinking may have enabled other instructions to be sunk, so we will 3085 // need to iterate. 3086 Changed = true; 3087 } 3088 } while (Changed); 3089 } 3090 3091 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 3092 VPTransformState &State) { 3093 auto Iter = vp_depth_first_deep(Plan.getEntry()); 3094 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 3095 for (VPRecipeBase &P : VPBB->phis()) { 3096 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 3097 if (!VPPhi) 3098 continue; 3099 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 3100 // Make sure the builder has a valid insert point. 3101 Builder.SetInsertPoint(NewPhi); 3102 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 3103 VPValue *Inc = VPPhi->getIncomingValue(i); 3104 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 3105 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 3106 } 3107 } 3108 } 3109 } 3110 3111 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 3112 // We should not collect Scalars more than once per VF. Right now, this 3113 // function is called from collectUniformsAndScalars(), which already does 3114 // this check. Collecting Scalars for VF=1 does not make any sense. 3115 assert(VF.isVector() && !Scalars.contains(VF) && 3116 "This function should not be visited twice for the same VF"); 3117 3118 // This avoids any chances of creating a REPLICATE recipe during planning 3119 // since that would result in generation of scalarized code during execution, 3120 // which is not supported for scalable vectors. 3121 if (VF.isScalable()) { 3122 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3123 return; 3124 } 3125 3126 SmallSetVector<Instruction *, 8> Worklist; 3127 3128 // These sets are used to seed the analysis with pointers used by memory 3129 // accesses that will remain scalar. 
3130 SmallSetVector<Instruction *, 8> ScalarPtrs; 3131 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 3132 auto *Latch = TheLoop->getLoopLatch(); 3133 3134 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 3135 // The pointer operands of loads and stores will be scalar as long as the 3136 // memory access is not a gather or scatter operation. The value operand of a 3137 // store will remain scalar if the store is scalarized. 3138 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 3139 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 3140 assert(WideningDecision != CM_Unknown && 3141 "Widening decision should be ready at this moment"); 3142 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 3143 if (Ptr == Store->getValueOperand()) 3144 return WideningDecision == CM_Scalarize; 3145 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 3146 "Ptr is neither a value or pointer operand"); 3147 return WideningDecision != CM_GatherScatter; 3148 }; 3149 3150 // A helper that returns true if the given value is a bitcast or 3151 // getelementptr instruction contained in the loop. 3152 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 3153 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 3154 isa<GetElementPtrInst>(V)) && 3155 !TheLoop->isLoopInvariant(V); 3156 }; 3157 3158 // A helper that evaluates a memory access's use of a pointer. If the use will 3159 // be a scalar use and the pointer is only used by memory accesses, we place 3160 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 3161 // PossibleNonScalarPtrs. 3162 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 3163 // We only care about bitcast and getelementptr instructions contained in 3164 // the loop. 3165 if (!isLoopVaryingBitCastOrGEP(Ptr)) 3166 return; 3167 3168 // If the pointer has already been identified as scalar (e.g., if it was 3169 // also identified as uniform), there's nothing to do. 3170 auto *I = cast<Instruction>(Ptr); 3171 if (Worklist.count(I)) 3172 return; 3173 3174 // If the use of the pointer will be a scalar use, and all users of the 3175 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 3176 // place the pointer in PossibleNonScalarPtrs. 3177 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 3178 return isa<LoadInst>(U) || isa<StoreInst>(U); 3179 })) 3180 ScalarPtrs.insert(I); 3181 else 3182 PossibleNonScalarPtrs.insert(I); 3183 }; 3184 3185 // We seed the scalars analysis with three classes of instructions: (1) 3186 // instructions marked uniform-after-vectorization and (2) bitcast, 3187 // getelementptr and (pointer) phi instructions used by memory accesses 3188 // requiring a scalar use. 3189 // 3190 // (1) Add to the worklist all instructions that have been identified as 3191 // uniform-after-vectorization. 3192 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3193 3194 // (2) Add to the worklist all bitcast and getelementptr instructions used by 3195 // memory accesses requiring a scalar use. The pointer operands of loads and 3196 // stores will be scalar as long as the memory accesses is not a gather or 3197 // scatter operation. The value operand of a store will remain scalar if the 3198 // store is scalarized. 
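// Illustrative example (hypothetical IR): given
//   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
//   %v = load i32, ptr %gep
// a consecutive (CM_Widen) load only needs the lane-0 address, so the use of
// %gep is a scalar use and the GEP is seeded below; a CM_GatherScatter access
// would need a vector of pointers instead, so its pointer is not seeded.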
3199 for (auto *BB : TheLoop->blocks()) 3200 for (auto &I : *BB) { 3201 if (auto *Load = dyn_cast<LoadInst>(&I)) { 3202 evaluatePtrUse(Load, Load->getPointerOperand()); 3203 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 3204 evaluatePtrUse(Store, Store->getPointerOperand()); 3205 evaluatePtrUse(Store, Store->getValueOperand()); 3206 } 3207 } 3208 for (auto *I : ScalarPtrs) 3209 if (!PossibleNonScalarPtrs.count(I)) { 3210 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 3211 Worklist.insert(I); 3212 } 3213 3214 // Insert the forced scalars. 3215 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 3216 // induction variable when the PHI user is scalarized. 3217 auto ForcedScalar = ForcedScalars.find(VF); 3218 if (ForcedScalar != ForcedScalars.end()) 3219 for (auto *I : ForcedScalar->second) { 3220 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 3221 Worklist.insert(I); 3222 } 3223 3224 // Expand the worklist by looking through any bitcasts and getelementptr 3225 // instructions we've already identified as scalar. This is similar to the 3226 // expansion step in collectLoopUniforms(); however, here we're only 3227 // expanding to include additional bitcasts and getelementptr instructions. 3228 unsigned Idx = 0; 3229 while (Idx != Worklist.size()) { 3230 Instruction *Dst = Worklist[Idx++]; 3231 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 3232 continue; 3233 auto *Src = cast<Instruction>(Dst->getOperand(0)); 3234 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 3235 auto *J = cast<Instruction>(U); 3236 return !TheLoop->contains(J) || Worklist.count(J) || 3237 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 3238 isScalarUse(J, Src)); 3239 })) { 3240 Worklist.insert(Src); 3241 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 3242 } 3243 } 3244 3245 // An induction variable will remain scalar if all users of the induction 3246 // variable and induction variable update remain scalar. 3247 for (const auto &Induction : Legal->getInductionVars()) { 3248 auto *Ind = Induction.first; 3249 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3250 3251 // If tail-folding is applied, the primary induction variable will be used 3252 // to feed a vector compare. 3253 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 3254 continue; 3255 3256 // Returns true if \p Indvar is a pointer induction that is used directly by 3257 // load/store instruction \p I. 3258 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 3259 Instruction *I) { 3260 return Induction.second.getKind() == 3261 InductionDescriptor::IK_PtrInduction && 3262 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 3263 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 3264 }; 3265 3266 // Determine if all users of the induction variable are scalar after 3267 // vectorization. 3268 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 3269 auto *I = cast<Instruction>(U); 3270 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3271 IsDirectLoadStoreFromPtrIndvar(Ind, I); 3272 }); 3273 if (!ScalarInd) 3274 continue; 3275 3276 // If the induction variable update is a fixed-order recurrence, neither the 3277 // induction variable or its update should be marked scalar after 3278 // vectorization. 
3279 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate); 3280 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi)) 3281 continue; 3282 3283 // Determine if all users of the induction variable update instruction are 3284 // scalar after vectorization. 3285 auto ScalarIndUpdate = 3286 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 3287 auto *I = cast<Instruction>(U); 3288 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 3289 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 3290 }); 3291 if (!ScalarIndUpdate) 3292 continue; 3293 3294 // The induction variable and its update instruction will remain scalar. 3295 Worklist.insert(Ind); 3296 Worklist.insert(IndUpdate); 3297 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 3298 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 3299 << "\n"); 3300 } 3301 3302 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 3303 } 3304 3305 bool LoopVectorizationCostModel::isScalarWithPredication( 3306 Instruction *I, ElementCount VF) const { 3307 if (!isPredicatedInst(I)) 3308 return false; 3309 3310 // Do we have a non-scalar lowering for this predicated 3311 // instruction? No - it is scalar with predication. 3312 switch(I->getOpcode()) { 3313 default: 3314 return true; 3315 case Instruction::Call: 3316 if (VF.isScalar()) 3317 return true; 3318 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF)) 3319 .Kind == CM_Scalarize; 3320 case Instruction::Load: 3321 case Instruction::Store: { 3322 auto *Ptr = getLoadStorePointerOperand(I); 3323 auto *Ty = getLoadStoreType(I); 3324 Type *VTy = Ty; 3325 if (VF.isVector()) 3326 VTy = VectorType::get(Ty, VF); 3327 const Align Alignment = getLoadStoreAlignment(I); 3328 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 3329 TTI.isLegalMaskedGather(VTy, Alignment)) 3330 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 3331 TTI.isLegalMaskedScatter(VTy, Alignment)); 3332 } 3333 case Instruction::UDiv: 3334 case Instruction::SDiv: 3335 case Instruction::SRem: 3336 case Instruction::URem: { 3337 // We have the option to use the safe-divisor idiom to avoid predication. 3338 // The cost based decision here will always select safe-divisor for 3339 // scalable vectors as scalarization isn't legal. 3340 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 3341 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); 3342 } 3343 } 3344 } 3345 3346 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { 3347 if (!blockNeedsPredicationForAnyReason(I->getParent())) 3348 return false; 3349 3350 // Can we prove this instruction is safe to unconditionally execute? 3351 // If not, we must use some form of predication. 3352 switch(I->getOpcode()) { 3353 default: 3354 return false; 3355 case Instruction::Load: 3356 case Instruction::Store: { 3357 if (!Legal->isMaskRequired(I)) 3358 return false; 3359 // When we know the load's address is loop invariant and the instruction 3360 // in the original scalar loop was unconditionally executed then we 3361 // don't need to mark it as a predicated instruction. Tail folding may 3362 // introduce additional predication, but we're guaranteed to always have 3363 // at least one active lane. We call Legal->blockNeedsPredication here 3364 // because it doesn't query tail-folding. For stores, we need to prove 3365 // both speculation safety (which follows from the same argument as loads), 3366 // but also must prove the value being stored is correct. 
The easiest 3367 // form of the later is to require that all values stored are the same. 3368 if (Legal->isInvariant(getLoadStorePointerOperand(I)) && 3369 (isa<LoadInst>(I) || 3370 (isa<StoreInst>(I) && 3371 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && 3372 !Legal->blockNeedsPredication(I->getParent())) 3373 return false; 3374 return true; 3375 } 3376 case Instruction::UDiv: 3377 case Instruction::SDiv: 3378 case Instruction::SRem: 3379 case Instruction::URem: 3380 // TODO: We can use the loop-preheader as context point here and get 3381 // context sensitive reasoning 3382 return !isSafeToSpeculativelyExecute(I); 3383 case Instruction::Call: 3384 return Legal->isMaskRequired(I); 3385 } 3386 } 3387 3388 std::pair<InstructionCost, InstructionCost> 3389 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 3390 ElementCount VF) const { 3391 assert(I->getOpcode() == Instruction::UDiv || 3392 I->getOpcode() == Instruction::SDiv || 3393 I->getOpcode() == Instruction::SRem || 3394 I->getOpcode() == Instruction::URem); 3395 assert(!isSafeToSpeculativelyExecute(I)); 3396 3397 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3398 3399 // Scalarization isn't legal for scalable vector types 3400 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 3401 if (!VF.isScalable()) { 3402 // Get the scalarization cost and scale this amount by the probability of 3403 // executing the predicated block. If the instruction is not predicated, 3404 // we fall through to the next case. 3405 ScalarizationCost = 0; 3406 3407 // These instructions have a non-void type, so account for the phi nodes 3408 // that we will create. This cost is likely to be zero. The phi node 3409 // cost, if any, should be scaled by the block probability because it 3410 // models a copy at the end of each predicated block. 3411 ScalarizationCost += VF.getKnownMinValue() * 3412 TTI.getCFInstrCost(Instruction::PHI, CostKind); 3413 3414 // The cost of the non-predicated instruction. 3415 ScalarizationCost += VF.getKnownMinValue() * 3416 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 3417 3418 // The cost of insertelement and extractelement instructions needed for 3419 // scalarization. 3420 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); 3421 3422 // Scale the cost by the probability of executing the predicated blocks. 3423 // This assumes the predicated block for each vector lane is equally 3424 // likely. 3425 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 3426 } 3427 InstructionCost SafeDivisorCost = 0; 3428 3429 auto *VecTy = ToVectorTy(I->getType(), VF); 3430 3431 // The cost of the select guard to ensure all lanes are well defined 3432 // after we speculate above any internal control flow. 3433 SafeDivisorCost += TTI.getCmpSelInstrCost( 3434 Instruction::Select, VecTy, 3435 ToVectorTy(Type::getInt1Ty(I->getContext()), VF), 3436 CmpInst::BAD_ICMP_PREDICATE, CostKind); 3437 3438 // Certain instructions can be cheaper to vectorize if they have a constant 3439 // second vector operand. One example of this are shifts on x86. 
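// Illustrative sketch: for a loop-invariant divisor, e.g.
//   %q = udiv i32 %x, %d   ; %d defined outside the loop
// the generic operand-info query below only sees a plain value, so we upgrade
// the operand kind to OK_UniformValue ourselves, letting the target cost a
// splatted invariant divisor more cheaply than a fully variable one.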
3440 Value *Op2 = I->getOperand(1); 3441 auto Op2Info = TTI.getOperandInfo(Op2); 3442 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 3443 Legal->isInvariant(Op2)) 3444 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 3445 3446 SmallVector<const Value *, 4> Operands(I->operand_values()); 3447 SafeDivisorCost += TTI.getArithmeticInstrCost( 3448 I->getOpcode(), VecTy, CostKind, 3449 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 3450 Op2Info, Operands, I); 3451 return {ScalarizationCost, SafeDivisorCost}; 3452 } 3453 3454 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 3455 Instruction *I, ElementCount VF) const { 3456 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 3457 assert(getWideningDecision(I, VF) == CM_Unknown && 3458 "Decision should not be set yet."); 3459 auto *Group = getInterleavedAccessGroup(I); 3460 assert(Group && "Must have a group."); 3461 3462 // If the instruction's allocated size doesn't equal it's type size, it 3463 // requires padding and will be scalarized. 3464 auto &DL = I->getDataLayout(); 3465 auto *ScalarTy = getLoadStoreType(I); 3466 if (hasIrregularType(ScalarTy, DL)) 3467 return false; 3468 3469 // If the group involves a non-integral pointer, we may not be able to 3470 // losslessly cast all values to a common type. 3471 unsigned InterleaveFactor = Group->getFactor(); 3472 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 3473 for (unsigned i = 0; i < InterleaveFactor; i++) { 3474 Instruction *Member = Group->getMember(i); 3475 if (!Member) 3476 continue; 3477 auto *MemberTy = getLoadStoreType(Member); 3478 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 3479 // Don't coerce non-integral pointers to integers or vice versa. 3480 if (MemberNI != ScalarNI) { 3481 // TODO: Consider adding special nullptr value case here 3482 return false; 3483 } else if (MemberNI && ScalarNI && 3484 ScalarTy->getPointerAddressSpace() != 3485 MemberTy->getPointerAddressSpace()) { 3486 return false; 3487 } 3488 } 3489 3490 // Check if masking is required. 3491 // A Group may need masking for one of two reasons: it resides in a block that 3492 // needs predication, or it was decided to use masking to deal with gaps 3493 // (either a gap at the end of a load-access that may result in a speculative 3494 // load, or any gaps in a store-access). 3495 bool PredicatedAccessRequiresMasking = 3496 blockNeedsPredicationForAnyReason(I->getParent()) && 3497 Legal->isMaskRequired(I); 3498 bool LoadAccessWithGapsRequiresEpilogMasking = 3499 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 3500 !isScalarEpilogueAllowed(); 3501 bool StoreAccessWithGapsRequiresMasking = 3502 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 3503 if (!PredicatedAccessRequiresMasking && 3504 !LoadAccessWithGapsRequiresEpilogMasking && 3505 !StoreAccessWithGapsRequiresMasking) 3506 return true; 3507 3508 // If masked interleaving is required, we expect that the user/target had 3509 // enabled it, because otherwise it either wouldn't have been created or 3510 // it should have been invalidated by the CostModel. 3511 assert(useMaskedInterleavedAccesses(TTI) && 3512 "Masked interleave-groups for predicated accesses are not enabled."); 3513 3514 if (Group->isReverse()) 3515 return false; 3516 3517 auto *Ty = getLoadStoreType(I); 3518 const Align Alignment = getLoadStoreAlignment(I); 3519 return isa<LoadInst>(I) ? 
TTI.isLegalMaskedLoad(Ty, Alignment) 3520 : TTI.isLegalMaskedStore(Ty, Alignment); 3521 } 3522 3523 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 3524 Instruction *I, ElementCount VF) { 3525 // Get and ensure we have a valid memory instruction. 3526 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 3527 3528 auto *Ptr = getLoadStorePointerOperand(I); 3529 auto *ScalarTy = getLoadStoreType(I); 3530 3531 // In order to be widened, the pointer should be consecutive, first of all. 3532 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 3533 return false; 3534 3535 // If the instruction is a store located in a predicated block, it will be 3536 // scalarized. 3537 if (isScalarWithPredication(I, VF)) 3538 return false; 3539 3540 // If the instruction's allocated size doesn't equal it's type size, it 3541 // requires padding and will be scalarized. 3542 auto &DL = I->getDataLayout(); 3543 if (hasIrregularType(ScalarTy, DL)) 3544 return false; 3545 3546 return true; 3547 } 3548 3549 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 3550 // We should not collect Uniforms more than once per VF. Right now, 3551 // this function is called from collectUniformsAndScalars(), which 3552 // already does this check. Collecting Uniforms for VF=1 does not make any 3553 // sense. 3554 3555 assert(VF.isVector() && !Uniforms.contains(VF) && 3556 "This function should not be visited twice for the same VF"); 3557 3558 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 3559 // not analyze again. Uniforms.count(VF) will return 1. 3560 Uniforms[VF].clear(); 3561 3562 // We now know that the loop is vectorizable! 3563 // Collect instructions inside the loop that will remain uniform after 3564 // vectorization. 3565 3566 // Global values, params and instructions outside of current loop are out of 3567 // scope. 3568 auto isOutOfScope = [&](Value *V) -> bool { 3569 Instruction *I = dyn_cast<Instruction>(V); 3570 return (!I || !TheLoop->contains(I)); 3571 }; 3572 3573 // Worklist containing uniform instructions demanding lane 0. 3574 SetVector<Instruction *> Worklist; 3575 3576 // Add uniform instructions demanding lane 0 to the worklist. Instructions 3577 // that require predication must not be considered uniform after 3578 // vectorization, because that would create an erroneous replicating region 3579 // where only a single instance out of VF should be formed. 3580 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 3581 if (isOutOfScope(I)) { 3582 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 3583 << *I << "\n"); 3584 return; 3585 } 3586 if (isPredicatedInst(I)) { 3587 LLVM_DEBUG( 3588 dbgs() << "LV: Found not uniform due to requiring predication: " << *I 3589 << "\n"); 3590 return; 3591 } 3592 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 3593 Worklist.insert(I); 3594 }; 3595 3596 // Start with the conditional branches exiting the loop. If the branch 3597 // condition is an instruction contained in the loop that is only used by the 3598 // branch, it is uniform. 
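// Illustrative example (hypothetical IR):
//   %cmp = icmp eq i64 %iv.next, %n
//   br i1 %cmp, label %exit, label %loop.body
// %cmp feeds only the exiting branch, so a single lane-0 copy of it suffices
// after vectorization and it is a candidate for the worklist.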
3599 SmallVector<BasicBlock *> Exiting; 3600 TheLoop->getExitingBlocks(Exiting); 3601 for (BasicBlock *E : Exiting) { 3602 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0)); 3603 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 3604 addToWorklistIfAllowed(Cmp); 3605 } 3606 3607 auto PrevVF = VF.divideCoefficientBy(2); 3608 // Return true if all lanes perform the same memory operation, and we can 3609 // thus chose to execute only one. 3610 auto isUniformMemOpUse = [&](Instruction *I) { 3611 // If the value was already known to not be uniform for the previous 3612 // (smaller VF), it cannot be uniform for the larger VF. 3613 if (PrevVF.isVector()) { 3614 auto Iter = Uniforms.find(PrevVF); 3615 if (Iter != Uniforms.end() && !Iter->second.contains(I)) 3616 return false; 3617 } 3618 if (!Legal->isUniformMemOp(*I, VF)) 3619 return false; 3620 if (isa<LoadInst>(I)) 3621 // Loading the same address always produces the same result - at least 3622 // assuming aliasing and ordering which have already been checked. 3623 return true; 3624 // Storing the same value on every iteration. 3625 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 3626 }; 3627 3628 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 3629 InstWidening WideningDecision = getWideningDecision(I, VF); 3630 assert(WideningDecision != CM_Unknown && 3631 "Widening decision should be ready at this moment"); 3632 3633 if (isUniformMemOpUse(I)) 3634 return true; 3635 3636 return (WideningDecision == CM_Widen || 3637 WideningDecision == CM_Widen_Reverse || 3638 WideningDecision == CM_Interleave); 3639 }; 3640 3641 // Returns true if Ptr is the pointer operand of a memory access instruction 3642 // I, I is known to not require scalarization, and the pointer is not also 3643 // stored. 3644 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 3645 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) 3646 return false; 3647 return getLoadStorePointerOperand(I) == Ptr && 3648 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr)); 3649 }; 3650 3651 // Holds a list of values which are known to have at least one uniform use. 3652 // Note that there may be other uses which aren't uniform. A "uniform use" 3653 // here is something which only demands lane 0 of the unrolled iterations; 3654 // it does not imply that all lanes produce the same value (e.g. this is not 3655 // the usual meaning of uniform) 3656 SetVector<Value *> HasUniformUse; 3657 3658 // Scan the loop for instructions which are either a) known to have only 3659 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 3660 for (auto *BB : TheLoop->blocks()) 3661 for (auto &I : *BB) { 3662 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 3663 switch (II->getIntrinsicID()) { 3664 case Intrinsic::sideeffect: 3665 case Intrinsic::experimental_noalias_scope_decl: 3666 case Intrinsic::assume: 3667 case Intrinsic::lifetime_start: 3668 case Intrinsic::lifetime_end: 3669 if (TheLoop->hasLoopInvariantOperands(&I)) 3670 addToWorklistIfAllowed(&I); 3671 break; 3672 default: 3673 break; 3674 } 3675 } 3676 3677 // ExtractValue instructions must be uniform, because the operands are 3678 // known to be loop-invariant. 
3679 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 3680 assert(isOutOfScope(EVI->getAggregateOperand()) && 3681 "Expected aggregate value to be loop invariant"); 3682 addToWorklistIfAllowed(EVI); 3683 continue; 3684 } 3685 3686 // If there's no pointer operand, there's nothing to do. 3687 auto *Ptr = getLoadStorePointerOperand(&I); 3688 if (!Ptr) 3689 continue; 3690 3691 if (isUniformMemOpUse(&I)) 3692 addToWorklistIfAllowed(&I); 3693 3694 if (isVectorizedMemAccessUse(&I, Ptr)) 3695 HasUniformUse.insert(Ptr); 3696 } 3697 3698 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 3699 // demanding) users. Since loops are assumed to be in LCSSA form, this 3700 // disallows uses outside the loop as well. 3701 for (auto *V : HasUniformUse) { 3702 if (isOutOfScope(V)) 3703 continue; 3704 auto *I = cast<Instruction>(V); 3705 auto UsersAreMemAccesses = 3706 llvm::all_of(I->users(), [&](User *U) -> bool { 3707 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 3708 }); 3709 if (UsersAreMemAccesses) 3710 addToWorklistIfAllowed(I); 3711 } 3712 3713 // Expand Worklist in topological order: whenever a new instruction 3714 // is added , its users should be already inside Worklist. It ensures 3715 // a uniform instruction will only be used by uniform instructions. 3716 unsigned idx = 0; 3717 while (idx != Worklist.size()) { 3718 Instruction *I = Worklist[idx++]; 3719 3720 for (auto *OV : I->operand_values()) { 3721 // isOutOfScope operands cannot be uniform instructions. 3722 if (isOutOfScope(OV)) 3723 continue; 3724 // First order recurrence Phi's should typically be considered 3725 // non-uniform. 3726 auto *OP = dyn_cast<PHINode>(OV); 3727 if (OP && Legal->isFixedOrderRecurrence(OP)) 3728 continue; 3729 // If all the users of the operand are uniform, then add the 3730 // operand into the uniform worklist. 3731 auto *OI = cast<Instruction>(OV); 3732 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 3733 auto *J = cast<Instruction>(U); 3734 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 3735 })) 3736 addToWorklistIfAllowed(OI); 3737 } 3738 } 3739 3740 // For an instruction to be added into Worklist above, all its users inside 3741 // the loop should also be in Worklist. However, this condition cannot be 3742 // true for phi nodes that form a cyclic dependence. We must process phi 3743 // nodes separately. An induction variable will remain uniform if all users 3744 // of the induction variable and induction variable update remain uniform. 3745 // The code below handles both pointer and non-pointer induction variables. 3746 BasicBlock *Latch = TheLoop->getLoopLatch(); 3747 for (const auto &Induction : Legal->getInductionVars()) { 3748 auto *Ind = Induction.first; 3749 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 3750 3751 // Determine if all users of the induction variable are uniform after 3752 // vectorization. 3753 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 3754 auto *I = cast<Instruction>(U); 3755 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 3756 isVectorizedMemAccessUse(I, Ind); 3757 }); 3758 if (!UniformInd) 3759 continue; 3760 3761 // Determine if all users of the induction variable update instruction are 3762 // uniform after vectorization. 
3763 auto UniformIndUpdate = 3764 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 3765 auto *I = cast<Instruction>(U); 3766 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 3767 isVectorizedMemAccessUse(I, IndUpdate); 3768 }); 3769 if (!UniformIndUpdate) 3770 continue; 3771 3772 // The induction variable and its update instruction will remain uniform. 3773 addToWorklistIfAllowed(Ind); 3774 addToWorklistIfAllowed(IndUpdate); 3775 } 3776 3777 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 3778 } 3779 3780 bool LoopVectorizationCostModel::runtimeChecksRequired() { 3781 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 3782 3783 if (Legal->getRuntimePointerChecking()->Need) { 3784 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 3785 "runtime pointer checks needed. Enable vectorization of this " 3786 "loop with '#pragma clang loop vectorize(enable)' when " 3787 "compiling with -Os/-Oz", 3788 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3789 return true; 3790 } 3791 3792 if (!PSE.getPredicate().isAlwaysTrue()) { 3793 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 3794 "runtime SCEV checks needed. Enable vectorization of this " 3795 "loop with '#pragma clang loop vectorize(enable)' when " 3796 "compiling with -Os/-Oz", 3797 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3798 return true; 3799 } 3800 3801 // FIXME: Avoid specializing for stride==1 instead of bailing out. 3802 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 3803 reportVectorizationFailure("Runtime stride check for small trip count", 3804 "runtime stride == 1 checks needed. Enable vectorization of " 3805 "this loop without such check by compiling with -Os/-Oz", 3806 "CantVersionLoopWithOptForSize", ORE, TheLoop); 3807 return true; 3808 } 3809 3810 return false; 3811 } 3812 3813 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { 3814 if (IsScalableVectorizationAllowed) 3815 return *IsScalableVectorizationAllowed; 3816 3817 IsScalableVectorizationAllowed = false; 3818 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 3819 return false; 3820 3821 if (Hints->isScalableVectorizationDisabled()) { 3822 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 3823 "ScalableVectorizationDisabled", ORE, TheLoop); 3824 return false; 3825 } 3826 3827 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 3828 3829 auto MaxScalableVF = ElementCount::getScalable( 3830 std::numeric_limits<ElementCount::ScalarTy>::max()); 3831 3832 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 3833 // FIXME: While for scalable vectors this is currently sufficient, this should 3834 // be replaced by a more detailed mechanism that filters out specific VFs, 3835 // instead of invalidating vectorization for a whole set of VFs based on the 3836 // MaxVF. 3837 3838 // Disable scalable vectorization if the loop contains unsupported reductions. 3839 if (!canVectorizeReductions(MaxScalableVF)) { 3840 reportVectorizationInfo( 3841 "Scalable vectorization not supported for the reduction " 3842 "operations found in this loop.", 3843 "ScalableVFUnfeasible", ORE, TheLoop); 3844 return false; 3845 } 3846 3847 // Disable scalable vectorization if the loop contains any instructions 3848 // with element types not supported for scalable vectors. 
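// (Illustrative: common element types such as i32 or float are typically fine,
// whereas exotic ones such as x86_fp80 or very wide integers like i128 are
// usually rejected by the target here and force this bail-out.)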
3849 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 3850 return !Ty->isVoidTy() && 3851 !this->TTI.isElementTypeLegalForScalableVector(Ty); 3852 })) { 3853 reportVectorizationInfo("Scalable vectorization is not supported " 3854 "for all element types found in this loop.", 3855 "ScalableVFUnfeasible", ORE, TheLoop); 3856 return false; 3857 } 3858 3859 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) { 3860 reportVectorizationInfo("The target does not provide maximum vscale value " 3861 "for safe distance analysis.", 3862 "ScalableVFUnfeasible", ORE, TheLoop); 3863 return false; 3864 } 3865 3866 IsScalableVectorizationAllowed = true; 3867 return true; 3868 } 3869 3870 ElementCount 3871 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 3872 if (!isScalableVectorizationAllowed()) 3873 return ElementCount::getScalable(0); 3874 3875 auto MaxScalableVF = ElementCount::getScalable( 3876 std::numeric_limits<ElementCount::ScalarTy>::max()); 3877 if (Legal->isSafeForAnyVectorWidth()) 3878 return MaxScalableVF; 3879 3880 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 3881 // Limit MaxScalableVF by the maximum safe dependence distance. 3882 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); 3883 3884 if (!MaxScalableVF) 3885 reportVectorizationInfo( 3886 "Max legal vector width too small, scalable vectorization " 3887 "unfeasible.", 3888 "ScalableVFUnfeasible", ORE, TheLoop); 3889 3890 return MaxScalableVF; 3891 } 3892 3893 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 3894 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { 3895 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 3896 unsigned SmallestType, WidestType; 3897 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 3898 3899 // Get the maximum safe dependence distance in bits computed by LAA. 3900 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 3901 // the memory accesses that is most restrictive (involved in the smallest 3902 // dependence distance). 3903 unsigned MaxSafeElements = 3904 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 3905 3906 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 3907 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 3908 3909 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 3910 << ".\n"); 3911 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 3912 << ".\n"); 3913 3914 // First analyze the UserVF, fall back if the UserVF should be ignored. 3915 if (UserVF) { 3916 auto MaxSafeUserVF = 3917 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 3918 3919 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 3920 // If `VF=vscale x N` is safe, then so is `VF=N` 3921 if (UserVF.isScalable()) 3922 return FixedScalableVFPair( 3923 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 3924 else 3925 return UserVF; 3926 } 3927 3928 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 3929 3930 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 3931 // is better to ignore the hint and let the compiler choose a suitable VF. 
3932 if (!UserVF.isScalable()) { 3933 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3934 << " is unsafe, clamping to max safe VF=" 3935 << MaxSafeFixedVF << ".\n"); 3936 ORE->emit([&]() { 3937 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3938 TheLoop->getStartLoc(), 3939 TheLoop->getHeader()) 3940 << "User-specified vectorization factor " 3941 << ore::NV("UserVectorizationFactor", UserVF) 3942 << " is unsafe, clamping to maximum safe vectorization factor " 3943 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 3944 }); 3945 return MaxSafeFixedVF; 3946 } 3947 3948 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 3949 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3950 << " is ignored because scalable vectors are not " 3951 "available.\n"); 3952 ORE->emit([&]() { 3953 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3954 TheLoop->getStartLoc(), 3955 TheLoop->getHeader()) 3956 << "User-specified vectorization factor " 3957 << ore::NV("UserVectorizationFactor", UserVF) 3958 << " is ignored because the target does not support scalable " 3959 "vectors. The compiler will pick a more suitable value."; 3960 }); 3961 } else { 3962 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 3963 << " is unsafe. Ignoring scalable UserVF.\n"); 3964 ORE->emit([&]() { 3965 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 3966 TheLoop->getStartLoc(), 3967 TheLoop->getHeader()) 3968 << "User-specified vectorization factor " 3969 << ore::NV("UserVectorizationFactor", UserVF) 3970 << " is unsafe. Ignoring the hint to let the compiler pick a " 3971 "more suitable value."; 3972 }); 3973 } 3974 } 3975 3976 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 3977 << " / " << WidestType << " bits.\n"); 3978 3979 FixedScalableVFPair Result(ElementCount::getFixed(1), 3980 ElementCount::getScalable(0)); 3981 if (auto MaxVF = 3982 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 3983 MaxSafeFixedVF, FoldTailByMasking)) 3984 Result.FixedVF = MaxVF; 3985 3986 if (auto MaxVF = 3987 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, 3988 MaxSafeScalableVF, FoldTailByMasking)) 3989 if (MaxVF.isScalable()) { 3990 Result.ScalableVF = MaxVF; 3991 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 3992 << "\n"); 3993 } 3994 3995 return Result; 3996 } 3997 3998 FixedScalableVFPair 3999 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4000 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4001 // TODO: It may by useful to do since it's still likely to be dynamically 4002 // uniform if the target can skip. 4003 reportVectorizationFailure( 4004 "Not inserting runtime ptr check for divergent target", 4005 "runtime pointer checks needed. 
Not enabled for divergent target", 4006 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4007 return FixedScalableVFPair::getNone(); 4008 } 4009 4010 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4011 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 4012 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4013 if (TC == 1) { 4014 reportVectorizationFailure("Single iteration (non) loop", 4015 "loop trip count is one, irrelevant for vectorization", 4016 "SingleIterationLoop", ORE, TheLoop); 4017 return FixedScalableVFPair::getNone(); 4018 } 4019 4020 switch (ScalarEpilogueStatus) { 4021 case CM_ScalarEpilogueAllowed: 4022 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4023 case CM_ScalarEpilogueNotAllowedUsePredicate: 4024 [[fallthrough]]; 4025 case CM_ScalarEpilogueNotNeededUsePredicate: 4026 LLVM_DEBUG( 4027 dbgs() << "LV: vector predicate hint/switch found.\n" 4028 << "LV: Not allowing scalar epilogue, creating predicated " 4029 << "vector loop.\n"); 4030 break; 4031 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4032 // fallthrough as a special case of OptForSize 4033 case CM_ScalarEpilogueNotAllowedOptSize: 4034 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4035 LLVM_DEBUG( 4036 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4037 else 4038 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4039 << "count.\n"); 4040 4041 // Bail if runtime checks are required, which are not good when optimizing 4042 // for size. 4043 if (runtimeChecksRequired()) 4044 return FixedScalableVFPair::getNone(); 4045 4046 break; 4047 } 4048 4049 // The only loops we can vectorize without a scalar epilogue are loops with 4050 // a bottom-test and a single exiting block. We'd have to handle the fact 4051 // that not every instruction executes on the last iteration. This will 4052 // require a lane mask which varies through the vector loop body. (TODO) 4053 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 4054 // If there was a tail-folding hint/switch, but we can't fold the tail by 4055 // masking, fall back to a vectorization with a scalar epilogue. 4056 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4057 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4058 "scalar epilogue instead.\n"); 4059 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4060 return computeFeasibleMaxVF(MaxTC, UserVF, false); 4061 } 4062 return FixedScalableVFPair::getNone(); 4063 } 4064 4065 // Now try tail folding. 4066 4067 // Invalidate interleave groups that require an epilogue if we can't mask 4068 // the interleave-group. 4069 if (!useMaskedInterleavedAccesses(TTI)) { 4070 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 4071 "No decisions should have been taken at this point"); 4072 // Note: There is no need to invalidate any cost modeling decisions here, as 4073 // none were taken so far. 4074 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4075 } 4076 4077 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true); 4078 4079 // Avoid tail folding if the trip count is known to be a multiple of any VF 4080 // we choose.
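// Worked example (illustrative numbers): with a known trip count of 64, a max
// fixed VF of 8 and a user interleave count of 2, the check below computes
// 64 % (8 * 2) == 0, so no scalar tail remains for that VF (or any smaller
// power-of-two VF) and tail folding is unnecessary.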
4081 std::optional<unsigned> MaxPowerOf2RuntimeVF = 4082 MaxFactors.FixedVF.getFixedValue(); 4083 if (MaxFactors.ScalableVF) { 4084 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); 4085 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { 4086 MaxPowerOf2RuntimeVF = std::max<unsigned>( 4087 *MaxPowerOf2RuntimeVF, 4088 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); 4089 } else 4090 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. 4091 } 4092 4093 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { 4094 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && 4095 "MaxFixedVF must be a power of 2"); 4096 unsigned MaxVFtimesIC = 4097 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; 4098 ScalarEvolution *SE = PSE.getSE(); 4099 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 4100 const SCEV *ExitCount = SE->getAddExpr( 4101 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 4102 const SCEV *Rem = SE->getURemExpr( 4103 SE->applyLoopGuards(ExitCount, TheLoop), 4104 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 4105 if (Rem->isZero()) { 4106 // Accept MaxFixedVF if we do not have a tail. 4107 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4108 return MaxFactors; 4109 } 4110 } 4111 4112 // If we don't know the precise trip count, or if the trip count that we 4113 // found modulo the vectorization factor is not zero, try to fold the tail 4114 // by masking. 4115 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4116 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC); 4117 if (foldTailByMasking()) { 4118 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { 4119 LLVM_DEBUG( 4120 dbgs() 4121 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " 4122 "try to generate VP Intrinsics with scalable vector " 4123 "factors only.\n"); 4124 // Tail folded loop using VP intrinsics restricts the VF to be scalable 4125 // for now. 4126 // TODO: extend it for fixed vectors, if required. 4127 assert(MaxFactors.ScalableVF.isScalable() && 4128 "Expected scalable vector factor."); 4129 4130 MaxFactors.FixedVF = ElementCount::getFixed(1); 4131 } 4132 return MaxFactors; 4133 } 4134 4135 // If there was a tail-folding hint/switch, but we can't fold the tail by 4136 // masking, fallback to a vectorization with a scalar epilogue. 4137 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4138 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4139 "scalar epilogue instead.\n"); 4140 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4141 return MaxFactors; 4142 } 4143 4144 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 4145 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 4146 return FixedScalableVFPair::getNone(); 4147 } 4148 4149 if (TC == 0) { 4150 reportVectorizationFailure( 4151 "Unable to calculate the loop count due to complex control flow", 4152 "unable to calculate the loop count due to complex control flow", 4153 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4154 return FixedScalableVFPair::getNone(); 4155 } 4156 4157 reportVectorizationFailure( 4158 "Cannot optimize for size and vectorize at the same time.", 4159 "cannot optimize for size and vectorize at the same time. 
" 4160 "Enable vectorization of this loop with '#pragma clang loop " 4161 "vectorize(enable)' when compiling with -Os/-Oz", 4162 "NoTailLoopWithOptForSize", ORE, TheLoop); 4163 return FixedScalableVFPair::getNone(); 4164 } 4165 4166 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 4167 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, 4168 ElementCount MaxSafeVF, bool FoldTailByMasking) { 4169 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 4170 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 4171 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4172 : TargetTransformInfo::RGK_FixedWidthVector); 4173 4174 // Convenience function to return the minimum of two ElementCounts. 4175 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 4176 assert((LHS.isScalable() == RHS.isScalable()) && 4177 "Scalable flags must match"); 4178 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 4179 }; 4180 4181 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 4182 // Note that both WidestRegister and WidestType may not be a powers of 2. 4183 auto MaxVectorElementCount = ElementCount::get( 4184 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), 4185 ComputeScalableMaxVF); 4186 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 4187 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4188 << (MaxVectorElementCount * WidestType) << " bits.\n"); 4189 4190 if (!MaxVectorElementCount) { 4191 LLVM_DEBUG(dbgs() << "LV: The target has no " 4192 << (ComputeScalableMaxVF ? "scalable" : "fixed") 4193 << " vector registers.\n"); 4194 return ElementCount::getFixed(1); 4195 } 4196 4197 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 4198 if (MaxVectorElementCount.isScalable() && 4199 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 4200 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 4201 auto Min = Attr.getVScaleRangeMin(); 4202 WidestRegisterMinEC *= Min; 4203 } 4204 4205 // When a scalar epilogue is required, at least one iteration of the scalar 4206 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a 4207 // max VF that results in a dead vector loop. 4208 if (MaxTripCount > 0 && requiresScalarEpilogue(true)) 4209 MaxTripCount -= 1; 4210 4211 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && 4212 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { 4213 // If upper bound loop trip count (TC) is known at compile time there is no 4214 // point in choosing VF greater than TC (as done in the loop below). Select 4215 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is 4216 // scalable, we only fall back on a fixed VF when the TC is less than or 4217 // equal to the known number of lanes. 4218 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); 4219 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 4220 "exceeding the constant trip count: " 4221 << ClampedUpperTripCount << "\n"); 4222 return ElementCount::get( 4223 ClampedUpperTripCount, 4224 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); 4225 } 4226 4227 TargetTransformInfo::RegisterKind RegKind = 4228 ComputeScalableMaxVF ? 
TargetTransformInfo::RGK_ScalableVector 4229 : TargetTransformInfo::RGK_FixedWidthVector; 4230 ElementCount MaxVF = MaxVectorElementCount; 4231 if (MaximizeBandwidth || 4232 (MaximizeBandwidth.getNumOccurrences() == 0 && 4233 (TTI.shouldMaximizeVectorBandwidth(RegKind) || 4234 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { 4235 auto MaxVectorElementCountMaxBW = ElementCount::get( 4236 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), 4237 ComputeScalableMaxVF); 4238 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 4239 4240 // Collect all viable vectorization factors larger than the default MaxVF 4241 // (i.e. MaxVectorElementCount). 4242 SmallVector<ElementCount, 8> VFs; 4243 for (ElementCount VS = MaxVectorElementCount * 2; 4244 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 4245 VFs.push_back(VS); 4246 4247 // For each VF calculate its register usage. 4248 auto RUs = calculateRegisterUsage(VFs); 4249 4250 // Select the largest VF which doesn't require more registers than existing 4251 // ones. 4252 for (int I = RUs.size() - 1; I >= 0; --I) { 4253 const auto &MLU = RUs[I].MaxLocalUsers; 4254 if (all_of(MLU, [&](decltype(MLU.front()) &LU) { 4255 return LU.second <= TTI.getNumberOfRegisters(LU.first); 4256 })) { 4257 MaxVF = VFs[I]; 4258 break; 4259 } 4260 } 4261 if (ElementCount MinVF = 4262 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 4263 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 4264 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 4265 << ") with target's minimum: " << MinVF << '\n'); 4266 MaxVF = MinVF; 4267 } 4268 } 4269 4270 // Invalidate any widening decisions we might have made, in case the loop 4271 // requires prediction (decided later), but we have already made some 4272 // load/store widening decisions. 4273 invalidateCostModelingDecisions(); 4274 } 4275 return MaxVF; 4276 } 4277 4278 /// Convenience function that returns the value of vscale_range iff 4279 /// vscale_range.min == vscale_range.max or otherwise returns the value 4280 /// returned by the corresponding TTI method. 4281 static std::optional<unsigned> 4282 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 4283 const Function *Fn = L->getHeader()->getParent(); 4284 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 4285 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 4286 auto Min = Attr.getVScaleRangeMin(); 4287 auto Max = Attr.getVScaleRangeMax(); 4288 if (Max && Min == Max) 4289 return Max; 4290 } 4291 4292 return TTI.getVScaleForTuning(); 4293 } 4294 4295 bool LoopVectorizationPlanner::isMoreProfitable( 4296 const VectorizationFactor &A, const VectorizationFactor &B) const { 4297 InstructionCost CostA = A.Cost; 4298 InstructionCost CostB = B.Cost; 4299 4300 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); 4301 4302 // Improve estimate for the vector width if it is scalable. 4303 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 4304 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 4305 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 4306 if (A.Width.isScalable()) 4307 EstimatedWidthA *= *VScale; 4308 if (B.Width.isScalable()) 4309 EstimatedWidthB *= *VScale; 4310 } 4311 4312 // Assume vscale may be larger than 1 (or the value being tuned for), 4313 // so that scalable vectorization is slightly favorable over fixed-width 4314 // vectorization. 
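  // For illustration (hypothetical costs): comparing A = {vscale x 4, cost 8}
  // with B = {fixed 8, cost 8} and getVScaleForTuning() == 2, both widths
  // estimate to 8 lanes, so with no known max trip count the cross-multiplied
  // per-lane costs are equal. Because A is scalable and B is not, the
  // comparison below becomes non-strict (<=) and the scalable factor wins the
  // tie, unless the target prefers fixed-width vectors for equal cost.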
4315 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && 4316 A.Width.isScalable() && !B.Width.isScalable(); 4317 4318 auto CmpFn = [PreferScalable](const InstructionCost &LHS, 4319 const InstructionCost &RHS) { 4320 return PreferScalable ? LHS <= RHS : LHS < RHS; 4321 }; 4322 4323 // To avoid the need for FP division: 4324 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB) 4325 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA) 4326 if (!MaxTripCount) 4327 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); 4328 4329 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 4330 InstructionCost VectorCost, 4331 InstructionCost ScalarCost) { 4332 // If the trip count is a known (possibly small) constant, the trip count 4333 // will be rounded up to an integer number of iterations under 4334 // FoldTailByMasking. The total cost in that case will be 4335 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 4336 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 4337 // some extra overheads, but for the purpose of comparing the costs of 4338 // different VFs we can use this to compare the total loop-body cost 4339 // expected after vectorization. 4340 if (CM.foldTailByMasking()) 4341 return VectorCost * divideCeil(MaxTripCount, VF); 4342 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF); 4343 }; 4344 4345 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); 4346 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost); 4347 return CmpFn(RTCostA, RTCostB); 4348 } 4349 4350 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, 4351 OptimizationRemarkEmitter *ORE, 4352 Loop *TheLoop) { 4353 if (InvalidCosts.empty()) 4354 return; 4355 4356 // Emit a report of VFs with invalid costs in the loop. 4357 4358 // Group the remarks per instruction, keeping the instruction order from 4359 // InvalidCosts. 4360 std::map<Instruction *, unsigned> Numbering; 4361 unsigned I = 0; 4362 for (auto &Pair : InvalidCosts) 4363 if (!Numbering.count(Pair.first)) 4364 Numbering[Pair.first] = I++; 4365 4366 // Sort the list, first on instruction(number) then on VF. 4367 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 4368 if (Numbering[A.first] != Numbering[B.first]) 4369 return Numbering[A.first] < Numbering[B.first]; 4370 const auto &LHS = A.second; 4371 const auto &RHS = B.second; 4372 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 4373 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 4374 }); 4375 4376 // For a list of ordered instruction-vf pairs: 4377 // [(load, vf1), (load, vf2), (store, vf1)] 4378 // Group the instructions together to emit separate remarks for: 4379 // load (vf1, vf2) 4380 // store (vf1) 4381 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 4382 auto Subset = ArrayRef<InstructionVFPair>(); 4383 do { 4384 if (Subset.empty()) 4385 Subset = Tail.take_front(1); 4386 4387 Instruction *I = Subset.front().first; 4388 4389 // If the next instruction is different, or if there are no other pairs, 4390 // emit a remark for the collated subset. e.g. 
4391 // [(load, vf1), (load, vf2))] 4392 // to emit: 4393 // remark: invalid costs for 'load' at VF=(vf, vf2) 4394 if (Subset == Tail || Tail[Subset.size()].first != I) { 4395 std::string OutString; 4396 raw_string_ostream OS(OutString); 4397 assert(!Subset.empty() && "Unexpected empty range"); 4398 OS << "Instruction with invalid costs prevented vectorization at VF=("; 4399 for (const auto &Pair : Subset) 4400 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; 4401 OS << "):"; 4402 if (auto *CI = dyn_cast<CallInst>(I)) 4403 OS << " call to " << CI->getCalledFunction()->getName(); 4404 else 4405 OS << " " << I->getOpcodeName(); 4406 OS.flush(); 4407 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 4408 Tail = Tail.drop_front(Subset.size()); 4409 Subset = {}; 4410 } else 4411 // Grow the subset by one element 4412 Subset = Tail.take_front(Subset.size() + 1); 4413 } while (!Tail.empty()); 4414 } 4415 4416 /// Check if any recipe of \p Plan will generate a vector value, which will be 4417 /// assigned a vector register. 4418 static bool willGenerateVectors(VPlan &Plan, ElementCount VF, 4419 const TargetTransformInfo &TTI) { 4420 assert(VF.isVector() && "Checking a scalar VF?"); 4421 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), 4422 Plan.getCanonicalIV()->getScalarType()->getContext()); 4423 DenseSet<VPRecipeBase *> EphemeralRecipes; 4424 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes); 4425 // Set of already visited types. 4426 DenseSet<Type *> Visited; 4427 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( 4428 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { 4429 for (VPRecipeBase &R : *VPBB) { 4430 if (EphemeralRecipes.contains(&R)) 4431 continue; 4432 // Continue early if the recipe is considered to not produce a vector 4433 // result. Note that this includes VPInstruction where some opcodes may 4434 // produce a vector, to preserve existing behavior as VPInstructions model 4435 // aspects not directly mapped to existing IR instructions. 
      switch (R.getVPDefID()) {
      case VPDef::VPDerivedIVSC:
      case VPDef::VPScalarIVStepsSC:
      case VPDef::VPScalarCastSC:
      case VPDef::VPReplicateSC:
      case VPDef::VPInstructionSC:
      case VPDef::VPCanonicalIVPHISC:
      case VPDef::VPVectorPointerSC:
      case VPDef::VPExpandSCEVSC:
      case VPDef::VPEVLBasedIVPHISC:
      case VPDef::VPPredInstPHISC:
      case VPDef::VPBranchOnMaskSC:
        continue;
      case VPDef::VPReductionSC:
      case VPDef::VPActiveLaneMaskPHISC:
      case VPDef::VPWidenCallSC:
      case VPDef::VPWidenCanonicalIVSC:
      case VPDef::VPWidenCastSC:
      case VPDef::VPWidenGEPSC:
      case VPDef::VPWidenSC:
      case VPDef::VPWidenSelectSC:
      case VPDef::VPBlendSC:
      case VPDef::VPFirstOrderRecurrencePHISC:
      case VPDef::VPWidenPHISC:
      case VPDef::VPWidenIntOrFpInductionSC:
      case VPDef::VPWidenPointerInductionSC:
      case VPDef::VPReductionPHISC:
      case VPDef::VPInterleaveSC:
      case VPDef::VPWidenLoadEVLSC:
      case VPDef::VPWidenLoadSC:
      case VPDef::VPWidenStoreEVLSC:
      case VPDef::VPWidenStoreSC:
        break;
      default:
        llvm_unreachable("unhandled recipe");
      }

      auto WillWiden = [&TTI, VF](Type *ScalarTy) {
        Type *VectorTy = ToVectorTy(ScalarTy, VF);
        unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
        if (!NumLegalParts)
          return false;
        if (VF.isScalable()) {
          // <vscale x 1 x iN> is assumed to be profitable over iN because
          // scalable registers are a distinct register class from scalar
          // ones. If we ever find a target which wants to lower scalable
          // vectors back to scalars, we'll need to update this code to
          // explicitly ask TTI about the register class uses for each part.
          return NumLegalParts <= VF.getKnownMinValue();
        }
        // Two or more parts that share a register are considered vectorized.
        return NumLegalParts < VF.getKnownMinValue();
      };

      // If the recipe has no definitions and is not a store (e.g., a branch),
      // continue - there is no value to check.
      if (R.getNumDefinedValues() == 0 &&
          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
              &R))
        continue;
      // For multi-def recipes (currently only interleaved loads), it suffices
      // to check the first def only.
      // For stores, check their stored value; for interleaved stores, it
      // suffices to check the first stored value only. In all cases this is
      // the second operand.
      VPValue *ToCheck =
          R.getNumDefinedValues() >= 1 ?
R.getVPValue(0) : R.getOperand(1); 4502 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck); 4503 if (!Visited.insert({ScalarTy}).second) 4504 continue; 4505 if (WillWiden(ScalarTy)) 4506 return true; 4507 } 4508 } 4509 4510 return false; 4511 } 4512 4513 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { 4514 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); 4515 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 4516 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 4517 assert(any_of(VPlans, 4518 [](std::unique_ptr<VPlan> &P) { 4519 return P->hasVF(ElementCount::getFixed(1)); 4520 }) && 4521 "Expected Scalar VF to be a candidate"); 4522 4523 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 4524 ExpectedCost); 4525 VectorizationFactor ChosenFactor = ScalarCost; 4526 4527 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 4528 if (ForceVectorization && 4529 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) { 4530 // Ignore scalar width, because the user explicitly wants vectorization. 4531 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 4532 // evaluation. 4533 ChosenFactor.Cost = InstructionCost::getMax(); 4534 } 4535 4536 SmallVector<InstructionVFPair> InvalidCosts; 4537 for (auto &P : VPlans) { 4538 for (ElementCount VF : P->vectorFactors()) { 4539 // The cost for scalar VF=1 is already calculated, so ignore it. 4540 if (VF.isScalar()) 4541 continue; 4542 4543 InstructionCost C = CM.expectedCost(VF, &InvalidCosts); 4544 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); 4545 4546 #ifndef NDEBUG 4547 unsigned AssumedMinimumVscale = 4548 getVScaleForTuning(OrigLoop, TTI).value_or(1); 4549 unsigned Width = 4550 Candidate.Width.isScalable() 4551 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 4552 : Candidate.Width.getFixedValue(); 4553 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF 4554 << " costs: " << (Candidate.Cost / Width)); 4555 if (VF.isScalable()) 4556 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 4557 << AssumedMinimumVscale << ")"); 4558 LLVM_DEBUG(dbgs() << ".\n"); 4559 #endif 4560 4561 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { 4562 LLVM_DEBUG( 4563 dbgs() 4564 << "LV: Not considering vector loop of width " << VF 4565 << " because it will not generate any vector instructions.\n"); 4566 continue; 4567 } 4568 4569 // If profitable add it to ProfitableVF list. 
      if (isMoreProfitable(Candidate, ScalarCost))
        ProfitableVFs.push_back(Candidate);

      if (isMoreProfitable(Candidate, ChosenFactor))
        ChosenFactor = Candidate;
    }
  }

  emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);

  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
    reportVectorizationFailure(
        "There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}

bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
    ElementCount VF) const {
  // Cross-iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(OrigLoop->getHeader()->phis(),
             [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (const auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc =
        Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
    for (User *U : PostInc->users())
      if (!OrigLoop->contains(cast<Instruction>(U)))
        return false;
    // Look for uses of the penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!OrigLoop->contains(cast<Instruction>(U)))
        return false;
  }

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.

  // Allow the target to opt out entirely.
  if (!TTI.preferEpilogueVectorization())
    return false;

  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
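  // For illustration (hypothetical values): a target reporting a maximum
  // interleave factor of 1 bails out below. Otherwise, for a main loop
  // VF = vscale x 2 with getVScaleForTuning() == 4, the estimated width is 8,
  // and epilogue vectorization is only considered profitable when that
  // estimate reaches EpilogueVectorizationMinVF.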
4641 if (TTI.getMaxInterleaveFactor(VF) <= 1) 4642 return false; 4643 4644 unsigned Multiplier = 1; 4645 if (VF.isScalable()) 4646 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); 4647 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) 4648 return true; 4649 return false; 4650 } 4651 4652 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( 4653 const ElementCount MainLoopVF, unsigned IC) { 4654 VectorizationFactor Result = VectorizationFactor::Disabled(); 4655 if (!EnableEpilogueVectorization) { 4656 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); 4657 return Result; 4658 } 4659 4660 if (!CM.isScalarEpilogueAllowed()) { 4661 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " 4662 "epilogue is allowed.\n"); 4663 return Result; 4664 } 4665 4666 // Not really a cost consideration, but check for unsupported cases here to 4667 // simplify the logic. 4668 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 4669 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 4670 "is not a supported candidate.\n"); 4671 return Result; 4672 } 4673 4674 if (EpilogueVectorizationForceVF > 1) { 4675 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 4676 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 4677 if (hasPlanWithVF(ForcedEC)) 4678 return {ForcedEC, 0, 0}; 4679 else { 4680 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 4681 "viable.\n"); 4682 return Result; 4683 } 4684 } 4685 4686 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 4687 OrigLoop->getHeader()->getParent()->hasMinSize()) { 4688 LLVM_DEBUG( 4689 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 4690 return Result; 4691 } 4692 4693 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { 4694 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 4695 "this loop\n"); 4696 return Result; 4697 } 4698 4699 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 4700 // the main loop handles 8 lanes per iteration. We could still benefit from 4701 // vectorizing the epilogue loop with VF=4. 4702 ElementCount EstimatedRuntimeVF = MainLoopVF; 4703 if (MainLoopVF.isScalable()) { 4704 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 4705 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 4706 EstimatedRuntimeVF *= *VScale; 4707 } 4708 4709 ScalarEvolution &SE = *PSE.getSE(); 4710 Type *TCType = Legal->getWidestInductionType(); 4711 const SCEV *RemainingIterations = nullptr; 4712 for (auto &NextVF : ProfitableVFs) { 4713 // Skip candidate VFs without a corresponding VPlan. 4714 if (!hasPlanWithVF(NextVF.Width)) 4715 continue; 4716 4717 // Skip candidate VFs with widths >= the estimate runtime VF (scalable 4718 // vectors) or the VF of the main loop (fixed vectors). 4719 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 4720 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 4721 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) 4722 continue; 4723 4724 // If NextVF is greater than the number of remaining iterations, the 4725 // epilogue loop would be dead. Skip such factors. 4726 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 4727 // TODO: extend to support scalable VFs. 
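      // For illustration (hypothetical values): with a trip count of 35,
      // MainLoopVF = 8 and IC = 2, RemainingIterations is 35 urem 16 == 3,
      // so a candidate epilogue VF of 4 is skipped (4 > 3) while a candidate
      // VF of 2 is still considered.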
4728 if (!RemainingIterations) { 4729 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); 4730 RemainingIterations = SE.getURemExpr( 4731 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 4732 } 4733 if (SE.isKnownPredicate( 4734 CmpInst::ICMP_UGT, 4735 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 4736 RemainingIterations)) 4737 continue; 4738 } 4739 4740 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) 4741 Result = NextVF; 4742 } 4743 4744 if (Result != VectorizationFactor::Disabled()) 4745 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 4746 << Result.Width << "\n"); 4747 return Result; 4748 } 4749 4750 std::pair<unsigned, unsigned> 4751 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 4752 unsigned MinWidth = -1U; 4753 unsigned MaxWidth = 8; 4754 const DataLayout &DL = TheFunction->getDataLayout(); 4755 // For in-loop reductions, no element types are added to ElementTypesInLoop 4756 // if there are no loads/stores in the loop. In this case, check through the 4757 // reduction variables to determine the maximum width. 4758 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 4759 // Reset MaxWidth so that we can find the smallest type used by recurrences 4760 // in the loop. 4761 MaxWidth = -1U; 4762 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 4763 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 4764 // When finding the min width used by the recurrence we need to account 4765 // for casts on the input operands of the recurrence. 4766 MaxWidth = std::min<unsigned>( 4767 MaxWidth, std::min<unsigned>( 4768 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 4769 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 4770 } 4771 } else { 4772 for (Type *T : ElementTypesInLoop) { 4773 MinWidth = std::min<unsigned>( 4774 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 4775 MaxWidth = std::max<unsigned>( 4776 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 4777 } 4778 } 4779 return {MinWidth, MaxWidth}; 4780 } 4781 4782 void LoopVectorizationCostModel::collectElementTypesForWidening() { 4783 ElementTypesInLoop.clear(); 4784 // For each block. 4785 for (BasicBlock *BB : TheLoop->blocks()) { 4786 // For each instruction in the loop. 4787 for (Instruction &I : BB->instructionsWithoutDebug()) { 4788 Type *T = I.getType(); 4789 4790 // Skip ignored values. 4791 if (ValuesToIgnore.count(&I)) 4792 continue; 4793 4794 // Only examine Loads, Stores and PHINodes. 4795 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 4796 continue; 4797 4798 // Examine PHI nodes that are reduction variables. Update the type to 4799 // account for the recurrence type. 4800 if (auto *PN = dyn_cast<PHINode>(&I)) { 4801 if (!Legal->isReductionVariable(PN)) 4802 continue; 4803 const RecurrenceDescriptor &RdxDesc = 4804 Legal->getReductionVars().find(PN)->second; 4805 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 4806 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 4807 RdxDesc.getRecurrenceType(), 4808 TargetTransformInfo::ReductionFlags())) 4809 continue; 4810 T = RdxDesc.getRecurrenceType(); 4811 } 4812 4813 // Examine the stored values. 
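      // For example, a store of an i8 value contributes i8 (the type of the
      // stored value) rather than the store instruction's own void result
      // type.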
4814 if (auto *ST = dyn_cast<StoreInst>(&I)) 4815 T = ST->getValueOperand()->getType(); 4816 4817 assert(T->isSized() && 4818 "Expected the load/store/recurrence type to be sized"); 4819 4820 ElementTypesInLoop.insert(T); 4821 } 4822 } 4823 } 4824 4825 unsigned 4826 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 4827 InstructionCost LoopCost) { 4828 // -- The interleave heuristics -- 4829 // We interleave the loop in order to expose ILP and reduce the loop overhead. 4830 // There are many micro-architectural considerations that we can't predict 4831 // at this level. For example, frontend pressure (on decode or fetch) due to 4832 // code size, or the number and capabilities of the execution ports. 4833 // 4834 // We use the following heuristics to select the interleave count: 4835 // 1. If the code has reductions, then we interleave to break the cross 4836 // iteration dependency. 4837 // 2. If the loop is really small, then we interleave to reduce the loop 4838 // overhead. 4839 // 3. We don't interleave if we think that we will spill registers to memory 4840 // due to the increased register pressure. 4841 4842 if (!isScalarEpilogueAllowed()) 4843 return 1; 4844 4845 // Do not interleave if EVL is preferred and no User IC is specified. 4846 if (foldTailWithEVL()) { 4847 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " 4848 "Unroll factor forced to be 1.\n"); 4849 return 1; 4850 } 4851 4852 // We used the distance for the interleave count. 4853 if (!Legal->isSafeForAnyVectorWidth()) 4854 return 1; 4855 4856 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 4857 const bool HasReductions = !Legal->getReductionVars().empty(); 4858 4859 // If we did not calculate the cost for VF (because the user selected the VF) 4860 // then we calculate the cost of VF here. 4861 if (LoopCost == 0) { 4862 LoopCost = expectedCost(VF); 4863 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 4864 4865 // Loop body is free and there is no need for interleaving. 4866 if (LoopCost == 0) 4867 return 1; 4868 } 4869 4870 RegisterUsage R = calculateRegisterUsage({VF})[0]; 4871 // We divide by these constants so assume that we have at least one 4872 // instruction that uses at least one register. 4873 for (auto& pair : R.MaxLocalUsers) { 4874 pair.second = std::max(pair.second, 1U); 4875 } 4876 4877 // We calculate the interleave count using the following formula. 4878 // Subtract the number of loop invariants from the number of available 4879 // registers. These registers are used by all of the interleaved instances. 4880 // Next, divide the remaining registers by the number of registers that is 4881 // required by the loop, in order to estimate how many parallel instances 4882 // fit without causing spills. All of this is rounded down if necessary to be 4883 // a power of two. We want power of two interleave count to simplify any 4884 // addressing operations or alignment considerations. 4885 // We also want power of two interleave counts to ensure that the induction 4886 // variable of the vector loop wraps to zero, when tail is folded by masking; 4887 // this currently happens when OptForSize, in which case IC is set to 1 above. 
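  // For illustration (hypothetical register counts): with 32 vector registers,
  // 2 of them tied up by loop-invariant values and a maximum of 5 registers
  // live inside the loop, the estimate is bit_floor((32 - 2) / 5) = 4
  // interleaved copies; with the induction-variable heuristic enabled the
  // estimate becomes bit_floor((32 - 2 - 1) / (5 - 1)) = 4 as well.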
4888 unsigned IC = UINT_MAX; 4889 4890 for (auto& pair : R.MaxLocalUsers) { 4891 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 4892 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 4893 << " registers of " 4894 << TTI.getRegisterClassName(pair.first) << " register class\n"); 4895 if (VF.isScalar()) { 4896 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 4897 TargetNumRegisters = ForceTargetNumScalarRegs; 4898 } else { 4899 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 4900 TargetNumRegisters = ForceTargetNumVectorRegs; 4901 } 4902 unsigned MaxLocalUsers = pair.second; 4903 unsigned LoopInvariantRegs = 0; 4904 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 4905 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 4906 4907 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 4908 MaxLocalUsers); 4909 // Don't count the induction variable as interleaved. 4910 if (EnableIndVarRegisterHeur) { 4911 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 4912 std::max(1U, (MaxLocalUsers - 1))); 4913 } 4914 4915 IC = std::min(IC, TmpIC); 4916 } 4917 4918 // Clamp the interleave ranges to reasonable counts. 4919 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 4920 4921 // Check if the user has overridden the max. 4922 if (VF.isScalar()) { 4923 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 4924 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 4925 } else { 4926 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 4927 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 4928 } 4929 4930 unsigned EstimatedVF = VF.getKnownMinValue(); 4931 if (VF.isScalable()) { 4932 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI)) 4933 EstimatedVF *= *VScale; 4934 } 4935 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); 4936 4937 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4938 if (KnownTC > 0) { 4939 // At least one iteration must be scalar when this constraint holds. So the 4940 // maximum available iterations for interleaving is one less. 4941 unsigned AvailableTC = 4942 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC; 4943 4944 // If trip count is known we select between two prospective ICs, where 4945 // 1) the aggressive IC is capped by the trip count divided by VF 4946 // 2) the conservative IC is capped by the trip count divided by (VF * 2) 4947 // The final IC is selected in a way that the epilogue loop trip count is 4948 // minimized while maximizing the IC itself, so that we either run the 4949 // vector loop at least once if it generates a small epilogue loop, or else 4950 // we run the vector loop at least twice. 
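    // For illustration (hypothetical values): with AvailableTC = 39,
    // EstimatedVF = 8 and a target maximum of 4, the aggressive bound is
    // bit_floor(min(39 / 8, 4)) = 4 and the conservative bound is
    // bit_floor(min(39 / 16, 4)) = 2. Both leave a scalar tail of 7
    // iterations, so the aggressive bound is taken.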
4951 4952 unsigned InterleaveCountUB = bit_floor( 4953 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount))); 4954 unsigned InterleaveCountLB = bit_floor(std::max( 4955 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); 4956 MaxInterleaveCount = InterleaveCountLB; 4957 4958 if (InterleaveCountUB != InterleaveCountLB) { 4959 unsigned TailTripCountUB = 4960 (AvailableTC % (EstimatedVF * InterleaveCountUB)); 4961 unsigned TailTripCountLB = 4962 (AvailableTC % (EstimatedVF * InterleaveCountLB)); 4963 // If both produce same scalar tail, maximize the IC to do the same work 4964 // in fewer vector loop iterations 4965 if (TailTripCountUB == TailTripCountLB) 4966 MaxInterleaveCount = InterleaveCountUB; 4967 } 4968 } else if (BestKnownTC && *BestKnownTC > 0) { 4969 // At least one iteration must be scalar when this constraint holds. So the 4970 // maximum available iterations for interleaving is one less. 4971 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) 4972 ? (*BestKnownTC) - 1 4973 : *BestKnownTC; 4974 4975 // If trip count is an estimated compile time constant, limit the 4976 // IC to be capped by the trip count divided by VF * 2, such that the vector 4977 // loop runs at least twice to make interleaving seem profitable when there 4978 // is an epilogue loop present. Since exact Trip count is not known we 4979 // choose to be conservative in our IC estimate. 4980 MaxInterleaveCount = bit_floor(std::max( 4981 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); 4982 } 4983 4984 assert(MaxInterleaveCount > 0 && 4985 "Maximum interleave count must be greater than 0"); 4986 4987 // Clamp the calculated IC to be between the 1 and the max interleave count 4988 // that the target and trip count allows. 4989 if (IC > MaxInterleaveCount) 4990 IC = MaxInterleaveCount; 4991 else 4992 // Make sure IC is greater than 0. 4993 IC = std::max(1u, IC); 4994 4995 assert(IC > 0 && "Interleave count must be greater than 0."); 4996 4997 // Interleave if we vectorized this loop and there is a reduction that could 4998 // benefit from interleaving. 4999 if (VF.isVector() && HasReductions) { 5000 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5001 return IC; 5002 } 5003 5004 // For any scalar loop that either requires runtime checks or predication we 5005 // are better off leaving this to the unroller. Note that if we've already 5006 // vectorized the loop we will have done the runtime check and so interleaving 5007 // won't require further checks. 5008 bool ScalarInterleavingRequiresPredication = 5009 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5010 return Legal->blockNeedsPredication(BB); 5011 })); 5012 bool ScalarInterleavingRequiresRuntimePointerCheck = 5013 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5014 5015 // We want to interleave small loops in order to reduce the loop overhead and 5016 // potentially expose ILP opportunities. 
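  // For illustration (hypothetical costs): if the loop body cost is 5 and
  // SmallLoopCost is 20, the small-loop heuristic below caps the interleave
  // count at bit_floor(20 / 5) = 4, keeping the loop overhead at roughly 5%
  // of the loop cost.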
5017 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5018 << "LV: IC is " << IC << '\n' 5019 << "LV: VF is " << VF << '\n'); 5020 const bool AggressivelyInterleaveReductions = 5021 TTI.enableAggressiveInterleaving(HasReductions); 5022 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5023 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5024 // We assume that the cost overhead is 1 and we use the cost model 5025 // to estimate the cost of the loop and interleave until the cost of the 5026 // loop overhead is about 5% of the cost of the loop. 5027 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5028 SmallLoopCost / *LoopCost.getValue())); 5029 5030 // Interleave until store/load ports (estimated by max interleave count) are 5031 // saturated. 5032 unsigned NumStores = Legal->getNumStores(); 5033 unsigned NumLoads = Legal->getNumLoads(); 5034 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5035 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5036 5037 // There is little point in interleaving for reductions containing selects 5038 // and compares when VF=1 since it may just create more overhead than it's 5039 // worth for loops with small trip counts. This is because we still have to 5040 // do the final reduction after the loop. 5041 bool HasSelectCmpReductions = 5042 HasReductions && 5043 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5044 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5045 return RecurrenceDescriptor::isAnyOfRecurrenceKind( 5046 RdxDesc.getRecurrenceKind()); 5047 }); 5048 if (HasSelectCmpReductions) { 5049 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5050 return 1; 5051 } 5052 5053 // If we have a scalar reduction (vector reductions are already dealt with 5054 // by this point), we can increase the critical path length if the loop 5055 // we're interleaving is inside another loop. For tree-wise reductions 5056 // set the limit to 2, and for ordered reductions it's best to disable 5057 // interleaving entirely. 5058 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5059 bool HasOrderedReductions = 5060 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5061 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5062 return RdxDesc.isOrdered(); 5063 }); 5064 if (HasOrderedReductions) { 5065 LLVM_DEBUG( 5066 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5067 return 1; 5068 } 5069 5070 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5071 SmallIC = std::min(SmallIC, F); 5072 StoresIC = std::min(StoresIC, F); 5073 LoadsIC = std::min(LoadsIC, F); 5074 } 5075 5076 if (EnableLoadStoreRuntimeInterleave && 5077 std::max(StoresIC, LoadsIC) > SmallIC) { 5078 LLVM_DEBUG( 5079 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5080 return std::max(StoresIC, LoadsIC); 5081 } 5082 5083 // If there are scalar reductions and TTI has enabled aggressive 5084 // interleaving for reductions, we will interleave to expose ILP. 5085 if (VF.isScalar() && AggressivelyInterleaveReductions) { 5086 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5087 // Interleave no less than SmallIC but not as aggressive as the normal IC 5088 // to satisfy the rare situation when resources are too limited. 
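      // For illustration (hypothetical values): with IC = 8 and SmallIC = 2
      // the next statement interleaves by max(8 / 2, 2) = 4: at least SmallIC,
      // but less aggressive than the full register-pressure-based IC.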
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multi-map that holds the list of
  // intervals that *end* at a specific location. This multi-map allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multi-map then we remove it from the set. The max register
  // usage is the maximum size of the set. We also search for instructions that
  // are defined outside the loop, but are used inside the loop. We need this
  // number separately from the max-interval usage number because when we
  // unroll, loop-invariant values do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instructions that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are defined outside
  // the loop (not including non-instruction values such as arguments and
  // constants).
  SmallSetVector<Instruction *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        // FIXME: Might need some motivation why these values are ignored. If
        // for example an argument is used inside the loop it will increase the
        // register pressure (so shouldn't we add it to LoopInvariants?).
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
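        // For illustration: given %a = add i32 %b, %inv where %inv is defined
        // before the loop, %inv is collected in LoopInvariants below, while a
        // loop-defined operand such as %b has its interval end point advanced
        // to the current instruction index.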
5163 if (!TheLoop->contains(Instr)) { 5164 LoopInvariants.insert(Instr); 5165 continue; 5166 } 5167 5168 // Overwrite previous end points. 5169 EndPoint[Instr] = IdxToInstr.size(); 5170 Ends.insert(Instr); 5171 } 5172 } 5173 } 5174 5175 // Saves the list of intervals that end with the index in 'key'. 5176 using InstrList = SmallVector<Instruction *, 2>; 5177 DenseMap<unsigned, InstrList> TransposeEnds; 5178 5179 // Transpose the EndPoints to a list of values that end at each index. 5180 for (auto &Interval : EndPoint) 5181 TransposeEnds[Interval.second].push_back(Interval.first); 5182 5183 SmallPtrSet<Instruction *, 8> OpenIntervals; 5184 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5185 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5186 5187 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5188 5189 const auto &TTICapture = TTI; 5190 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5191 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5192 return 0; 5193 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5194 }; 5195 5196 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5197 Instruction *I = IdxToInstr[i]; 5198 5199 // Remove all of the instructions that end at this location. 5200 InstrList &List = TransposeEnds[i]; 5201 for (Instruction *ToRemove : List) 5202 OpenIntervals.erase(ToRemove); 5203 5204 // Ignore instructions that are never used within the loop. 5205 if (!Ends.count(I)) 5206 continue; 5207 5208 // Skip ignored values. 5209 if (ValuesToIgnore.count(I)) 5210 continue; 5211 5212 collectInLoopReductions(); 5213 5214 // For each VF find the maximum usage of registers. 5215 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5216 // Count the number of registers used, per register class, given all open 5217 // intervals. 5218 // Note that elements in this SmallMapVector will be default constructed 5219 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 5220 // there is no previous entry for ClassID. 5221 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5222 5223 if (VFs[j].isScalar()) { 5224 for (auto *Inst : OpenIntervals) { 5225 unsigned ClassID = 5226 TTI.getRegisterClassForType(false, Inst->getType()); 5227 // FIXME: The target might use more than one register for the type 5228 // even in the scalar case. 5229 RegUsage[ClassID] += 1; 5230 } 5231 } else { 5232 collectUniformsAndScalars(VFs[j]); 5233 for (auto *Inst : OpenIntervals) { 5234 // Skip ignored values for VF > 1. 5235 if (VecValuesToIgnore.count(Inst)) 5236 continue; 5237 if (isScalarAfterVectorization(Inst, VFs[j])) { 5238 unsigned ClassID = 5239 TTI.getRegisterClassForType(false, Inst->getType()); 5240 // FIXME: The target might use more than one register for the type 5241 // even in the scalar case. 5242 RegUsage[ClassID] += 1; 5243 } else { 5244 unsigned ClassID = 5245 TTI.getRegisterClassForType(true, Inst->getType()); 5246 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5247 } 5248 } 5249 } 5250 5251 for (auto& pair : RegUsage) { 5252 auto &Entry = MaxUsages[j][pair.first]; 5253 Entry = std::max(Entry, pair.second); 5254 } 5255 } 5256 5257 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5258 << OpenIntervals.size() << '\n'); 5259 5260 // Add the current instruction to the list of open intervals. 
5261 OpenIntervals.insert(I); 5262 } 5263 5264 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5265 // Note that elements in this SmallMapVector will be default constructed 5266 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 5267 // there is no previous entry for ClassID. 5268 SmallMapVector<unsigned, unsigned, 4> Invariant; 5269 5270 for (auto *Inst : LoopInvariants) { 5271 // FIXME: The target might use more than one register for the type 5272 // even in the scalar case. 5273 bool IsScalar = all_of(Inst->users(), [&](User *U) { 5274 auto *I = cast<Instruction>(U); 5275 return TheLoop != LI->getLoopFor(I->getParent()) || 5276 isScalarAfterVectorization(I, VFs[i]); 5277 }); 5278 5279 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; 5280 unsigned ClassID = 5281 TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); 5282 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); 5283 } 5284 5285 LLVM_DEBUG({ 5286 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5287 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5288 << " item\n"; 5289 for (const auto &pair : MaxUsages[i]) { 5290 dbgs() << "LV(REG): RegisterClass: " 5291 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5292 << " registers\n"; 5293 } 5294 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5295 << " item\n"; 5296 for (const auto &pair : Invariant) { 5297 dbgs() << "LV(REG): RegisterClass: " 5298 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5299 << " registers\n"; 5300 } 5301 }); 5302 5303 RU.LoopInvariantRegs = Invariant; 5304 RU.MaxLocalUsers = MaxUsages[i]; 5305 RUs[i] = RU; 5306 } 5307 5308 return RUs; 5309 } 5310 5311 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 5312 ElementCount VF) { 5313 // TODO: Cost model for emulated masked load/store is completely 5314 // broken. This hack guides the cost model to use an artificially 5315 // high enough value to practically disable vectorization with such 5316 // operations, except where previously deployed legality hack allowed 5317 // using very low cost values. This is to avoid regressions coming simply 5318 // from moving "masked load/store" check from legality to cost model. 5319 // Masked Load/Gather emulation was previously never allowed. 5320 // Limited number of Masked Store/Scatter emulation was allowed. 5321 assert((isPredicatedInst(I)) && 5322 "Expecting a scalar emulated instruction"); 5323 return isa<LoadInst>(I) || 5324 (isa<StoreInst>(I) && 5325 NumPredStores > NumberOfStoresToPredicate); 5326 } 5327 5328 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5329 // If we aren't vectorizing the loop, or if we've already collected the 5330 // instructions to scalarize, there's nothing to do. Collection may already 5331 // have occurred if we have a user-selected VF and are now computing the 5332 // expected cost for interleaving. 5333 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) 5334 return; 5335 5336 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5337 // not profitable to scalarize any instructions, the presence of VF in the 5338 // map will indicate that we've analyzed it already. 5339 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5340 5341 PredicatedBBsAfterVectorization[VF].clear(); 5342 5343 // Find all the instructions that are scalar with predication in the loop and 5344 // determine if it would be better to not if-convert the blocks they are in. 
5345 // If so, we also record the instructions to scalarize. 5346 for (BasicBlock *BB : TheLoop->blocks()) { 5347 if (!blockNeedsPredicationForAnyReason(BB)) 5348 continue; 5349 for (Instruction &I : *BB) 5350 if (isScalarWithPredication(&I, VF)) { 5351 ScalarCostsTy ScalarCosts; 5352 // Do not apply discount logic for: 5353 // 1. Scalars after vectorization, as there will only be a single copy 5354 // of the instruction. 5355 // 2. Scalable VF, as that would lead to invalid scalarization costs. 5356 // 3. Emulated masked memrefs, if a hacked cost is needed. 5357 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() && 5358 !useEmulatedMaskMemRefHack(&I, VF) && 5359 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5360 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5361 // Remember that BB will remain after vectorization. 5362 PredicatedBBsAfterVectorization[VF].insert(BB); 5363 for (auto *Pred : predecessors(BB)) { 5364 if (Pred->getSingleSuccessor() == BB) 5365 PredicatedBBsAfterVectorization[VF].insert(Pred); 5366 } 5367 } 5368 } 5369 } 5370 5371 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 5372 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 5373 assert(!isUniformAfterVectorization(PredInst, VF) && 5374 "Instruction marked uniform-after-vectorization will be predicated"); 5375 5376 // Initialize the discount to zero, meaning that the scalar version and the 5377 // vector version cost the same. 5378 InstructionCost Discount = 0; 5379 5380 // Holds instructions to analyze. The instructions we visit are mapped in 5381 // ScalarCosts. Those instructions are the ones that would be scalarized if 5382 // we find that the scalar version costs less. 5383 SmallVector<Instruction *, 8> Worklist; 5384 5385 // Returns true if the given instruction can be scalarized. 5386 auto canBeScalarized = [&](Instruction *I) -> bool { 5387 // We only attempt to scalarize instructions forming a single-use chain 5388 // from the original predicated block that would otherwise be vectorized. 5389 // Although not strictly necessary, we give up on instructions we know will 5390 // already be scalar to avoid traversing chains that are unlikely to be 5391 // beneficial. 5392 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5393 isScalarAfterVectorization(I, VF)) 5394 return false; 5395 5396 // If the instruction is scalar with predication, it will be analyzed 5397 // separately. We ignore it within the context of PredInst. 5398 if (isScalarWithPredication(I, VF)) 5399 return false; 5400 5401 // If any of the instruction's operands are uniform after vectorization, 5402 // the instruction cannot be scalarized. This prevents, for example, a 5403 // masked load from being scalarized. 5404 // 5405 // We assume we will only emit a value for lane zero of an instruction 5406 // marked uniform after vectorization, rather than VF identical values. 5407 // Thus, if we scalarize an instruction that uses a uniform, we would 5408 // create uses of values corresponding to the lanes we aren't emitting code 5409 // for. This behavior can be changed by allowing getScalarValue to clone 5410 // the lane zero values for uniforms rather than asserting. 5411 for (Use &U : I->operands()) 5412 if (auto *J = dyn_cast<Instruction>(U.get())) 5413 if (isUniformAfterVectorization(J, VF)) 5414 return false; 5415 5416 // Otherwise, we can scalarize the instruction. 
5417 return true; 5418 }; 5419 5420 // Compute the expected cost discount from scalarizing the entire expression 5421 // feeding the predicated instruction. We currently only consider expressions 5422 // that are single-use instruction chains. 5423 Worklist.push_back(PredInst); 5424 while (!Worklist.empty()) { 5425 Instruction *I = Worklist.pop_back_val(); 5426 5427 // If we've already analyzed the instruction, there's nothing to do. 5428 if (ScalarCosts.contains(I)) 5429 continue; 5430 5431 // Compute the cost of the vector instruction. Note that this cost already 5432 // includes the scalarization overhead of the predicated instruction. 5433 InstructionCost VectorCost = getInstructionCost(I, VF); 5434 5435 // Compute the cost of the scalarized instruction. This cost is the cost of 5436 // the instruction as if it wasn't if-converted and instead remained in the 5437 // predicated block. We will scale this cost by block probability after 5438 // computing the scalarization overhead. 5439 InstructionCost ScalarCost = 5440 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1)); 5441 5442 // Compute the scalarization overhead of needed insertelement instructions 5443 // and phi nodes. 5444 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5445 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 5446 ScalarCost += TTI.getScalarizationOverhead( 5447 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5448 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 5449 /*Extract*/ false, CostKind); 5450 ScalarCost += 5451 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 5452 } 5453 5454 // Compute the scalarization overhead of needed extractelement 5455 // instructions. For each of the instruction's operands, if the operand can 5456 // be scalarized, add it to the worklist; otherwise, account for the 5457 // overhead. 5458 for (Use &U : I->operands()) 5459 if (auto *J = dyn_cast<Instruction>(U.get())) { 5460 assert(VectorType::isValidElementType(J->getType()) && 5461 "Instruction has non-scalar type"); 5462 if (canBeScalarized(J)) 5463 Worklist.push_back(J); 5464 else if (needsExtract(J, VF)) { 5465 ScalarCost += TTI.getScalarizationOverhead( 5466 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5467 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 5468 /*Extract*/ true, CostKind); 5469 } 5470 } 5471 5472 // Scale the total scalar cost by block probability. 5473 ScalarCost /= getReciprocalPredBlockProb(); 5474 5475 // Compute the discount. A non-negative discount means the vector version 5476 // of the instruction costs more, and scalarizing would be beneficial. 5477 Discount += VectorCost - ScalarCost; 5478 ScalarCosts[I] = ScalarCost; 5479 } 5480 5481 return Discount; 5482 } 5483 5484 InstructionCost LoopVectorizationCostModel::expectedCost( 5485 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 5486 InstructionCost Cost; 5487 5488 // For each block. 5489 for (BasicBlock *BB : TheLoop->blocks()) { 5490 InstructionCost BlockCost; 5491 5492 // For each instruction in the old loop. 5493 for (Instruction &I : BB->instructionsWithoutDebug()) { 5494 // Skip ignored values. 5495 if (ValuesToIgnore.count(&I) || 5496 (VF.isVector() && VecValuesToIgnore.count(&I))) 5497 continue; 5498 5499 InstructionCost C = getInstructionCost(&I, VF); 5500 5501 // Check if we should override the cost. 
5502 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) 5503 C = InstructionCost(ForceTargetInstructionCost); 5504 5505 // Keep a list of instructions with invalid costs. 5506 if (Invalid && !C.isValid()) 5507 Invalid->emplace_back(&I, VF); 5508 5509 BlockCost += C; 5510 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " 5511 << VF << " For instruction: " << I << '\n'); 5512 } 5513 5514 // If we are vectorizing a predicated block, it will have been 5515 // if-converted. This means that the block's instructions (aside from 5516 // stores and instructions that may divide by zero) will now be 5517 // unconditionally executed. For the scalar case, we may not always execute 5518 // the predicated block, if it is an if-else block. Thus, scale the block's 5519 // cost by the probability of executing it. blockNeedsPredication from 5520 // Legal is used so as to not include all blocks in tail folded loops. 5521 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 5522 BlockCost /= getReciprocalPredBlockProb(); 5523 5524 Cost += BlockCost; 5525 } 5526 5527 return Cost; 5528 } 5529 5530 /// Gets Address Access SCEV after verifying that the access pattern 5531 /// is loop invariant except the induction variable dependence. 5532 /// 5533 /// This SCEV can be sent to the Target in order to estimate the address 5534 /// calculation cost. 5535 static const SCEV *getAddressAccessSCEV( 5536 Value *Ptr, 5537 LoopVectorizationLegality *Legal, 5538 PredicatedScalarEvolution &PSE, 5539 const Loop *TheLoop) { 5540 5541 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5542 if (!Gep) 5543 return nullptr; 5544 5545 // We are looking for a gep with all loop invariant indices except for one 5546 // which should be an induction variable. 5547 auto SE = PSE.getSE(); 5548 unsigned NumOperands = Gep->getNumOperands(); 5549 for (unsigned i = 1; i < NumOperands; ++i) { 5550 Value *Opd = Gep->getOperand(i); 5551 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5552 !Legal->isInductionVariable(Opd)) 5553 return nullptr; 5554 } 5555 5556 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5557 return PSE.getSCEV(Ptr); 5558 } 5559 5560 InstructionCost 5561 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5562 ElementCount VF) { 5563 assert(VF.isVector() && 5564 "Scalarization cost of instruction implies vectorization."); 5565 if (VF.isScalable()) 5566 return InstructionCost::getInvalid(); 5567 5568 Type *ValTy = getLoadStoreType(I); 5569 auto SE = PSE.getSE(); 5570 5571 unsigned AS = getLoadStoreAddressSpace(I); 5572 Value *Ptr = getLoadStorePointerOperand(I); 5573 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5574 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 5575 // that it is being called from this specific place. 5576 5577 // Figure out whether the access is strided and get the stride value 5578 // if it's known in compile time 5579 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5580 5581 // Get the cost of the scalar memory instruction and address computation. 5582 InstructionCost Cost = 5583 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5584 5585 // Don't pass *I here, since it is scalar but will actually be part of a 5586 // vectorized loop where the user of it is a vectorized instruction. 
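// Rough illustration of what gets assembled below for a predicated load at
// VF = 4 (a sketch, not a formula the code computes in one step):
//   (4 * address computation + 4 * scalar load + insert/extract overhead)
//   scaled by the probability of executing the predicated block, plus
//   extracts of the 4 i1 mask lanes and a branch cost, unless the
//   emulated-masked-memref hack below pins the cost artificially high.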
5587 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5588 const Align Alignment = getLoadStoreAlignment(I); 5589 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 5590 ValTy->getScalarType(), 5591 Alignment, AS, CostKind); 5592 5593 // Get the overhead of the extractelement and insertelement instructions 5594 // we might create due to scalarization. 5595 Cost += getScalarizationOverhead(I, VF, CostKind); 5596 5597 // If we have a predicated load/store, it will need extra i1 extracts and 5598 // conditional branches, but may not be executed for each vector lane. Scale 5599 // the cost by the probability of executing the predicated block. 5600 if (isPredicatedInst(I)) { 5601 Cost /= getReciprocalPredBlockProb(); 5602 5603 // Add the cost of an i1 extract and a branch 5604 auto *Vec_i1Ty = 5605 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 5606 Cost += TTI.getScalarizationOverhead( 5607 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 5608 /*Insert=*/false, /*Extract=*/true, CostKind); 5609 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 5610 5611 if (useEmulatedMaskMemRefHack(I, VF)) 5612 // Artificially setting to a high enough value to practically disable 5613 // vectorization with such operations. 5614 Cost = 3000000; 5615 } 5616 5617 return Cost; 5618 } 5619 5620 InstructionCost 5621 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5622 ElementCount VF) { 5623 Type *ValTy = getLoadStoreType(I); 5624 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5625 Value *Ptr = getLoadStorePointerOperand(I); 5626 unsigned AS = getLoadStoreAddressSpace(I); 5627 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 5628 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5629 5630 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5631 "Stride should be 1 or -1 for consecutive memory access"); 5632 const Align Alignment = getLoadStoreAlignment(I); 5633 InstructionCost Cost = 0; 5634 if (Legal->isMaskRequired(I)) { 5635 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5636 CostKind); 5637 } else { 5638 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 5639 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 5640 CostKind, OpInfo, I); 5641 } 5642 5643 bool Reverse = ConsecutiveStride < 0; 5644 if (Reverse) 5645 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 5646 std::nullopt, CostKind, 0); 5647 return Cost; 5648 } 5649 5650 InstructionCost 5651 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5652 ElementCount VF) { 5653 assert(Legal->isUniformMemOp(*I, VF)); 5654 5655 Type *ValTy = getLoadStoreType(I); 5656 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5657 const Align Alignment = getLoadStoreAlignment(I); 5658 unsigned AS = getLoadStoreAddressSpace(I); 5659 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5660 if (isa<LoadInst>(I)) { 5661 return TTI.getAddressComputationCost(ValTy) + 5662 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 5663 CostKind) + 5664 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5665 } 5666 StoreInst *SI = cast<StoreInst>(I); 5667 5668 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 5669 return TTI.getAddressComputationCost(ValTy) + 5670 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 5671 CostKind) + 5672 (isLoopInvariantStoreValue 5673 ? 
0 5674 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5675 CostKind, VF.getKnownMinValue() - 1)); 5676 } 5677 5678 InstructionCost 5679 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5680 ElementCount VF) { 5681 Type *ValTy = getLoadStoreType(I); 5682 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5683 const Align Alignment = getLoadStoreAlignment(I); 5684 const Value *Ptr = getLoadStorePointerOperand(I); 5685 5686 return TTI.getAddressComputationCost(VectorTy) + 5687 TTI.getGatherScatterOpCost( 5688 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 5689 TargetTransformInfo::TCK_RecipThroughput, I); 5690 } 5691 5692 InstructionCost 5693 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5694 ElementCount VF) { 5695 Type *ValTy = getLoadStoreType(I); 5696 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 5697 unsigned AS = getLoadStoreAddressSpace(I); 5698 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5699 5700 auto Group = getInterleavedAccessGroup(I); 5701 assert(Group && "Fail to get an interleaved access group."); 5702 5703 unsigned InterleaveFactor = Group->getFactor(); 5704 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5705 5706 // Holds the indices of existing members in the interleaved group. 5707 SmallVector<unsigned, 4> Indices; 5708 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 5709 if (Group->getMember(IF)) 5710 Indices.push_back(IF); 5711 5712 // Calculate the cost of the whole interleaved group. 5713 bool UseMaskForGaps = 5714 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 5715 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 5716 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 5717 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 5718 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); 5719 5720 if (Group->isReverse()) { 5721 // TODO: Add support for reversed masked interleaved access. 5722 assert(!Legal->isMaskRequired(I) && 5723 "Reverse masked interleaved access not supported."); 5724 Cost += Group->getNumMembers() * 5725 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 5726 std::nullopt, CostKind, 0); 5727 } 5728 return Cost; 5729 } 5730 5731 std::optional<InstructionCost> 5732 LoopVectorizationCostModel::getReductionPatternCost( 5733 Instruction *I, ElementCount VF, Type *Ty, 5734 TTI::TargetCostKind CostKind) const { 5735 using namespace llvm::PatternMatch; 5736 // Early exit for no inloop reductions 5737 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 5738 return std::nullopt; 5739 auto *VectorTy = cast<VectorType>(Ty); 5740 5741 // We are looking for one of the following patterns, finding the minimal acceptable cost: 5742 // reduce(mul(ext(A), ext(B))) or 5743 // reduce(mul(A, B)) or 5744 // reduce(ext(A)) or 5745 // reduce(A). 5746 // The basic idea is that we walk down the tree to do that, finding the root 5747 // reduction instruction in InLoopReductionImmediateChains. From there we find 5748 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 5749 // of the components. If the reduction cost is lower, then we return it for the 5750 // reduction instruction and 0 for the other instructions in the pattern. If 5751 // it is not, we return an invalid cost specifying the original cost method 5752 // should be used.
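// For illustration only (a sketch, not IR the matcher requires verbatim),
// the reduce.add(mul(ext(A), ext(B))) case typically looks like:
//   %a.ext = sext <16 x i8> %a to <16 x i32>
//   %b.ext = sext <16 x i8> %b to <16 x i32>
//   %mul   = mul nsw <16 x i32> %a.ext, %b.ext
//   ... in-loop reduce.add of %mul into the scalar accumulator ...
// Many targets lower this whole chain to a dot-product style
// multiply-accumulate reduction, which is why getMulAccReductionCost is
// queried below instead of summing the individual ext/mul/reduce costs.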
5753 Instruction *RetI = I; 5754 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 5755 if (!RetI->hasOneUser()) 5756 return std::nullopt; 5757 RetI = RetI->user_back(); 5758 } 5759 5760 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 5761 RetI->user_back()->getOpcode() == Instruction::Add) { 5762 RetI = RetI->user_back(); 5763 } 5764 5765 // Test if the found instruction is a reduction, and if not return an invalid 5766 // cost specifying the parent to use the original cost modelling. 5767 if (!InLoopReductionImmediateChains.count(RetI)) 5768 return std::nullopt; 5769 5770 // Find the reduction this chain is a part of and calculate the basic cost of 5771 // the reduction on its own. 5772 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); 5773 Instruction *ReductionPhi = LastChain; 5774 while (!isa<PHINode>(ReductionPhi)) 5775 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); 5776 5777 const RecurrenceDescriptor &RdxDesc = 5778 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 5779 5780 InstructionCost BaseCost; 5781 RecurKind RK = RdxDesc.getRecurrenceKind(); 5782 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 5783 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK); 5784 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy, 5785 RdxDesc.getFastMathFlags(), CostKind); 5786 } else { 5787 BaseCost = TTI.getArithmeticReductionCost( 5788 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 5789 } 5790 5791 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 5792 // normal fmul instruction to the cost of the fadd reduction. 5793 if (RK == RecurKind::FMulAdd) 5794 BaseCost += 5795 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 5796 5797 // If we're using ordered reductions then we can just return the base cost 5798 // here, since getArithmeticReductionCost calculates the full ordered 5799 // reduction cost when FP reassociation is not allowed. 5800 if (useOrderedReductions(RdxDesc)) 5801 return BaseCost; 5802 5803 // Get the operand that was not the reduction chain and match it to one of the 5804 // patterns, returning the better cost if it is found. 5805 Instruction *RedOp = RetI->getOperand(1) == LastChain 5806 ? dyn_cast<Instruction>(RetI->getOperand(0)) 5807 : dyn_cast<Instruction>(RetI->getOperand(1)); 5808 5809 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 5810 5811 Instruction *Op0, *Op1; 5812 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 5813 match(RedOp, 5814 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 5815 match(Op0, m_ZExtOrSExt(m_Value())) && 5816 Op0->getOpcode() == Op1->getOpcode() && 5817 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 5818 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 5819 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 5820 5821 // Matched reduce.add(ext(mul(ext(A), ext(B))) 5822 // Note that the extend opcodes need to all match, or if A==B they will have 5823 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 5824 // which is equally fine. 
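// The comparison a few lines below is, roughly:
//   RedCost < 2 * ExtCost + MulCost + Ext2Cost + BaseCost
// i.e. the fused multiply-accumulate reduction versus the sum of the
// separate extend, multiply, outer-extend and plain reduction steps it
// would replace.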
5825 bool IsUnsigned = isa<ZExtInst>(Op0); 5826 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 5827 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 5828 5829 InstructionCost ExtCost = 5830 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 5831 TTI::CastContextHint::None, CostKind, Op0); 5832 InstructionCost MulCost = 5833 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 5834 InstructionCost Ext2Cost = 5835 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 5836 TTI::CastContextHint::None, CostKind, RedOp); 5837 5838 InstructionCost RedCost = TTI.getMulAccReductionCost( 5839 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 5840 5841 if (RedCost.isValid() && 5842 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 5843 return I == RetI ? RedCost : 0; 5844 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 5845 !TheLoop->isLoopInvariant(RedOp)) { 5846 // Matched reduce(ext(A)) 5847 bool IsUnsigned = isa<ZExtInst>(RedOp); 5848 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 5849 InstructionCost RedCost = TTI.getExtendedReductionCost( 5850 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 5851 RdxDesc.getFastMathFlags(), CostKind); 5852 5853 InstructionCost ExtCost = 5854 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 5855 TTI::CastContextHint::None, CostKind, RedOp); 5856 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 5857 return I == RetI ? RedCost : 0; 5858 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 5859 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 5860 if (match(Op0, m_ZExtOrSExt(m_Value())) && 5861 Op0->getOpcode() == Op1->getOpcode() && 5862 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 5863 bool IsUnsigned = isa<ZExtInst>(Op0); 5864 Type *Op0Ty = Op0->getOperand(0)->getType(); 5865 Type *Op1Ty = Op1->getOperand(0)->getType(); 5866 Type *LargestOpTy = 5867 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 5868 : Op0Ty; 5869 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 5870 5871 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 5872 // different sizes. We take the largest type as the ext to reduce, and add 5873 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 5874 InstructionCost ExtCost0 = TTI.getCastInstrCost( 5875 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 5876 TTI::CastContextHint::None, CostKind, Op0); 5877 InstructionCost ExtCost1 = TTI.getCastInstrCost( 5878 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 5879 TTI::CastContextHint::None, CostKind, Op1); 5880 InstructionCost MulCost = 5881 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 5882 5883 InstructionCost RedCost = TTI.getMulAccReductionCost( 5884 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 5885 InstructionCost ExtraExtCost = 0; 5886 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 5887 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 5888 ExtraExtCost = TTI.getCastInstrCost( 5889 ExtraExtOp->getOpcode(), ExtType, 5890 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 5891 TTI::CastContextHint::None, CostKind, ExtraExtOp); 5892 } 5893 5894 if (RedCost.isValid() && 5895 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 5896 return I == RetI ? 
RedCost : 0; 5897 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 5898 // Matched reduce.add(mul()) 5899 InstructionCost MulCost = 5900 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 5901 5902 InstructionCost RedCost = TTI.getMulAccReductionCost( 5903 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 5904 5905 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 5906 return I == RetI ? RedCost : 0; 5907 } 5908 } 5909 5910 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 5911 } 5912 5913 InstructionCost 5914 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5915 ElementCount VF) { 5916 // Calculate scalar cost only. Vectorization cost should be ready at this 5917 // moment. 5918 if (VF.isScalar()) { 5919 Type *ValTy = getLoadStoreType(I); 5920 const Align Alignment = getLoadStoreAlignment(I); 5921 unsigned AS = getLoadStoreAddressSpace(I); 5922 5923 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 5924 return TTI.getAddressComputationCost(ValTy) + 5925 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 5926 TTI::TCK_RecipThroughput, OpInfo, I); 5927 } 5928 return getWideningCost(I, VF); 5929 } 5930 5931 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 5932 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 5933 5934 // There is no mechanism yet to create a scalable scalarization loop, 5935 // so this is currently Invalid. 5936 if (VF.isScalable()) 5937 return InstructionCost::getInvalid(); 5938 5939 if (VF.isScalar()) 5940 return 0; 5941 5942 InstructionCost Cost = 0; 5943 Type *RetTy = ToVectorTy(I->getType(), VF); 5944 if (!RetTy->isVoidTy() && 5945 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5946 Cost += TTI.getScalarizationOverhead( 5947 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), 5948 /*Insert*/ true, 5949 /*Extract*/ false, CostKind); 5950 5951 // Some targets keep addresses scalar. 5952 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5953 return Cost; 5954 5955 // Some targets support efficient element stores. 5956 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5957 return Cost; 5958 5959 // Collect operands to consider. 5960 CallInst *CI = dyn_cast<CallInst>(I); 5961 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 5962 5963 // Skip operands that do not require extraction/scalarization and do not incur 5964 // any overhead. 5965 SmallVector<Type *> Tys; 5966 for (auto *V : filterExtractingOperands(Ops, VF)) 5967 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 5968 return Cost + TTI.getOperandsScalarizationOverhead( 5969 filterExtractingOperands(Ops, VF), Tys, CostKind); 5970 } 5971 5972 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 5973 if (VF.isScalar()) 5974 return; 5975 NumPredStores = 0; 5976 for (BasicBlock *BB : TheLoop->blocks()) { 5977 // For each instruction in the old loop. 5978 for (Instruction &I : *BB) { 5979 Value *Ptr = getLoadStorePointerOperand(&I); 5980 if (!Ptr) 5981 continue; 5982 5983 // TODO: We should generate better code and update the cost model for 5984 // predicated uniform stores. Today they are treated as any other 5985 // predicated store (see added test cases in 5986 // invariant-store-vectorization.ll). 
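// For example (an illustrative sketch, not a specially handled case today):
//   for (i = 0; i < n; i++)
//     if (cond[i])
//       *p = x[i];   // masked store to a loop-invariant address
// is costed like any other predicated store and counted in NumPredStores
// below.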
5987 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 5988 NumPredStores++; 5989 5990 if (Legal->isUniformMemOp(I, VF)) { 5991 auto isLegalToScalarize = [&]() { 5992 if (!VF.isScalable()) 5993 // Scalarization of fixed length vectors "just works". 5994 return true; 5995 5996 // We have dedicated lowering for unpredicated uniform loads and 5997 // stores. Note that even with tail folding we know that at least 5998 // one lane is active (i.e. generalized predication is not possible 5999 // here), and the logic below depends on this fact. 6000 if (!foldTailByMasking()) 6001 return true; 6002 6003 // For scalable vectors, a uniform memop load is always 6004 // uniform-by-parts and we know how to scalarize that. 6005 if (isa<LoadInst>(I)) 6006 return true; 6007 6008 // A uniform store isn't necessarily uniform-by-parts 6009 // and we can't assume scalarization. 6010 auto &SI = cast<StoreInst>(I); 6011 return TheLoop->isLoopInvariant(SI.getValueOperand()); 6012 }; 6013 6014 const InstructionCost GatherScatterCost = 6015 isLegalGatherOrScatter(&I, VF) ? 6016 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); 6017 6018 // Load: Scalar load + broadcast 6019 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6020 // FIXME: This cost is a significant under-estimate for tail folded 6021 // memory ops. 6022 const InstructionCost ScalarizationCost = isLegalToScalarize() ? 6023 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid(); 6024 6025 // Choose the better solution for the current VF. Note that Invalid 6026 // costs compare as maximally large. If both are invalid, we record an 6027 // invalid cost, which signals a failure and a vectorization abort. 6028 if (GatherScatterCost < ScalarizationCost) 6029 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 6030 else 6031 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 6032 continue; 6033 } 6034 6035 // We assume that widening is the best solution when possible. 6036 if (memoryInstructionCanBeWidened(&I, VF)) { 6037 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6038 int ConsecutiveStride = Legal->isConsecutivePtr( 6039 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6040 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6041 "Expected consecutive stride."); 6042 InstWidening Decision = 6043 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6044 setWideningDecision(&I, VF, Decision, Cost); 6045 continue; 6046 } 6047 6048 // Choose between Interleaving, Gather/Scatter or Scalarization. 6049 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6050 unsigned NumAccesses = 1; 6051 if (isAccessInterleaved(&I)) { 6052 auto Group = getInterleavedAccessGroup(&I); 6053 assert(Group && "Fail to get an interleaved access group."); 6054 6055 // Make one decision for the whole group. 6056 if (getWideningDecision(&I, VF) != CM_Unknown) 6057 continue; 6058 6059 NumAccesses = Group->getNumMembers(); 6060 if (interleavedAccessCanBeWidened(&I, VF)) 6061 InterleaveCost = getInterleaveGroupCost(&I, VF); 6062 } 6063 6064 InstructionCost GatherScatterCost = 6065 isLegalGatherOrScatter(&I, VF) 6066 ? getGatherScatterCost(&I, VF) * NumAccesses 6067 : InstructionCost::getInvalid(); 6068 6069 InstructionCost ScalarizationCost = 6070 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6071 6072 // Choose the better solution for the current VF; 6073 // write down this decision and use it during vectorization.
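// Illustrative tie-breaking, mirroring the comparisons below: with costs
// Interleave = 8, GatherScatter = 8 and Scalarize = 20 we pick CM_Interleave
// (a tie with gather/scatter favours interleaving, while a gather/scatter vs.
// scalarization tie falls through to scalarization); an Invalid cost compares
// greater than any valid cost, so an unsupported strategy never wins.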
6074 InstructionCost Cost; 6075 InstWidening Decision; 6076 if (InterleaveCost <= GatherScatterCost && 6077 InterleaveCost < ScalarizationCost) { 6078 Decision = CM_Interleave; 6079 Cost = InterleaveCost; 6080 } else if (GatherScatterCost < ScalarizationCost) { 6081 Decision = CM_GatherScatter; 6082 Cost = GatherScatterCost; 6083 } else { 6084 Decision = CM_Scalarize; 6085 Cost = ScalarizationCost; 6086 } 6087 // If the instruction belongs to an interleave group, the whole group 6088 // receives the same decision. The whole group receives the cost, but 6089 // the cost will actually be assigned to one instruction. 6090 if (auto Group = getInterleavedAccessGroup(&I)) 6091 setWideningDecision(Group, VF, Decision, Cost); 6092 else 6093 setWideningDecision(&I, VF, Decision, Cost); 6094 } 6095 } 6096 6097 // Make sure that any load of an address and any other address computation 6098 // remains scalar unless there is gather/scatter support. This avoids 6099 // inevitable extracts into address registers, and also has the benefit of 6100 // activating LSR more, since that pass can't optimize vectorized 6101 // addresses. 6102 if (TTI.prefersVectorizedAddressing()) 6103 return; 6104 6105 // Start with all scalar pointer uses. 6106 SmallPtrSet<Instruction *, 8> AddrDefs; 6107 for (BasicBlock *BB : TheLoop->blocks()) 6108 for (Instruction &I : *BB) { 6109 Instruction *PtrDef = 6110 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6111 if (PtrDef && TheLoop->contains(PtrDef) && 6112 getWideningDecision(&I, VF) != CM_GatherScatter) 6113 AddrDefs.insert(PtrDef); 6114 } 6115 6116 // Add all instructions used to generate the addresses. 6117 SmallVector<Instruction *, 4> Worklist; 6118 append_range(Worklist, AddrDefs); 6119 while (!Worklist.empty()) { 6120 Instruction *I = Worklist.pop_back_val(); 6121 for (auto &Op : I->operands()) 6122 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6123 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6124 AddrDefs.insert(InstOp).second) 6125 Worklist.push_back(InstOp); 6126 } 6127 6128 for (auto *I : AddrDefs) { 6129 if (isa<LoadInst>(I)) { 6130 // Setting the desired widening decision should ideally be handled 6131 // by cost functions, but since this involves the task of finding out 6132 // if the loaded register is involved in an address computation, it is 6133 // instead changed here when we know this is the case. 6134 InstWidening Decision = getWideningDecision(I, VF); 6135 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6136 // Scalarize a widened load of an address. 6137 setWideningDecision( 6138 I, VF, CM_Scalarize, 6139 (VF.getKnownMinValue() * 6140 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6141 else if (auto Group = getInterleavedAccessGroup(I)) { 6142 // Scalarize an interleave group of address loads. 6143 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6144 if (Instruction *Member = Group->getMember(I)) 6145 setWideningDecision( 6146 Member, VF, CM_Scalarize, 6147 (VF.getKnownMinValue() * 6148 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6149 } 6150 } 6151 } else 6152 // Make sure I gets scalarized and a cost estimate without 6153 // scalarization overhead.
6154 ForcedScalars[VF].insert(I); 6155 } 6156 } 6157 6158 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { 6159 assert(!VF.isScalar() && 6160 "Trying to set a vectorization decision for a scalar VF"); 6161 6162 for (BasicBlock *BB : TheLoop->blocks()) { 6163 // For each instruction in the old loop. 6164 for (Instruction &I : *BB) { 6165 CallInst *CI = dyn_cast<CallInst>(&I); 6166 6167 if (!CI) 6168 continue; 6169 6170 InstructionCost ScalarCost = InstructionCost::getInvalid(); 6171 InstructionCost VectorCost = InstructionCost::getInvalid(); 6172 InstructionCost IntrinsicCost = InstructionCost::getInvalid(); 6173 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6174 6175 Function *ScalarFunc = CI->getCalledFunction(); 6176 Type *ScalarRetTy = CI->getType(); 6177 SmallVector<Type *, 4> Tys, ScalarTys; 6178 bool MaskRequired = Legal->isMaskRequired(CI); 6179 for (auto &ArgOp : CI->args()) 6180 ScalarTys.push_back(ArgOp->getType()); 6181 6182 // Compute corresponding vector type for return value and arguments. 6183 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 6184 for (Type *ScalarTy : ScalarTys) 6185 Tys.push_back(ToVectorTy(ScalarTy, VF)); 6186 6187 // An in-loop reduction using an fmuladd intrinsic is a special case; 6188 // we don't want the normal cost for that intrinsic. 6189 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 6190 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) { 6191 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, 6192 getVectorIntrinsicIDForCall(CI, TLI), 6193 std::nullopt, *RedCost); 6194 continue; 6195 } 6196 6197 // Estimate cost of scalarized vector call. The source operands are 6198 // assumed to be vectors, so we need to extract individual elements from 6199 // there, execute VF scalar calls, and then gather the result into the 6200 // vector return value. 6201 InstructionCost ScalarCallCost = 6202 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); 6203 6204 // Compute costs of unpacking argument values for the scalar calls and 6205 // packing the return values to a vector. 6206 InstructionCost ScalarizationCost = 6207 getScalarizationOverhead(CI, VF, CostKind); 6208 6209 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 6210 6211 // Find the cost of vectorizing the call, if we can find a suitable 6212 // vector variant of the function. 6213 bool UsesMask = false; 6214 VFInfo FuncInfo; 6215 Function *VecFunc = nullptr; 6216 // Search through any available variants for one we can use at this VF. 6217 for (VFInfo &Info : VFDatabase::getMappings(*CI)) { 6218 // Must match requested VF. 6219 if (Info.Shape.VF != VF) 6220 continue; 6221 6222 // Must take a mask argument if one is required 6223 if (MaskRequired && !Info.isMasked()) 6224 continue; 6225 6226 // Check that all parameter kinds are supported 6227 bool ParamsOk = true; 6228 for (VFParameter Param : Info.Shape.Parameters) { 6229 switch (Param.ParamKind) { 6230 case VFParamKind::Vector: 6231 break; 6232 case VFParamKind::OMP_Uniform: { 6233 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6234 // Make sure the scalar parameter in the loop is invariant. 6235 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), 6236 TheLoop)) 6237 ParamsOk = false; 6238 break; 6239 } 6240 case VFParamKind::OMP_Linear: { 6241 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6242 // Find the stride for the scalar parameter in this loop and see if 6243 // it matches the stride for the variant. 
6244 // TODO: do we need to figure out the cost of an extract to get the 6245 // first lane? Or do we hope that it will be folded away? 6246 ScalarEvolution *SE = PSE.getSE(); 6247 const auto *SAR = 6248 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); 6249 6250 if (!SAR || SAR->getLoop() != TheLoop) { 6251 ParamsOk = false; 6252 break; 6253 } 6254 6255 const SCEVConstant *Step = 6256 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); 6257 6258 if (!Step || 6259 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) 6260 ParamsOk = false; 6261 6262 break; 6263 } 6264 case VFParamKind::GlobalPredicate: 6265 UsesMask = true; 6266 break; 6267 default: 6268 ParamsOk = false; 6269 break; 6270 } 6271 } 6272 6273 if (!ParamsOk) 6274 continue; 6275 6276 // Found a suitable candidate, stop here. 6277 VecFunc = CI->getModule()->getFunction(Info.VectorName); 6278 FuncInfo = Info; 6279 break; 6280 } 6281 6282 // Add in the cost of synthesizing a mask if one wasn't required. 6283 InstructionCost MaskCost = 0; 6284 if (VecFunc && UsesMask && !MaskRequired) 6285 MaskCost = TTI.getShuffleCost( 6286 TargetTransformInfo::SK_Broadcast, 6287 VectorType::get(IntegerType::getInt1Ty( 6288 VecFunc->getFunctionType()->getContext()), 6289 VF)); 6290 6291 if (TLI && VecFunc && !CI->isNoBuiltin()) 6292 VectorCost = 6293 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 6294 6295 // Find the cost of an intrinsic; some targets may have instructions that 6296 // perform the operation without needing an actual call. 6297 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); 6298 if (IID != Intrinsic::not_intrinsic) 6299 IntrinsicCost = getVectorIntrinsicCost(CI, VF); 6300 6301 InstructionCost Cost = ScalarCost; 6302 InstWidening Decision = CM_Scalarize; 6303 6304 if (VectorCost <= Cost) { 6305 Cost = VectorCost; 6306 Decision = CM_VectorCall; 6307 } 6308 6309 if (IntrinsicCost <= Cost) { 6310 Cost = IntrinsicCost; 6311 Decision = CM_IntrinsicCall; 6312 } 6313 6314 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, 6315 FuncInfo.getParamIndexForOptionalMask(), Cost); 6316 } 6317 } 6318 } 6319 6320 InstructionCost 6321 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6322 ElementCount VF) { 6323 // If we know that this instruction will remain uniform, check the cost of 6324 // the scalar version. 6325 if (isUniformAfterVectorization(I, VF)) 6326 VF = ElementCount::getFixed(1); 6327 6328 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6329 return InstsToScalarize[VF][I]; 6330 6331 // Forced scalars do not have any scalarization overhead. 
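// For illustration: a forced-scalar instruction at VF = 4 is charged
//   4 * getInstructionCost(I, ElementCount::getFixed(1))
// by the lookup below, with no insert/extract overhead added on top.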
6332 auto ForcedScalar = ForcedScalars.find(VF); 6333 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6334 auto InstSet = ForcedScalar->second; 6335 if (InstSet.count(I)) 6336 return getInstructionCost(I, ElementCount::getFixed(1)) * 6337 VF.getKnownMinValue(); 6338 } 6339 6340 Type *RetTy = I->getType(); 6341 if (canTruncateToMinimalBitwidth(I, VF)) 6342 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6343 auto SE = PSE.getSE(); 6344 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6345 6346 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 6347 ElementCount VF) -> bool { 6348 if (VF.isScalar()) 6349 return true; 6350 6351 auto Scalarized = InstsToScalarize.find(VF); 6352 assert(Scalarized != InstsToScalarize.end() && 6353 "VF not yet analyzed for scalarization profitability"); 6354 return !Scalarized->second.count(I) && 6355 llvm::all_of(I->users(), [&](User *U) { 6356 auto *UI = cast<Instruction>(U); 6357 return !Scalarized->second.count(UI); 6358 }); 6359 }; 6360 (void) hasSingleCopyAfterVectorization; 6361 6362 Type *VectorTy; 6363 if (isScalarAfterVectorization(I, VF)) { 6364 // With the exception of GEPs and PHIs, after scalarization there should 6365 // only be one copy of the instruction generated in the loop. This is 6366 // because the VF is either 1, or any instructions that need scalarizing 6367 // have already been dealt with by the time we get here. As a result, 6368 // it means we don't have to multiply the instruction cost by VF. 6369 assert(I->getOpcode() == Instruction::GetElementPtr || 6370 I->getOpcode() == Instruction::PHI || 6371 (I->getOpcode() == Instruction::BitCast && 6372 I->getType()->isPointerTy()) || 6373 hasSingleCopyAfterVectorization(I, VF)); 6374 VectorTy = RetTy; 6375 } else 6376 VectorTy = ToVectorTy(RetTy, VF); 6377 6378 if (VF.isVector() && VectorTy->isVectorTy() && 6379 !TTI.getNumberOfParts(VectorTy)) 6380 return InstructionCost::getInvalid(); 6381 6382 // TODO: We need to estimate the cost of intrinsic calls. 6383 switch (I->getOpcode()) { 6384 case Instruction::GetElementPtr: 6385 // We mark this instruction as zero-cost because the cost of GEPs in 6386 // vectorized code depends on whether the corresponding memory instruction 6387 // is scalarized or not. Therefore, we handle GEPs with the memory 6388 // instruction cost. 6389 return 0; 6390 case Instruction::Br: { 6391 // In cases of scalarized and predicated instructions, there will be VF 6392 // predicated blocks in the vectorized loop. Each branch around these 6393 // blocks requires also an extract of its vector compare i1 element. 6394 // Note that the conditional branch from the loop latch will be replaced by 6395 // a single branch controlling the loop, so there is no extra overhead from 6396 // scalarization. 6397 bool ScalarPredicatedBB = false; 6398 BranchInst *BI = cast<BranchInst>(I); 6399 if (VF.isVector() && BI->isConditional() && 6400 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 6401 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) && 6402 BI->getParent() != TheLoop->getLoopLatch()) 6403 ScalarPredicatedBB = true; 6404 6405 if (ScalarPredicatedBB) { 6406 // Not possible to scalarize scalable vector with predicated instructions. 6407 if (VF.isScalable()) 6408 return InstructionCost::getInvalid(); 6409 // Return cost for branches around scalarized and predicated blocks. 
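// Illustration: at VF = 4 this amounts to extracting the four i1 mask lanes
// from the compare vector plus four scalar conditional branches, which is
// exactly what the two terms below add up.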
6410 auto *Vec_i1Ty = 6411 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6412 return ( 6413 TTI.getScalarizationOverhead( 6414 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), 6415 /*Insert*/ false, /*Extract*/ true, CostKind) + 6416 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6417 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6418 // The back-edge branch will remain, as will all scalar branches. 6419 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6420 else 6421 // This branch will be eliminated by if-conversion. 6422 return 0; 6423 // Note: We currently assume zero cost for an unconditional branch inside 6424 // a predicated block since it will become a fall-through, although we 6425 // may decide in the future to call TTI for all branches. 6426 } 6427 case Instruction::PHI: { 6428 auto *Phi = cast<PHINode>(I); 6429 6430 // First-order recurrences are replaced by vector shuffles inside the loop. 6431 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 6432 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the 6433 // penultimate value of the recurrence. 6434 // TODO: Consider vscale_range info. 6435 if (VF.isScalable() && VF.getKnownMinValue() == 1) 6436 return InstructionCost::getInvalid(); 6437 SmallVector<int> Mask(VF.getKnownMinValue()); 6438 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 6439 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 6440 cast<VectorType>(VectorTy), Mask, CostKind, 6441 VF.getKnownMinValue() - 1); 6442 } 6443 6444 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6445 // converted into select instructions. We require N - 1 selects per phi 6446 // node, where N is the number of incoming values. 6447 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6448 return (Phi->getNumIncomingValues() - 1) * 6449 TTI.getCmpSelInstrCost( 6450 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6451 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6452 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6453 6454 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6455 } 6456 case Instruction::UDiv: 6457 case Instruction::SDiv: 6458 case Instruction::URem: 6459 case Instruction::SRem: 6460 if (VF.isVector() && isPredicatedInst(I)) { 6461 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 6462 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 6463 ScalarCost : SafeDivisorCost; 6464 } 6465 // We've proven all lanes safe to speculate, fall through. 6466 [[fallthrough]]; 6467 case Instruction::Add: 6468 case Instruction::FAdd: 6469 case Instruction::Sub: 6470 case Instruction::FSub: 6471 case Instruction::Mul: 6472 case Instruction::FMul: 6473 case Instruction::FDiv: 6474 case Instruction::FRem: 6475 case Instruction::Shl: 6476 case Instruction::LShr: 6477 case Instruction::AShr: 6478 case Instruction::And: 6479 case Instruction::Or: 6480 case Instruction::Xor: { 6481 // If we're speculating on the stride being 1, the multiplication may 6482 // fold away. We can generalize this for all operations using the notion 6483 // of neutral elements. 
(TODO) 6484 if (I->getOpcode() == Instruction::Mul && 6485 (PSE.getSCEV(I->getOperand(0))->isOne() || 6486 PSE.getSCEV(I->getOperand(1))->isOne())) 6487 return 0; 6488 6489 // Detect reduction patterns 6490 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 6491 return *RedCost; 6492 6493 // Certain instructions can be cheaper to vectorize if they have a constant 6494 // second vector operand. One example of this are shifts on x86. 6495 Value *Op2 = I->getOperand(1); 6496 auto Op2Info = TTI.getOperandInfo(Op2); 6497 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 6498 Legal->isInvariant(Op2)) 6499 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 6500 6501 SmallVector<const Value *, 4> Operands(I->operand_values()); 6502 return TTI.getArithmeticInstrCost( 6503 I->getOpcode(), VectorTy, CostKind, 6504 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6505 Op2Info, Operands, I, TLI); 6506 } 6507 case Instruction::FNeg: { 6508 return TTI.getArithmeticInstrCost( 6509 I->getOpcode(), VectorTy, CostKind, 6510 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6511 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 6512 I->getOperand(0), I); 6513 } 6514 case Instruction::Select: { 6515 SelectInst *SI = cast<SelectInst>(I); 6516 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6517 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6518 6519 const Value *Op0, *Op1; 6520 using namespace llvm::PatternMatch; 6521 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 6522 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 6523 // select x, y, false --> x & y 6524 // select x, true, y --> x | y 6525 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 6526 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 6527 assert(Op0->getType()->getScalarSizeInBits() == 1 && 6528 Op1->getType()->getScalarSizeInBits() == 1); 6529 6530 SmallVector<const Value *, 2> Operands{Op0, Op1}; 6531 return TTI.getArithmeticInstrCost( 6532 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 6533 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 6534 } 6535 6536 Type *CondTy = SI->getCondition()->getType(); 6537 if (!ScalarCond) 6538 CondTy = VectorType::get(CondTy, VF); 6539 6540 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 6541 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 6542 Pred = Cmp->getPredicate(); 6543 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 6544 CostKind, I); 6545 } 6546 case Instruction::ICmp: 6547 case Instruction::FCmp: { 6548 Type *ValTy = I->getOperand(0)->getType(); 6549 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6550 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6551 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6552 VectorTy = ToVectorTy(ValTy, VF); 6553 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 6554 cast<CmpInst>(I)->getPredicate(), CostKind, 6555 I); 6556 } 6557 case Instruction::Store: 6558 case Instruction::Load: { 6559 ElementCount Width = VF; 6560 if (Width.isVector()) { 6561 InstWidening Decision = getWideningDecision(I, Width); 6562 assert(Decision != CM_Unknown && 6563 "CM decision should be taken at this point"); 6564 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 6565 return InstructionCost::getInvalid(); 6566 if (Decision == CM_Scalarize) 6567 Width = ElementCount::getFixed(1); 6568 } 6569 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 6570 return getMemoryInstructionCost(I, VF); 6571 } 6572 case Instruction::BitCast: 6573 if (I->getType()->isPointerTy()) 6574 return 0; 6575 [[fallthrough]]; 6576 case Instruction::ZExt: 6577 case Instruction::SExt: 6578 case Instruction::FPToUI: 6579 case Instruction::FPToSI: 6580 case Instruction::FPExt: 6581 case Instruction::PtrToInt: 6582 case Instruction::IntToPtr: 6583 case Instruction::SIToFP: 6584 case Instruction::UIToFP: 6585 case Instruction::Trunc: 6586 case Instruction::FPTrunc: { 6587 // Computes the CastContextHint from a Load/Store instruction. 6588 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6589 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6590 "Expected a load or a store!"); 6591 6592 if (VF.isScalar() || !TheLoop->contains(I)) 6593 return TTI::CastContextHint::Normal; 6594 6595 switch (getWideningDecision(I, VF)) { 6596 case LoopVectorizationCostModel::CM_GatherScatter: 6597 return TTI::CastContextHint::GatherScatter; 6598 case LoopVectorizationCostModel::CM_Interleave: 6599 return TTI::CastContextHint::Interleave; 6600 case LoopVectorizationCostModel::CM_Scalarize: 6601 case LoopVectorizationCostModel::CM_Widen: 6602 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6603 : TTI::CastContextHint::Normal; 6604 case LoopVectorizationCostModel::CM_Widen_Reverse: 6605 return TTI::CastContextHint::Reversed; 6606 case LoopVectorizationCostModel::CM_Unknown: 6607 llvm_unreachable("Instr did not go through cost modelling?"); 6608 case LoopVectorizationCostModel::CM_VectorCall: 6609 case LoopVectorizationCostModel::CM_IntrinsicCall: 6610 llvm_unreachable_internal("Instr has invalid widening decision"); 6611 } 6612 6613 llvm_unreachable("Unhandled case!"); 6614 }; 6615 6616 unsigned Opcode = I->getOpcode(); 6617 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6618 // For Trunc, the context is the only user, which must be a StoreInst. 
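// Illustrative example of the Trunc case handled below (scalar IR from the
// original loop):
//   %t = trunc i32 %x to i8
//   store i8 %t, ptr %p
// Here the store supplies the CastContextHint, so a target that folds the
// truncate into a (possibly masked or reversed) narrowing store can report
// a cheaper cast cost.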
6619 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6620 if (I->hasOneUse()) 6621 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6622 CCH = ComputeCCH(Store); 6623 } 6624 // For Z/Sext, the context is the operand, which must be a LoadInst. 6625 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6626 Opcode == Instruction::FPExt) { 6627 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6628 CCH = ComputeCCH(Load); 6629 } 6630 6631 // We optimize the truncation of induction variables having constant 6632 // integer steps. The cost of these truncations is the same as the scalar 6633 // operation. 6634 if (isOptimizableIVTruncate(I, VF)) { 6635 auto *Trunc = cast<TruncInst>(I); 6636 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6637 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6638 } 6639 6640 // Detect reduction patterns 6641 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 6642 return *RedCost; 6643 6644 Type *SrcScalarTy = I->getOperand(0)->getType(); 6645 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6646 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6647 SrcScalarTy = 6648 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]); 6649 Type *SrcVecTy = 6650 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6651 6652 if (canTruncateToMinimalBitwidth(I, VF)) { 6653 // If the result type is <= the source type, there will be no extend 6654 // after truncating the users to the minimal required bitwidth. 6655 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() && 6656 (I->getOpcode() == Instruction::ZExt || 6657 I->getOpcode() == Instruction::SExt)) 6658 return 0; 6659 } 6660 6661 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6662 } 6663 case Instruction::Call: 6664 return getVectorCallCost(cast<CallInst>(I), VF); 6665 case Instruction::ExtractValue: 6666 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 6667 case Instruction::Alloca: 6668 // We cannot easily widen alloca to a scalable alloca, as 6669 // the result would need to be a vector of pointers. 6670 if (VF.isScalable()) 6671 return InstructionCost::getInvalid(); 6672 [[fallthrough]]; 6673 default: 6674 // This opcode is unknown. Assume that it is the same as 'mul'. 6675 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6676 } // end of switch. 6677 } 6678 6679 void LoopVectorizationCostModel::collectValuesToIgnore() { 6680 // Ignore ephemeral values. 6681 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6682 6683 SmallVector<Value *, 4> DeadInterleavePointerOps; 6684 for (BasicBlock *BB : TheLoop->blocks()) 6685 for (Instruction &I : *BB) { 6686 // Find all stores to invariant variables. Since they are going to sink 6687 // outside the loop we do not need calculate cost for them. 6688 StoreInst *SI; 6689 if ((SI = dyn_cast<StoreInst>(&I)) && 6690 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 6691 ValuesToIgnore.insert(&I); 6692 6693 // For interleave groups, we only create a pointer for the start of the 6694 // interleave group. Queue up addresses of group members except the insert 6695 // position for further processing. 
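// For example (sketch): in a factor-2 group accessing A[2*i] and A[2*i+1],
// only the insert position's address computation is kept; the other member's
// GEP, and any instructions feeding only such dead addresses, may end up in
// VecValuesToIgnore via the worklist that follows.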
6696 if (isAccessInterleaved(&I)) { 6697 auto *Group = getInterleavedAccessGroup(&I); 6698 if (Group->getInsertPos() == &I) 6699 continue; 6700 Value *PointerOp = getLoadStorePointerOperand(&I); 6701 DeadInterleavePointerOps.push_back(PointerOp); 6702 } 6703 } 6704 6705 // Mark ops feeding interleave group members as free, if they are only used 6706 // by other dead computations. 6707 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { 6708 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]); 6709 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) { 6710 Instruction *UI = cast<Instruction>(U); 6711 return !VecValuesToIgnore.contains(U) && 6712 (!isAccessInterleaved(UI) || 6713 getInterleavedAccessGroup(UI)->getInsertPos() == UI); 6714 })) 6715 continue; 6716 VecValuesToIgnore.insert(Op); 6717 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); 6718 } 6719 6720 // Ignore type-promoting instructions we identified during reduction 6721 // detection. 6722 for (const auto &Reduction : Legal->getReductionVars()) { 6723 const RecurrenceDescriptor &RedDes = Reduction.second; 6724 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6725 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6726 } 6727 // Ignore type-casting instructions we identified during induction 6728 // detection. 6729 for (const auto &Induction : Legal->getInductionVars()) { 6730 const InductionDescriptor &IndDes = Induction.second; 6731 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6732 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6733 } 6734 } 6735 6736 void LoopVectorizationCostModel::collectInLoopReductions() { 6737 for (const auto &Reduction : Legal->getReductionVars()) { 6738 PHINode *Phi = Reduction.first; 6739 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6740 6741 // We don't collect reductions that are type promoted (yet). 6742 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6743 continue; 6744 6745 // If the target would prefer this reduction to happen "in-loop", then we 6746 // want to record it as such. 6747 unsigned Opcode = RdxDesc.getOpcode(); 6748 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 6749 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 6750 TargetTransformInfo::ReductionFlags())) 6751 continue; 6752 6753 // Check that we can correctly put the reductions into the loop, by 6754 // finding the chain of operations that leads from the phi to the loop 6755 // exit value. 6756 SmallVector<Instruction *, 4> ReductionOperations = 6757 RdxDesc.getReductionOpChain(Phi, TheLoop); 6758 bool InLoop = !ReductionOperations.empty(); 6759 6760 if (InLoop) { 6761 InLoopReductions.insert(Phi); 6762 // Add the elements to InLoopReductionImmediateChains for cost modelling. 6763 Instruction *LastChain = Phi; 6764 for (auto *I : ReductionOperations) { 6765 InLoopReductionImmediateChains[I] = LastChain; 6766 LastChain = I; 6767 } 6768 } 6769 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? 
"inloop" : "out of loop") 6770 << " reduction for phi: " << *Phi << "\n"); 6771 } 6772 } 6773 6774 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, 6775 DebugLoc DL, const Twine &Name) { 6776 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && 6777 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); 6778 return tryInsertInstruction( 6779 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name)); 6780 } 6781 6782 // This function will select a scalable VF if the target supports scalable 6783 // vectors and a fixed one otherwise. 6784 // TODO: we could return a pair of values that specify the max VF and 6785 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6786 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6787 // doesn't have a cost model that can choose which plan to execute if 6788 // more than one is generated. 6789 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, 6790 LoopVectorizationCostModel &CM) { 6791 unsigned WidestType; 6792 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6793 6794 TargetTransformInfo::RegisterKind RegKind = 6795 TTI.enableScalableVectorization() 6796 ? TargetTransformInfo::RGK_ScalableVector 6797 : TargetTransformInfo::RGK_FixedWidthVector; 6798 6799 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); 6800 unsigned N = RegSize.getKnownMinValue() / WidestType; 6801 return ElementCount::get(N, RegSize.isScalable()); 6802 } 6803 6804 VectorizationFactor 6805 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 6806 ElementCount VF = UserVF; 6807 // Outer loop handling: They may require CFG and instruction level 6808 // transformations before even evaluating whether vectorization is profitable. 6809 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6810 // the vectorization pipeline. 6811 if (!OrigLoop->isInnermost()) { 6812 // If the user doesn't provide a vectorization factor, determine a 6813 // reasonable one. 6814 if (UserVF.isZero()) { 6815 VF = determineVPlanVF(TTI, CM); 6816 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6817 6818 // Make sure we have a VF > 1 for stress testing. 6819 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 6820 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6821 << "overriding computed VF.\n"); 6822 VF = ElementCount::getFixed(4); 6823 } 6824 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && 6825 !ForceTargetSupportsScalableVectors) { 6826 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " 6827 << "not supported by the target.\n"); 6828 reportVectorizationFailure( 6829 "Scalable vectorization requested but not supported by the target", 6830 "the scalable user-specified vectorization width for outer-loop " 6831 "vectorization cannot be used because the target does not support " 6832 "scalable vectors.", 6833 "ScalableVFUnfeasible", ORE, OrigLoop); 6834 return VectorizationFactor::Disabled(); 6835 } 6836 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6837 assert(isPowerOf2_32(VF.getKnownMinValue()) && 6838 "VF needs to be a power of two"); 6839 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 6840 << "VF " << VF << " to build VPlans.\n"); 6841 buildVPlans(VF, VF); 6842 6843 // For VPlan build stress testing, we bail out after VPlan construction. 
6844 if (VPlanBuildStressTest) 6845 return VectorizationFactor::Disabled(); 6846 6847 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 6848 } 6849 6850 LLVM_DEBUG( 6851 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6852 "VPlan-native path.\n"); 6853 return VectorizationFactor::Disabled(); 6854 } 6855 6856 std::optional<VectorizationFactor> 6857 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 6858 assert(OrigLoop->isInnermost() && "Inner loop expected."); 6859 CM.collectValuesToIgnore(); 6860 CM.collectElementTypesForWidening(); 6861 6862 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 6863 if (!MaxFactors) // Cases that should not be vectorized or interleaved. 6864 return std::nullopt; 6865 6866 // Invalidate interleave groups if all blocks of the loop will be predicated. 6867 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 6868 !useMaskedInterleavedAccesses(TTI)) { 6869 LLVM_DEBUG( 6870 dbgs() 6871 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6872 "which requires masked-interleaved support.\n"); 6873 if (CM.InterleaveInfo.invalidateGroups()) 6874 // Invalidating interleave groups also requires invalidating all decisions 6875 // based on them, which includes widening decisions and uniform and scalar 6876 // values. 6877 CM.invalidateCostModelingDecisions(); 6878 } 6879 6880 if (CM.foldTailByMasking()) 6881 Legal->prepareToFoldTailByMasking(); 6882 6883 ElementCount MaxUserVF = 6884 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 6885 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 6886 if (!UserVF.isZero() && UserVFIsLegal) { 6887 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 6888 "VF needs to be a power of two"); 6889 // Collect the instructions (and their associated costs) that will be more 6890 // profitable to scalarize. 6891 CM.collectInLoopReductions(); 6892 if (CM.selectUserVectorizationFactor(UserVF)) { 6893 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6894 buildVPlansWithVPRecipes(UserVF, UserVF); 6895 if (!hasPlanWithVF(UserVF)) { 6896 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF 6897 << ".\n"); 6898 return std::nullopt; 6899 } 6900 6901 LLVM_DEBUG(printPlans(dbgs())); 6902 return {{UserVF, 0, 0}}; 6903 } else 6904 reportVectorizationInfo("UserVF ignored because of invalid costs.", 6905 "InvalidCost", ORE, OrigLoop); 6906 } 6907 6908 // Collect the Vectorization Factor Candidates. 6909 SmallVector<ElementCount> VFCandidates; 6910 for (auto VF = ElementCount::getFixed(1); 6911 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 6912 VFCandidates.push_back(VF); 6913 for (auto VF = ElementCount::getScalable(1); 6914 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 6915 VFCandidates.push_back(VF); 6916 6917 CM.collectInLoopReductions(); 6918 for (const auto &VF : VFCandidates) { 6919 // Collect Uniform and Scalar instructions after vectorization with VF. 6920 CM.collectUniformsAndScalars(VF); 6921 6922 // Collect the instructions (and their associated costs) that will be more 6923 // profitable to scalarize.
6924 if (VF.isVector()) 6925 CM.collectInstsToScalarize(VF); 6926 } 6927 6928 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 6929 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 6930 6931 LLVM_DEBUG(printPlans(dbgs())); 6932 if (VPlans.empty()) 6933 return std::nullopt; 6934 if (all_of(VPlans, 6935 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); })) 6936 return VectorizationFactor::Disabled(); 6937 6938 // Select the optimal vectorization factor according to the legacy cost-model. 6939 // This is now only used to verify the decisions by the new VPlan-based 6940 // cost-model and will be retired once the VPlan-based cost-model is 6941 // stabilized. 6942 VectorizationFactor VF = selectVectorizationFactor(); 6943 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 6944 if (!hasPlanWithVF(VF.Width)) { 6945 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width 6946 << ".\n"); 6947 return std::nullopt; 6948 } 6949 return VF; 6950 } 6951 6952 InstructionCost VPCostContext::getLegacyCost(Instruction *UI, 6953 ElementCount VF) const { 6954 return CM.getInstructionCost(UI, VF); 6955 } 6956 6957 bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { 6958 return CM.ValuesToIgnore.contains(UI) || 6959 (IsVector && CM.VecValuesToIgnore.contains(UI)) || 6960 SkipCostComputation.contains(UI); 6961 } 6962 6963 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, 6964 ElementCount VF) const { 6965 InstructionCost Cost = 0; 6966 LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext(); 6967 VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM); 6968 6969 // Cost modeling for inductions is inaccurate in the legacy cost model 6970 // compared to the recipes that are generated. To match here initially during 6971 // VPlan cost model bring up directly use the induction costs from the legacy 6972 // cost model. Note that we do this as pre-processing; the VPlan may not have 6973 // any recipes associated with the original induction increment instruction 6974 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute 6975 // the cost of induction phis and increments (both that are represented by 6976 // recipes and those that are not), to avoid distinguishing between them here, 6977 // and skip all recipes that represent induction phis and increments (the 6978 // former case) later on, if they exist, to avoid counting them twice. 6979 // Similarly we pre-compute the cost of any optimized truncates. 6980 // TODO: Switch to more accurate costing based on VPlan. 
6981 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) { 6982 Instruction *IVInc = cast<Instruction>( 6983 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 6984 SmallVector<Instruction *> IVInsts = {IV, IVInc}; 6985 for (User *U : IV->users()) { 6986 auto *CI = cast<Instruction>(U); 6987 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF)) 6988 continue; 6989 IVInsts.push_back(CI); 6990 } 6991 for (Instruction *IVInst : IVInsts) { 6992 if (!CostCtx.SkipCostComputation.insert(IVInst).second) 6993 continue; 6994 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF); 6995 LLVM_DEBUG({ 6996 dbgs() << "Cost of " << InductionCost << " for VF " << VF 6997 << ": induction instruction " << *IVInst << "\n"; 6998 }); 6999 Cost += InductionCost; 7000 } 7001 } 7002 7003 /// Compute the cost of all exiting conditions of the loop using the legacy 7004 /// cost model. This is to match the legacy behavior, which adds the cost of 7005 /// all exit conditions. Note that this over-estimates the cost, as there will 7006 /// be a single condition to control the vector loop. 7007 SmallVector<BasicBlock *> Exiting; 7008 CM.TheLoop->getExitingBlocks(Exiting); 7009 SetVector<Instruction *> ExitInstrs; 7010 // Collect all exit conditions. 7011 for (BasicBlock *EB : Exiting) { 7012 auto *Term = dyn_cast<BranchInst>(EB->getTerminator()); 7013 if (!Term) 7014 continue; 7015 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) { 7016 ExitInstrs.insert(CondI); 7017 } 7018 } 7019 // Compute the cost of all instructions only feeding the exit conditions. 7020 for (unsigned I = 0; I != ExitInstrs.size(); ++I) { 7021 Instruction *CondI = ExitInstrs[I]; 7022 if (!OrigLoop->contains(CondI) || 7023 !CostCtx.SkipCostComputation.insert(CondI).second) 7024 continue; 7025 Cost += CostCtx.getLegacyCost(CondI, VF); 7026 for (Value *Op : CondI->operands()) { 7027 auto *OpI = dyn_cast<Instruction>(Op); 7028 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) { 7029 return OrigLoop->contains(cast<Instruction>(U)->getParent()) && 7030 !ExitInstrs.contains(cast<Instruction>(U)); 7031 })) 7032 continue; 7033 ExitInstrs.insert(OpI); 7034 } 7035 } 7036 7037 // The legacy cost model has special logic to compute the cost of in-loop 7038 // reductions, which may be smaller than the sum of all instructions involved 7039 // in the reduction. For AnyOf reductions, VPlan codegen may remove the select 7040 // which the legacy cost model uses to assign cost. Pre-compute their costs 7041 // for now. 7042 // TODO: Switch to costing based on VPlan once the logic has been ported. 7043 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) { 7044 if (!CM.isInLoopReduction(RedPhi) && 7045 !RecurrenceDescriptor::isAnyOfRecurrenceKind( 7046 RdxDesc.getRecurrenceKind())) 7047 continue; 7048 7049 // AnyOf reduction codegen may remove the select. To match the legacy cost 7050 // model, pre-compute the cost for AnyOf reductions here. 
7051 if (RecurrenceDescriptor::isAnyOfRecurrenceKind( 7052 RdxDesc.getRecurrenceKind())) { 7053 auto *Select = cast<SelectInst>(*find_if( 7054 RedPhi->users(), [](User *U) { return isa<SelectInst>(U); })); 7055 assert(!CostCtx.SkipCostComputation.contains(Select) && 7056 "reduction op visited multiple times"); 7057 CostCtx.SkipCostComputation.insert(Select); 7058 auto ReductionCost = CostCtx.getLegacyCost(Select, VF); 7059 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF 7060 << ":\n any-of reduction " << *Select << "\n"); 7061 Cost += ReductionCost; 7062 continue; 7063 } 7064 7065 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop); 7066 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(), 7067 ChainOps.end()); 7068 // Also include the operands of instructions in the chain, as the cost-model 7069 // may mark extends as free. 7070 for (auto *ChainOp : ChainOps) { 7071 for (Value *Op : ChainOp->operands()) { 7072 if (auto *I = dyn_cast<Instruction>(Op)) 7073 ChainOpsAndOperands.insert(I); 7074 } 7075 } 7076 7077 // Pre-compute the cost for I, if it has a reduction pattern cost. 7078 for (Instruction *I : ChainOpsAndOperands) { 7079 auto ReductionCost = CM.getReductionPatternCost( 7080 I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); 7081 if (!ReductionCost) 7082 continue; 7083 7084 assert(!CostCtx.SkipCostComputation.contains(I) && 7085 "reduction op visited multiple times"); 7086 CostCtx.SkipCostComputation.insert(I); 7087 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF 7088 << ":\n in-loop reduction " << *I << "\n"); 7089 Cost += *ReductionCost; 7090 } 7091 } 7092 7093 // Pre-compute the costs for branches except for the backedge, as the number 7094 // of replicate regions in a VPlan may not directly match the number of 7095 // branches, which would lead to different decisions. 7096 // TODO: Compute cost of branches for each replicate region in the VPlan, 7097 // which is more accurate than the legacy cost model. 7098 for (BasicBlock *BB : OrigLoop->blocks()) { 7099 if (BB == OrigLoop->getLoopLatch()) 7100 continue; 7101 CostCtx.SkipCostComputation.insert(BB->getTerminator()); 7102 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF); 7103 Cost += BranchCost; 7104 } 7105 // Now compute and add the VPlan-based cost. 7106 Cost += Plan.cost(VF, CostCtx); 7107 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n"); 7108 return Cost; 7109 } 7110 7111 VPlan &LoopVectorizationPlanner::getBestPlan() const { 7112 // If there is a single VPlan with a single VF, return it directly. 7113 VPlan &FirstPlan = *VPlans[0]; 7114 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1) 7115 return FirstPlan; 7116 7117 VPlan *BestPlan = &FirstPlan; 7118 ElementCount ScalarVF = ElementCount::getFixed(1); 7119 assert(hasPlanWithVF(ScalarVF) && 7120 "More than a single plan/VF w/o any plan having scalar VF"); 7121 7122 // TODO: Compute scalar cost using VPlan-based cost model. 7123 InstructionCost ScalarCost = CM.expectedCost(ScalarVF); 7124 VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost); 7125 7126 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 7127 if (ForceVectorization) { 7128 // Ignore scalar width, because the user explicitly wants vectorization. 7129 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 7130 // evaluation. 
7131 BestFactor.Cost = InstructionCost::getMax(); 7132 } 7133 7134 for (auto &P : VPlans) { 7135 for (ElementCount VF : P->vectorFactors()) { 7136 if (VF.isScalar()) 7137 continue; 7138 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { 7139 LLVM_DEBUG( 7140 dbgs() 7141 << "LV: Not considering vector loop of width " << VF 7142 << " because it will not generate any vector instructions.\n"); 7143 continue; 7144 } 7145 7146 InstructionCost Cost = cost(*P, VF); 7147 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); 7148 if (isMoreProfitable(CurrentFactor, BestFactor)) { 7149 BestFactor = CurrentFactor; 7150 BestPlan = &*P; 7151 } 7152 } 7153 } 7154 BestPlan->setVF(BestFactor.Width); 7155 return *BestPlan; 7156 } 7157 7158 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7159 assert(count_if(VPlans, 7160 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7161 1 && 7162 "Best VF has not a single VPlan."); 7163 7164 for (const VPlanPtr &Plan : VPlans) { 7165 if (Plan->hasVF(VF)) 7166 return *Plan.get(); 7167 } 7168 llvm_unreachable("No plan found!"); 7169 } 7170 7171 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7172 SmallVector<Metadata *, 4> MDs; 7173 // Reserve first location for self reference to the LoopID metadata node. 7174 MDs.push_back(nullptr); 7175 bool IsUnrollMetadata = false; 7176 MDNode *LoopID = L->getLoopID(); 7177 if (LoopID) { 7178 // First find existing loop unrolling disable metadata. 7179 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7180 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7181 if (MD) { 7182 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7183 IsUnrollMetadata = 7184 S && S->getString().starts_with("llvm.loop.unroll.disable"); 7185 } 7186 MDs.push_back(LoopID->getOperand(i)); 7187 } 7188 } 7189 7190 if (!IsUnrollMetadata) { 7191 // Add runtime unroll disable metadata. 7192 LLVMContext &Context = L->getHeader()->getContext(); 7193 SmallVector<Metadata *, 1> DisableOperands; 7194 DisableOperands.push_back( 7195 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7196 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7197 MDs.push_back(DisableNode); 7198 MDNode *NewLoopID = MDNode::get(Context, MDs); 7199 // Set operand 0 to refer to the loop id itself. 7200 NewLoopID->replaceOperandWith(0, NewLoopID); 7201 L->setLoopID(NewLoopID); 7202 } 7203 } 7204 7205 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is 7206 // create a merge phi node for it and add it to \p ReductionResumeValues. 
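// \p VectorizingEpilogue indicates that the epilogue vector loop is being
// vectorized; in that case the merge phi reuses the resume phi that was
// created for the main vector loop so that the incoming values are carried
// over correctly.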
7207 static void createAndCollectMergePhiForReduction( 7208 VPInstruction *RedResult, 7209 DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues, 7210 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, 7211 bool VectorizingEpilogue) { 7212 if (!RedResult || 7213 RedResult->getOpcode() != VPInstruction::ComputeReductionResult) 7214 return; 7215 7216 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0)); 7217 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 7218 7219 Value *FinalValue = 7220 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); 7221 auto *ResumePhi = 7222 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 7223 if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind( 7224 RdxDesc.getRecurrenceKind())) { 7225 auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue()); 7226 assert(Cmp->getPredicate() == CmpInst::ICMP_NE); 7227 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue()); 7228 ResumePhi = cast<PHINode>(Cmp->getOperand(0)); 7229 } 7230 assert((!VectorizingEpilogue || ResumePhi) && 7231 "when vectorizing the epilogue loop, we need a resume phi from main " 7232 "vector loop"); 7233 7234 // TODO: bc.merge.rdx should not be created here, instead it should be 7235 // modeled in VPlan. 7236 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); 7237 // Create a phi node that merges control-flow from the backedge-taken check 7238 // block and the middle block. 7239 auto *BCBlockPhi = 7240 PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", 7241 LoopScalarPreHeader->getTerminator()->getIterator()); 7242 7243 // If we are fixing reductions in the epilogue loop then we should already 7244 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 7245 // we carry over the incoming values correctly. 7246 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 7247 if (Incoming == LoopMiddleBlock) 7248 BCBlockPhi->addIncoming(FinalValue, Incoming); 7249 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) 7250 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 7251 Incoming); 7252 else 7253 BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming); 7254 } 7255 7256 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 7257 // TODO: This fixup should instead be modeled in VPlan. 7258 // Fix the scalar loop reduction variable with the incoming reduction sum 7259 // from the vector body and from the backedge value. 7260 int IncomingEdgeBlockIdx = 7261 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 7262 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 7263 // Pick the other block. 7264 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 7265 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 7266 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 7267 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 7268 7269 ReductionResumeValues[&RdxDesc] = BCBlockPhi; 7270 } 7271 7272 std::pair<DenseMap<const SCEV *, Value *>, 7273 DenseMap<const RecurrenceDescriptor *, Value *>> 7274 LoopVectorizationPlanner::executePlan( 7275 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7276 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, 7277 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7278 assert(BestVPlan.hasVF(BestVF) && 7279 "Trying to execute plan with unsupported VF"); 7280 assert(BestVPlan.hasUF(BestUF) && 7281 "Trying to execute plan with unsupported UF"); 7282 assert( 7283 (IsEpilogueVectorization || !ExpandedSCEVs) && 7284 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7285 (void)IsEpilogueVectorization; 7286 7287 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7288 7289 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF 7290 << ", UF=" << BestUF << '\n'); 7291 BestVPlan.setName("Final VPlan"); 7292 LLVM_DEBUG(BestVPlan.dump()); 7293 7294 // Perform the actual loop transformation. 7295 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, 7296 OrigLoop->getHeader()->getContext()); 7297 7298 // 0. Generate SCEV-dependent code into the preheader, including TripCount, 7299 // before making any changes to the CFG. 7300 if (!BestVPlan.getPreheader()->empty()) { 7301 State.CFG.PrevBB = OrigLoop->getLoopPreheader(); 7302 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); 7303 BestVPlan.getPreheader()->execute(&State); 7304 } 7305 if (!ILV.getTripCount()) 7306 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); 7307 else 7308 assert(IsEpilogueVectorization && "should only re-use the existing trip " 7309 "count during epilogue vectorization"); 7310 7311 // 1. Set up the skeleton for vectorization, including vector pre-header and 7312 // middle block. The vector loop is created during VPlan execution. 7313 Value *CanonicalIVStartValue; 7314 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7315 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs 7316 : State.ExpandedSCEVs); 7317 #ifdef EXPENSIVE_CHECKS 7318 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 7319 #endif 7320 7321 // Only use noalias metadata when using memory checks guaranteeing no overlap 7322 // across all iterations. 7323 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7324 std::unique_ptr<LoopVersioning> LVer = nullptr; 7325 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7326 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7327 7328 // We currently don't use LoopVersioning for the actual loop cloning but we 7329 // still use it to add the noalias metadata. 7330 // TODO: Find a better way to re-use LoopVersioning functionality to add 7331 // metadata. 7332 LVer = std::make_unique<LoopVersioning>( 7333 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7334 PSE.getSE()); 7335 State.LVer = &*LVer; 7336 State.LVer->prepareNoAliasMetadata(); 7337 } 7338 7339 ILV.printDebugTracesAtStart(); 7340 7341 //===------------------------------------------------===// 7342 // 7343 // Notice: any optimization or new instruction that goes 7344 // into the code below should also be implemented in 7345 // the cost-model.
7346 // 7347 //===------------------------------------------------===// 7348 7349 // 2. Copy and widen instructions from the old loop into the new loop. 7350 BestVPlan.prepareToExecute(ILV.getTripCount(), 7351 ILV.getOrCreateVectorTripCount(nullptr), 7352 CanonicalIVStartValue, State); 7353 7354 BestVPlan.execute(&State); 7355 7356 // 2.5 Collect reduction resume values. 7357 DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues; 7358 auto *ExitVPBB = 7359 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); 7360 for (VPRecipeBase &R : *ExitVPBB) { 7361 createAndCollectMergePhiForReduction( 7362 dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop, 7363 State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); 7364 } 7365 7366 // 2.6. Maintain Loop Hints 7367 // Keep all loop hints from the original loop on the vector loop (we'll 7368 // replace the vectorizer-specific hints below). 7369 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7370 7371 std::optional<MDNode *> VectorizedLoopID = 7372 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7373 LLVMLoopVectorizeFollowupVectorized}); 7374 7375 VPBasicBlock *HeaderVPBB = 7376 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7377 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7378 if (VectorizedLoopID) 7379 L->setLoopID(*VectorizedLoopID); 7380 else { 7381 // Keep all loop hints from the original loop on the vector loop (we'll 7382 // replace the vectorizer-specific hints below). 7383 if (MDNode *LID = OrigLoop->getLoopID()) 7384 L->setLoopID(LID); 7385 7386 LoopVectorizeHints Hints(L, true, *ORE); 7387 Hints.setAlreadyVectorized(); 7388 } 7389 TargetTransformInfo::UnrollingPreferences UP; 7390 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7391 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) 7392 AddRuntimeUnrollDisableMetaData(L); 7393 7394 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7395 // predication, updating analyses. 7396 ILV.fixVectorizedLoop(State, BestVPlan); 7397 7398 ILV.printDebugTracesAtEnd(); 7399 7400 // 4. Adjust branch weight of the branch in the middle block. 7401 auto *MiddleTerm = 7402 cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); 7403 if (MiddleTerm->isConditional() && 7404 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 7405 // Assume that `Count % VectorTripCount` is equally distributed. 7406 unsigned TripCount = State.UF * State.VF.getKnownMinValue(); 7407 assert(TripCount > 0 && "trip count should not be zero"); 7408 const uint32_t Weights[] = {1, TripCount - 1}; 7409 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); 7410 } 7411 7412 return {State.ExpandedSCEVs, ReductionResumeValues}; 7413 } 7414 7415 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7416 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7417 for (const auto &Plan : VPlans) 7418 if (PrintVPlansInDotFormat) 7419 Plan->printDOT(O); 7420 else 7421 Plan->print(O); 7422 } 7423 #endif 7424 7425 //===--------------------------------------------------------------------===// 7426 // EpilogueVectorizerMainLoop 7427 //===--------------------------------------------------------------------===// 7428 7429 /// This function is partially responsible for generating the control flow 7430 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
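/// For the main vector loop (first pass) this emits the iteration count check
/// for the vector epilogue, the SCEV and memory runtime checks, and the
/// minimum iteration count check for the main vector loop. Induction resume
/// values are intentionally not created here, as noted below.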
7431 std::pair<BasicBlock *, Value *> 7432 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7433 const SCEV2ValueTy &ExpandedSCEVs) { 7434 createVectorLoopSkeleton(""); 7435 7436 // Generate the code to check the minimum iteration count of the vector 7437 // epilogue (see below). 7438 EPI.EpilogueIterationCountCheck = 7439 emitIterationCountCheck(LoopScalarPreHeader, true); 7440 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7441 7442 // Generate the code to check any assumptions that we've made for SCEV 7443 // expressions. 7444 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7445 7446 // Generate the code that checks at runtime if arrays overlap. We put the 7447 // checks into a separate block to make the more common case of few elements 7448 // faster. 7449 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7450 7451 // Generate the iteration count check for the main loop, *after* the check 7452 // for the epilogue loop, so that the path-length is shorter for the case 7453 // that goes directly through the vector epilogue. The longer-path length for 7454 // the main loop is compensated for, by the gain from vectorizing the larger 7455 // trip count. Note: the branch will get updated later on when we vectorize 7456 // the epilogue. 7457 EPI.MainLoopIterationCountCheck = 7458 emitIterationCountCheck(LoopScalarPreHeader, false); 7459 7460 // Generate the induction variable. 7461 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7462 7463 // Skip induction resume value creation here because they will be created in 7464 // the second pass for the scalar loop. The induction resume values for the 7465 // inductions in the epilogue loop are created before executing the plan for 7466 // the epilogue loop. 7467 7468 return {LoopVectorPreHeader, nullptr}; 7469 } 7470 7471 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7472 LLVM_DEBUG({ 7473 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7474 << "Main Loop VF:" << EPI.MainLoopVF 7475 << ", Main Loop UF:" << EPI.MainLoopUF 7476 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7477 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7478 }); 7479 } 7480 7481 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7482 DEBUG_WITH_TYPE(VerboseDebug, { 7483 dbgs() << "intermediate fn:\n" 7484 << *OrigLoop->getHeader()->getParent() << "\n"; 7485 }); 7486 } 7487 7488 BasicBlock * 7489 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7490 bool ForEpilogue) { 7491 assert(Bypass && "Expected valid bypass basic block."); 7492 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7493 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7494 Value *Count = getTripCount(); 7495 // Reuse existing vector loop preheader for TC checks. 7496 // Note that new preheader block is generated for vector loop. 7497 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7498 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7499 7500 // Generate code to check if the loop's trip count is less than VF * UF of the 7501 // main vector loop. 7502 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7503 : VF.isVector()) 7504 ? 
ICmpInst::ICMP_ULE 7505 : ICmpInst::ICMP_ULT; 7506 7507 Value *CheckMinIters = Builder.CreateICmp( 7508 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7509 "min.iters.check"); 7510 7511 if (!ForEpilogue) 7512 TCCheckBlock->setName("vector.main.loop.iter.check"); 7513 7514 // Create new preheader for vector loop. 7515 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7516 DT, LI, nullptr, "vector.ph"); 7517 7518 if (ForEpilogue) { 7519 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7520 DT->getNode(Bypass)->getIDom()) && 7521 "TC check is expected to dominate Bypass"); 7522 7523 // Update dominator for Bypass. 7524 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7525 LoopBypassBlocks.push_back(TCCheckBlock); 7526 7527 // Save the trip count so we don't have to regenerate it in the 7528 // vec.epilog.iter.check. This is safe to do because the trip count 7529 // generated here dominates the vector epilog iter check. 7530 EPI.TripCount = Count; 7531 } 7532 7533 BranchInst &BI = 7534 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7535 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 7536 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); 7537 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 7538 7539 return TCCheckBlock; 7540 } 7541 7542 //===--------------------------------------------------------------------===// 7543 // EpilogueVectorizerEpilogueLoop 7544 //===--------------------------------------------------------------------===// 7545 7546 /// This function is partially responsible for generating the control flow 7547 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7548 std::pair<BasicBlock *, Value *> 7549 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( 7550 const SCEV2ValueTy &ExpandedSCEVs) { 7551 createVectorLoopSkeleton("vec.epilog."); 7552 7553 // Now, compare the remaining count and if there aren't enough iterations to 7554 // execute the vectorized epilogue skip to the scalar part. 7555 LoopVectorPreHeader->setName("vec.epilog.ph"); 7556 BasicBlock *VecEpilogueIterationCountCheck = 7557 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, 7558 nullptr, "vec.epilog.iter.check", true); 7559 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7560 VecEpilogueIterationCountCheck); 7561 7562 // Adjust the control flow taking the state info from the main loop 7563 // vectorization into account. 
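  // The main loop's iteration count check is rewired to branch to the
  // epilogue's vector preheader, while the epilogue iteration count check and
  // the SCEV/memory safety checks are rewired to bypass directly to the
  // scalar preheader.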
7564 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7565 "expected this to be saved from the previous pass."); 7566 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7567 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7568 7569 DT->changeImmediateDominator(LoopVectorPreHeader, 7570 EPI.MainLoopIterationCountCheck); 7571 7572 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7573 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7574 7575 if (EPI.SCEVSafetyCheck) 7576 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7577 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7578 if (EPI.MemSafetyCheck) 7579 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7580 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7581 7582 DT->changeImmediateDominator( 7583 VecEpilogueIterationCountCheck, 7584 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7585 7586 DT->changeImmediateDominator(LoopScalarPreHeader, 7587 EPI.EpilogueIterationCountCheck); 7588 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7589 // If there is an epilogue which must run, there's no edge from the 7590 // middle block to exit blocks and thus no need to update the immediate 7591 // dominator of the exit blocks. 7592 DT->changeImmediateDominator(LoopExitBlock, 7593 EPI.EpilogueIterationCountCheck); 7594 7595 // Keep track of bypass blocks, as they feed start values to the induction and 7596 // reduction phis in the scalar loop preheader. 7597 if (EPI.SCEVSafetyCheck) 7598 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7599 if (EPI.MemSafetyCheck) 7600 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7601 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7602 7603 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 7604 // reductions which merge control-flow from the latch block and the middle 7605 // block. Update the incoming values here and move the Phi into the preheader. 7606 SmallVector<PHINode *, 4> PhisInBlock; 7607 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7608 PhisInBlock.push_back(&Phi); 7609 7610 for (PHINode *Phi : PhisInBlock) { 7611 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7612 Phi->replaceIncomingBlockWith( 7613 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7614 VecEpilogueIterationCountCheck); 7615 7616 // If the phi doesn't have an incoming value from the 7617 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 7618 // value and also those from other check blocks. This is needed for 7619 // reduction phis only. 
7620 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { 7621 return EPI.EpilogueIterationCountCheck == IncB; 7622 })) 7623 continue; 7624 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7625 if (EPI.SCEVSafetyCheck) 7626 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7627 if (EPI.MemSafetyCheck) 7628 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7629 } 7630 7631 // Generate a resume induction for the vector epilogue and put it in the 7632 // vector epilogue preheader. 7633 Type *IdxTy = Legal->getWidestInductionType(); 7634 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val"); 7635 EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt()); 7636 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7637 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7638 EPI.MainLoopIterationCountCheck); 7639 7640 // Generate induction resume values. These variables save the new starting 7641 // indexes for the scalar loop. They are used to test if there are any tail 7642 // iterations left once the vector loop has completed. 7643 // Note that when the vectorized epilogue is skipped due to iteration count 7644 // check, then the resume value for the induction variable comes from 7645 // the trip count of the main vector loop, hence passing the AdditionalBypass 7646 // argument. 7647 createInductionResumeValues(ExpandedSCEVs, 7648 {VecEpilogueIterationCountCheck, 7649 EPI.VectorTripCount} /* AdditionalBypass */); 7650 7651 return {LoopVectorPreHeader, EPResumeVal}; 7652 } 7653 7654 BasicBlock * 7655 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7656 BasicBlock *Bypass, BasicBlock *Insert) { 7657 7658 assert(EPI.TripCount && 7659 "Expected trip count to have been saved in the first pass."); 7660 assert( 7661 (!isa<Instruction>(EPI.TripCount) || 7662 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7663 "saved trip count does not dominate insertion point."); 7664 Value *TC = EPI.TripCount; 7665 IRBuilder<> Builder(Insert->getTerminator()); 7666 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7667 7668 // Generate code to check if the loop's trip count is less than VF * UF of the 7669 // vector epilogue loop. 7670 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) 7671 ?
ICmpInst::ICMP_ULE 7672 : ICmpInst::ICMP_ULT; 7673 7674 Value *CheckMinIters = 7675 Builder.CreateICmp(P, Count, 7676 createStepForVF(Builder, Count->getType(), 7677 EPI.EpilogueVF, EPI.EpilogueUF), 7678 "min.epilog.iters.check"); 7679 7680 BranchInst &BI = 7681 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7682 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { 7683 unsigned MainLoopStep = UF * VF.getKnownMinValue(); 7684 unsigned EpilogueLoopStep = 7685 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); 7686 // We assume the remaining `Count` is equally distributed in 7687 // [0, MainLoopStep) 7688 // So the probability for `Count < EpilogueLoopStep` should be 7689 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep 7690 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); 7691 const uint32_t Weights[] = {EstimatedSkipCount, 7692 MainLoopStep - EstimatedSkipCount}; 7693 setBranchWeights(BI, Weights, /*IsExpected=*/false); 7694 } 7695 ReplaceInstWithInst(Insert->getTerminator(), &BI); 7696 LoopBypassBlocks.push_back(Insert); 7697 return Insert; 7698 } 7699 7700 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7701 LLVM_DEBUG({ 7702 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7703 << "Epilogue Loop VF:" << EPI.EpilogueVF 7704 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7705 }); 7706 } 7707 7708 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7709 DEBUG_WITH_TYPE(VerboseDebug, { 7710 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7711 }); 7712 } 7713 7714 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7715 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7716 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7717 bool PredicateAtRangeStart = Predicate(Range.Start); 7718 7719 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) 7720 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7721 Range.End = TmpVF; 7722 break; 7723 } 7724 7725 return PredicateAtRangeStart; 7726 } 7727 7728 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7729 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7730 /// of VF's starting at a given VF and extending it as much as possible. Each 7731 /// vectorization decision can potentially shorten this sub-range during 7732 /// buildVPlan(). 7733 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7734 ElementCount MaxVF) { 7735 auto MaxVFTimes2 = MaxVF * 2; 7736 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 7737 VFRange SubRange = {VF, MaxVFTimes2}; 7738 VPlans.push_back(buildVPlan(SubRange)); 7739 VF = SubRange.End; 7740 } 7741 } 7742 7743 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>> 7744 VPRecipeBuilder::mapToVPValues(User::op_range Operands) { 7745 std::function<VPValue *(Value *)> Fn = [this](Value *Op) { 7746 if (auto *I = dyn_cast<Instruction>(Op)) { 7747 if (auto *R = Ingredient2Recipe.lookup(I)) 7748 return R->getVPSingleValue(); 7749 } 7750 return Plan.getOrAddLiveIn(Op); 7751 }; 7752 return map_range(Operands, Fn); 7753 } 7754 7755 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 7756 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7757 7758 // Look for cached value. 
7759 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7760 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7761 if (ECEntryIt != EdgeMaskCache.end()) 7762 return ECEntryIt->second; 7763 7764 VPValue *SrcMask = getBlockInMask(Src); 7765 7766 // The terminator has to be a branch inst! 7767 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7768 assert(BI && "Unexpected terminator found"); 7769 7770 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7771 return EdgeMaskCache[Edge] = SrcMask; 7772 7773 // If source is an exiting block, we know the exit edge is dynamically dead 7774 // in the vector loop, and thus we don't need to restrict the mask. Avoid 7775 // adding uses of an otherwise potentially dead instruction. 7776 if (OrigLoop->isLoopExiting(Src)) 7777 return EdgeMaskCache[Edge] = SrcMask; 7778 7779 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan); 7780 assert(EdgeMask && "No Edge Mask found for condition"); 7781 7782 if (BI->getSuccessor(0) != Dst) 7783 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 7784 7785 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 7786 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask 7787 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' 7788 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. 7789 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc()); 7790 } 7791 7792 return EdgeMaskCache[Edge] = EdgeMask; 7793 } 7794 7795 VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { 7796 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7797 7798 // Look for cached value. 7799 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7800 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge); 7801 assert(ECEntryIt != EdgeMaskCache.end() && 7802 "looking up mask for edge which has not been created"); 7803 return ECEntryIt->second; 7804 } 7805 7806 void VPRecipeBuilder::createHeaderMask() { 7807 BasicBlock *Header = OrigLoop->getHeader(); 7808 7809 // When not folding the tail, use nullptr to model all-true mask. 7810 if (!CM.foldTailByMasking()) { 7811 BlockMaskCache[Header] = nullptr; 7812 return; 7813 } 7814 7815 // Introduce the early-exit compare IV <= BTC to form header block mask. 7816 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 7817 // constructing the desired canonical IV in the header block as its first 7818 // non-phi instructions. 7819 7820 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); 7821 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 7822 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); 7823 HeaderVPBB->insert(IV, NewInsertionPoint); 7824 7825 VPBuilder::InsertPointGuard Guard(Builder); 7826 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 7827 VPValue *BlockMask = nullptr; 7828 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 7829 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); 7830 BlockMaskCache[Header] = BlockMask; 7831 } 7832 7833 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { 7834 // Return the cached value. 
7835 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB); 7836 assert(BCEntryIt != BlockMaskCache.end() && 7837 "Trying to access mask for block without one."); 7838 return BCEntryIt->second; 7839 } 7840 7841 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { 7842 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7843 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); 7844 assert(OrigLoop->getHeader() != BB && 7845 "Loop header must have cached block mask"); 7846 7847 // All-one mask is modelled as no-mask following the convention for masked 7848 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7849 VPValue *BlockMask = nullptr; 7850 // This is the block mask. We OR all incoming edges. 7851 for (auto *Predecessor : predecessors(BB)) { 7852 VPValue *EdgeMask = createEdgeMask(Predecessor, BB); 7853 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. 7854 BlockMaskCache[BB] = EdgeMask; 7855 return; 7856 } 7857 7858 if (!BlockMask) { // BlockMask has its initialized nullptr value. 7859 BlockMask = EdgeMask; 7860 continue; 7861 } 7862 7863 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 7864 } 7865 7866 BlockMaskCache[BB] = BlockMask; 7867 } 7868 7869 VPWidenMemoryRecipe * 7870 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, 7871 VFRange &Range) { 7872 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7873 "Must be called with either a load or store"); 7874 7875 auto willWiden = [&](ElementCount VF) -> bool { 7876 LoopVectorizationCostModel::InstWidening Decision = 7877 CM.getWideningDecision(I, VF); 7878 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7879 "CM decision should be taken at this point."); 7880 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7881 return true; 7882 if (CM.isScalarAfterVectorization(I, VF) || 7883 CM.isProfitableToScalarize(I, VF)) 7884 return false; 7885 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7886 }; 7887 7888 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7889 return nullptr; 7890 7891 VPValue *Mask = nullptr; 7892 if (Legal->isMaskRequired(I)) 7893 Mask = getBlockInMask(I->getParent()); 7894 7895 // Determine if the pointer operand of the access is either consecutive or 7896 // reverse consecutive. 7897 LoopVectorizationCostModel::InstWidening Decision = 7898 CM.getWideningDecision(I, Range.Start); 7899 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 7900 bool Consecutive = 7901 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 7902 7903 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; 7904 if (Consecutive) { 7905 auto *GEP = dyn_cast<GetElementPtrInst>( 7906 Ptr->getUnderlyingValue()->stripPointerCasts()); 7907 auto *VectorPtr = new VPVectorPointerRecipe( 7908 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false, 7909 I->getDebugLoc()); 7910 Builder.getInsertBlock()->appendRecipe(VectorPtr); 7911 Ptr = VectorPtr; 7912 } 7913 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7914 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, 7915 I->getDebugLoc()); 7916 7917 StoreInst *Store = cast<StoreInst>(I); 7918 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, 7919 Reverse, I->getDebugLoc()); 7920 } 7921 7922 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also 7923 /// insert a recipe to expand the step for the induction recipe.
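/// \p PhiOrTrunc is either \p Phi itself or a truncate of it; in the truncate
/// case the truncate instruction is passed on to the created recipe.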
7924 static VPWidenIntOrFpInductionRecipe * 7925 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, 7926 VPValue *Start, const InductionDescriptor &IndDesc, 7927 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) { 7928 assert(IndDesc.getStartValue() == 7929 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 7930 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 7931 "step must be loop invariant"); 7932 7933 VPValue *Step = 7934 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 7935 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 7936 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); 7937 } 7938 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 7939 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); 7940 } 7941 7942 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( 7943 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) { 7944 7945 // Check if this is an integer or fp induction. If so, build the recipe that 7946 // produces its scalar and vector values. 7947 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 7948 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, 7949 *PSE.getSE(), *OrigLoop); 7950 7951 // Check if this is pointer induction. If so, build the recipe for it. 7952 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 7953 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 7954 *PSE.getSE()); 7955 return new VPWidenPointerInductionRecipe( 7956 Phi, Operands[0], Step, *II, 7957 LoopVectorizationPlanner::getDecisionAndClampRange( 7958 [&](ElementCount VF) { 7959 return CM.isScalarAfterVectorization(Phi, VF); 7960 }, 7961 Range)); 7962 } 7963 return nullptr; 7964 } 7965 7966 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 7967 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { 7968 // Optimize the special case where the source is a constant integer 7969 // induction variable. Notice that we can only optimize the 'trunc' case 7970 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7971 // (c) other casts depend on pointer size. 7972 7973 // Determine whether \p K is a truncation based on an induction variable that 7974 // can be optimized. 7975 auto isOptimizableIVTruncate = 7976 [&](Instruction *K) -> std::function<bool(ElementCount)> { 7977 return [=](ElementCount VF) -> bool { 7978 return CM.isOptimizableIVTruncate(K, VF); 7979 }; 7980 }; 7981 7982 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7983 isOptimizableIVTruncate(I), Range)) { 7984 7985 auto *Phi = cast<PHINode>(I->getOperand(0)); 7986 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 7987 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue()); 7988 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), 7989 *OrigLoop); 7990 } 7991 return nullptr; 7992 } 7993 7994 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, 7995 ArrayRef<VPValue *> Operands) { 7996 unsigned NumIncoming = Phi->getNumIncomingValues(); 7997 7998 // We know that all PHIs in non-header blocks are converted into selects, so 7999 // we don't have to worry about the insertion order and we can just use the 8000 // builder. At this point we generate the predication tree. There may be 8001 // duplications since this is a simple recursive scan, but future 8002 // optimizations will clean it up. 
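  // The blend recipe's operands are laid out as (incoming value, edge mask)
  // pairs, with the mask of one incoming value left out.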
8003 // TODO: At the moment the first mask is always skipped, but it would be 8004 // better to skip the most expensive mask. 8005 SmallVector<VPValue *, 2> OperandsWithMask; 8006 8007 for (unsigned In = 0; In < NumIncoming; In++) { 8008 OperandsWithMask.push_back(Operands[In]); 8009 VPValue *EdgeMask = 8010 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent()); 8011 if (!EdgeMask) { 8012 assert(In == 0 && "Both null and non-null edge masks found"); 8013 assert(all_equal(Operands) && 8014 "Distinct incoming values with one having a full mask"); 8015 break; 8016 } 8017 if (In == 0) 8018 continue; 8019 OperandsWithMask.push_back(EdgeMask); 8020 } 8021 return new VPBlendRecipe(Phi, OperandsWithMask); 8022 } 8023 8024 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8025 ArrayRef<VPValue *> Operands, 8026 VFRange &Range) { 8027 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8028 [this, CI](ElementCount VF) { 8029 return CM.isScalarWithPredication(CI, VF); 8030 }, 8031 Range); 8032 8033 if (IsPredicated) 8034 return nullptr; 8035 8036 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8037 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8038 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8039 ID == Intrinsic::pseudoprobe || 8040 ID == Intrinsic::experimental_noalias_scope_decl)) 8041 return nullptr; 8042 8043 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); 8044 Ops.push_back(Operands.back()); 8045 8046 // Is it beneficial to perform an intrinsic call compared to a lib call? 8047 bool ShouldUseVectorIntrinsic = 8048 ID && LoopVectorizationPlanner::getDecisionAndClampRange( 8049 [&](ElementCount VF) -> bool { 8050 return CM.getCallWideningDecision(CI, VF).Kind == 8051 LoopVectorizationCostModel::CM_IntrinsicCall; 8052 }, 8053 Range); 8054 if (ShouldUseVectorIntrinsic) 8055 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID, 8056 CI->getDebugLoc()); 8057 8058 Function *Variant = nullptr; 8059 std::optional<unsigned> MaskPos; 8060 // Is it better to call a vectorized version of the function than to scalarize 8061 // the call? 8062 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( 8063 [&](ElementCount VF) -> bool { 8064 // The following case may be scalarized depending on the VF. 8065 // The flag shows whether we can use a usual Call for a vectorized 8066 // version of the instruction. 8067 8068 // If we've found a variant at a previous VF, then stop looking. A 8069 // vectorized variant of a function expects input in a certain shape 8070 // -- basically the number of input registers, the number of lanes 8071 // per register, and whether there's a mask required. 8072 // We store a pointer to the variant in the VPWidenCallRecipe, so 8073 // once we have an appropriate variant it's only valid for that VF. 8074 // This will force a different vplan to be generated for each VF that 8075 // finds a valid variant.
8076 if (Variant) 8077 return false; 8078 LoopVectorizationCostModel::CallWideningDecision Decision = 8079 CM.getCallWideningDecision(CI, VF); 8080 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) { 8081 Variant = Decision.Variant; 8082 MaskPos = Decision.MaskPos; 8083 return true; 8084 } 8085 8086 return false; 8087 }, 8088 Range); 8089 if (ShouldUseVectorCall) { 8090 if (MaskPos.has_value()) { 8091 // We have 2 cases that would require a mask: 8092 // 1) The block needs to be predicated, either due to a conditional 8093 // in the scalar loop or use of an active lane mask with 8094 // tail-folding, and we use the appropriate mask for the block. 8095 // 2) No mask is required for the block, but the only available 8096 // vector variant at this VF requires a mask, so we synthesize an 8097 // all-true mask. 8098 VPValue *Mask = nullptr; 8099 if (Legal->isMaskRequired(CI)) 8100 Mask = getBlockInMask(CI->getParent()); 8101 else 8102 Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue( 8103 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); 8104 8105 Ops.insert(Ops.begin() + *MaskPos, Mask); 8106 } 8107 8108 return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), 8109 Intrinsic::not_intrinsic, CI->getDebugLoc(), 8110 Variant); 8111 } 8112 8113 return nullptr; 8114 } 8115 8116 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8117 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8118 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8119 // Instruction should be widened, unless it is scalar after vectorization, 8120 // scalarization is profitable or it is predicated. 8121 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8122 return CM.isScalarAfterVectorization(I, VF) || 8123 CM.isProfitableToScalarize(I, VF) || 8124 CM.isScalarWithPredication(I, VF); 8125 }; 8126 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8127 Range); 8128 } 8129 8130 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8131 ArrayRef<VPValue *> Operands, 8132 VPBasicBlock *VPBB) { 8133 switch (I->getOpcode()) { 8134 default: 8135 return nullptr; 8136 case Instruction::SDiv: 8137 case Instruction::UDiv: 8138 case Instruction::SRem: 8139 case Instruction::URem: { 8140 // If not provably safe, use a select to form a safe divisor before widening the 8141 // div/rem operation itself. Otherwise fall through to general handling below. 
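    // For example, a predicated udiv of the form
    //   %r = udiv i32 %a, %b        ; executed under mask %m
    // is conceptually turned into
    //   %safe = select i1 %m, i32 %b, i32 1
    //   %r = udiv i32 %a, %safe
    // so that masked-off lanes divide by 1 instead of a potentially trapping
    // divisor (the i32 types here are only illustrative).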
8142 if (CM.isPredicatedInst(I)) { 8143 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); 8144 VPValue *Mask = getBlockInMask(I->getParent()); 8145 VPValue *One = 8146 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); 8147 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); 8148 Ops[1] = SafeRHS; 8149 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); 8150 } 8151 [[fallthrough]]; 8152 } 8153 case Instruction::Add: 8154 case Instruction::And: 8155 case Instruction::AShr: 8156 case Instruction::FAdd: 8157 case Instruction::FCmp: 8158 case Instruction::FDiv: 8159 case Instruction::FMul: 8160 case Instruction::FNeg: 8161 case Instruction::FRem: 8162 case Instruction::FSub: 8163 case Instruction::ICmp: 8164 case Instruction::LShr: 8165 case Instruction::Mul: 8166 case Instruction::Or: 8167 case Instruction::Select: 8168 case Instruction::Shl: 8169 case Instruction::Sub: 8170 case Instruction::Xor: 8171 case Instruction::Freeze: 8172 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8173 }; 8174 } 8175 8176 void VPRecipeBuilder::fixHeaderPhis() { 8177 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8178 for (VPHeaderPHIRecipe *R : PhisToFix) { 8179 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8180 VPRecipeBase *IncR = 8181 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8182 R->addOperand(IncR->getVPSingleValue()); 8183 } 8184 } 8185 8186 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, 8187 VFRange &Range) { 8188 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8189 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8190 Range); 8191 8192 bool IsPredicated = CM.isPredicatedInst(I); 8193 8194 // Even if the instruction is not marked as uniform, there are certain 8195 // intrinsic calls that can be effectively treated as such, so we check for 8196 // them here. Conservatively, we only do this for scalable vectors, since 8197 // for fixed-width VFs we can always fall back on full scalarization. 8198 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8199 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8200 case Intrinsic::assume: 8201 case Intrinsic::lifetime_start: 8202 case Intrinsic::lifetime_end: 8203 // For scalable vectors if one of the operands is variant then we still 8204 // want to mark it as uniform, which will generate one instruction for just 8205 // the first lane of the vector. We can't scalarize the call in the same 8206 // way as for fixed-width vectors because we don't know how many lanes 8207 // there are. 8208 // 8209 // The reasons for doing it this way for scalable vectors are: 8210 // 1. For the assume intrinsic generating the instruction for the first 8211 // lane is still better than not generating any at all. For 8212 // example, the input may be a splat across all lanes. 8213 // 2. For the lifetime start/end intrinsics the pointer operand only 8214 // does anything useful when the input comes from a stack object, 8215 // which suggests it should always be uniform. For non-stack objects 8216 // the effect is to poison the object, which still allows us to 8217 // remove the call. 8218 IsUniform = true; 8219 break; 8220 default: 8221 break; 8222 } 8223 } 8224 VPValue *BlockInMask = nullptr; 8225 if (!IsPredicated) { 8226 // Finalize the recipe for Instr, first if it is not predicated.
8227 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8228 } else { 8229 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8230 // Instructions marked for predication are replicated and a mask operand is 8231 // added initially. Masked replicate recipes will later be placed under an 8232 // if-then construct to prevent side-effects. Generate recipes to compute 8233 // the block mask for this region. 8234 BlockInMask = getBlockInMask(I->getParent()); 8235 } 8236 8237 // Note that there is some custom logic to mark some intrinsics as uniform 8238 // manually above for scalable vectors, which this assert needs to account for 8239 // as well. 8240 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || 8241 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && 8242 "Should not predicate a uniform recipe"); 8243 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()), 8244 IsUniform, BlockInMask); 8245 return Recipe; 8246 } 8247 8248 VPRecipeBase * 8249 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8250 ArrayRef<VPValue *> Operands, 8251 VFRange &Range, VPBasicBlock *VPBB) { 8252 // First, check for specific widening recipes that deal with inductions, Phi 8253 // nodes, calls and memory operations. 8254 VPRecipeBase *Recipe; 8255 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8256 if (Phi->getParent() != OrigLoop->getHeader()) 8257 return tryToBlend(Phi, Operands); 8258 8259 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) 8260 return Recipe; 8261 8262 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8263 assert((Legal->isReductionVariable(Phi) || 8264 Legal->isFixedOrderRecurrence(Phi)) && 8265 "can only widen reductions and fixed-order recurrences here"); 8266 VPValue *StartV = Operands[0]; 8267 if (Legal->isReductionVariable(Phi)) { 8268 const RecurrenceDescriptor &RdxDesc = 8269 Legal->getReductionVars().find(Phi)->second; 8270 assert(RdxDesc.getRecurrenceStartValue() == 8271 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8272 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8273 CM.isInLoopReduction(Phi), 8274 CM.useOrderedReductions(RdxDesc)); 8275 } else { 8276 // TODO: Currently fixed-order recurrences are modeled as chains of 8277 // first-order recurrences. If there are no users of the intermediate 8278 // recurrences in the chain, the fixed order recurrence should be modeled 8279 // directly, enabling more efficient codegen. 8280 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8281 } 8282 8283 PhisToFix.push_back(PhiRecipe); 8284 return PhiRecipe; 8285 } 8286 8287 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( 8288 cast<TruncInst>(Instr), Operands, Range))) 8289 return Recipe; 8290 8291 // All widen recipes below deal only with VF > 1. 
8292 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8293 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8294 return nullptr; 8295 8296 if (auto *CI = dyn_cast<CallInst>(Instr)) 8297 return tryToWidenCall(CI, Operands, Range); 8298 8299 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8300 return tryToWidenMemory(Instr, Operands, Range); 8301 8302 if (!shouldWiden(Instr, Range)) 8303 return nullptr; 8304 8305 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8306 return new VPWidenGEPRecipe(GEP, 8307 make_range(Operands.begin(), Operands.end())); 8308 8309 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8310 return new VPWidenSelectRecipe( 8311 *SI, make_range(Operands.begin(), Operands.end())); 8312 } 8313 8314 if (auto *CI = dyn_cast<CastInst>(Instr)) { 8315 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), 8316 *CI); 8317 } 8318 8319 return tryToWiden(Instr, Operands, VPBB); 8320 } 8321 8322 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8323 ElementCount MaxVF) { 8324 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8325 8326 auto MaxVFTimes2 = MaxVF * 2; 8327 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8328 VFRange SubRange = {VF, MaxVFTimes2}; 8329 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { 8330 // Now optimize the initial VPlan. 8331 if (!Plan->hasVF(ElementCount::getFixed(1))) 8332 VPlanTransforms::truncateToMinimalBitwidths( 8333 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); 8334 VPlanTransforms::optimize(*Plan, *PSE.getSE()); 8335 // TODO: try to put it close to addActiveLaneMask(). 8336 // Discard the plan if it is not EVL-compatible 8337 if (CM.foldTailWithEVL() && 8338 !VPlanTransforms::tryAddExplicitVectorLength(*Plan)) 8339 break; 8340 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 8341 VPlans.push_back(std::move(Plan)); 8342 } 8343 VF = SubRange.End; 8344 } 8345 } 8346 8347 // Add the necessary canonical IV and branch recipes required to control the 8348 // loop. 8349 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, 8350 DebugLoc DL) { 8351 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8352 auto *StartV = Plan.getOrAddLiveIn(StartIdx); 8353 8354 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8355 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8356 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8357 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8358 Header->insert(CanonicalIVPHI, Header->begin()); 8359 8360 VPBuilder Builder(TopRegion->getExitingBasicBlock()); 8361 // Add a VPInstruction to increment the scalar canonical IV by VF * UF. 8362 auto *CanonicalIVIncrement = Builder.createOverflowingOp( 8363 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, 8364 "index.next"); 8365 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8366 8367 // Add the BranchOnCount VPInstruction to the latch. 8368 Builder.createNaryOp(VPInstruction::BranchOnCount, 8369 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8370 } 8371 8372 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8373 // original exit block. 8374 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, 8375 VPRecipeBuilder &Builder, VPlan &Plan) { 8376 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8377 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8378 // Only handle single-exit loops with unique exit blocks for now. 
8379 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8380 return; 8381 8382 // Introduce VPUsers modeling the exit values. 8383 for (PHINode &ExitPhi : ExitBB->phis()) { 8384 Value *IncomingValue = 8385 ExitPhi.getIncomingValueForBlock(ExitingBB); 8386 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan); 8387 // Exit values for inductions are computed and updated outside of VPlan and 8388 // independent of induction recipes. 8389 // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update 8390 // live-outs. 8391 if ((isa<VPWidenIntOrFpInductionRecipe>(V) && 8392 !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) || 8393 isa<VPWidenPointerInductionRecipe>(V)) 8394 continue; 8395 Plan.addLiveOut(&ExitPhi, V); 8396 } 8397 } 8398 8399 /// Feed a resume value for every FOR from the vector loop to the scalar loop, 8400 /// if middle block branches to scalar preheader, by introducing ExtractFromEnd 8401 /// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the 8402 /// latter and corresponds to the scalar header. 8403 static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { 8404 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); 8405 8406 // Start by finding out if middle block branches to scalar preheader, which is 8407 // not a VPIRBasicBlock, unlike Exit block - the other possible successor of 8408 // middle block. 8409 // TODO: Should be replaced by 8410 // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the 8411 // scalar region is modeled as well. 8412 VPBasicBlock *ScalarPHVPBB = nullptr; 8413 auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor()); 8414 for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) { 8415 if (isa<VPIRBasicBlock>(Succ)) 8416 continue; 8417 assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?"); 8418 ScalarPHVPBB = cast<VPBasicBlock>(Succ); 8419 } 8420 if (!ScalarPHVPBB) 8421 return; 8422 8423 VPBuilder ScalarPHBuilder(ScalarPHVPBB); 8424 VPBuilder MiddleBuilder(MiddleVPBB); 8425 // Reset insert point so new recipes are inserted before terminator and 8426 // condition, if there is either the former or both. 8427 if (auto *Terminator = MiddleVPBB->getTerminator()) { 8428 auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0)); 8429 assert((!Condition || Condition->getParent() == MiddleVPBB) && 8430 "Condition expected in MiddleVPBB"); 8431 MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator); 8432 } 8433 VPValue *OneVPV = Plan.getOrAddLiveIn( 8434 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); 8435 8436 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) { 8437 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi); 8438 if (!FOR) 8439 continue; 8440 8441 // Extract the resume value and create a new VPLiveOut for it. 
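    // For example (illustrative recurrence): for
    //   prev = init; for (...) { use(prev); prev = a[i]; }
    // the value extracted below is the last element of the vector holding
    // 'prev' produced by the final vector iteration, which seeds the scalar
    // loop's phi if the remainder loop is entered.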
8442 auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd, 8443 {FOR->getBackedgeValue(), OneVPV}, 8444 {}, "vector.recur.extract"); 8445 auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( 8446 VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {}, 8447 "scalar.recur.init"); 8448 Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe); 8449 } 8450 } 8451 8452 VPlanPtr 8453 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { 8454 8455 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8456 8457 // --------------------------------------------------------------------------- 8458 // Build initial VPlan: Scan the body of the loop in a topological order to 8459 // visit each basic block after having visited its predecessor basic blocks. 8460 // --------------------------------------------------------------------------- 8461 8462 // Create initial VPlan skeleton, having a basic block for the pre-header 8463 // which contains SCEV expansions that need to happen before the CFG is 8464 // modified; a basic block for the vector pre-header, followed by a region for 8465 // the vector loop, followed by the middle basic block. The skeleton vector 8466 // loop region contains a header and latch basic blocks. 8467 8468 bool RequiresScalarEpilogueCheck = 8469 LoopVectorizationPlanner::getDecisionAndClampRange( 8470 [this](ElementCount VF) { 8471 return !CM.requiresScalarEpilogue(VF.isVector()); 8472 }, 8473 Range); 8474 VPlanPtr Plan = VPlan::createInitialVPlan( 8475 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8476 *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(), 8477 OrigLoop); 8478 8479 // Don't use getDecisionAndClampRange here, because we don't know the UF 8480 // so this function is better to be conservative, rather than to split 8481 // it up into different VPlans. 8482 // TODO: Consider using getDecisionAndClampRange here to split up VPlans. 8483 bool IVUpdateMayOverflow = false; 8484 for (ElementCount VF : Range) 8485 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); 8486 8487 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8488 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); 8489 // When not folding the tail, we know that the induction increment will not 8490 // overflow. 8491 bool HasNUW = Style == TailFoldingStyle::None; 8492 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); 8493 8494 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); 8495 8496 // --------------------------------------------------------------------------- 8497 // Pre-construction: record ingredients whose recipes we'll need to further 8498 // process after constructing the initial VPlan. 8499 // --------------------------------------------------------------------------- 8500 8501 // For each interleave group which is relevant for this (possibly trimmed) 8502 // Range, add it to the set of groups to be later applied to the VPlan and add 8503 // placeholders for its members' Recipes which we'll be replacing with a 8504 // single VPInterleaveRecipe. 
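  // For example (illustrative source), a factor-2 store group such as
  //   for (i = 0; i < n; i++) {
  //     A[2 * i] = x;      // member 0
  //     A[2 * i + 1] = y;  // member 1
  //   }
  // will have the recipes for both member stores replaced by a single
  // VPInterleaveRecipe that writes one interleaved wide vector.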
8505 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8506 auto applyIG = [IG, this](ElementCount VF) -> bool { 8507 bool Result = (VF.isVector() && // Query is illegal for VF == 1 8508 CM.getWideningDecision(IG->getInsertPos(), VF) == 8509 LoopVectorizationCostModel::CM_Interleave); 8510 // For scalable vectors, the only interleave factor currently supported 8511 // is 2 since we require the (de)interleave2 intrinsics instead of 8512 // shufflevectors. 8513 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && 8514 "Unsupported interleave factor for scalable vectors"); 8515 return Result; 8516 }; 8517 if (!getDecisionAndClampRange(applyIG, Range)) 8518 continue; 8519 InterleaveGroups.insert(IG); 8520 }; 8521 8522 // --------------------------------------------------------------------------- 8523 // Construct recipes for the instructions in the loop 8524 // --------------------------------------------------------------------------- 8525 8526 // Scan the body of the loop in a topological order to visit each basic block 8527 // after having visited its predecessor basic blocks. 8528 LoopBlocksDFS DFS(OrigLoop); 8529 DFS.perform(LI); 8530 8531 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8532 VPBasicBlock *VPBB = HeaderVPBB; 8533 BasicBlock *HeaderBB = OrigLoop->getHeader(); 8534 bool NeedsMasks = 8535 CM.foldTailByMasking() || 8536 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) { 8537 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); 8538 return Legal->blockNeedsPredication(BB) || NeedsBlends; 8539 }); 8540 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8541 // Relevant instructions from basic block BB will be grouped into VPRecipe 8542 // ingredients and fill a new VPBasicBlock. 8543 if (VPBB != HeaderVPBB) 8544 VPBB->setName(BB->getName()); 8545 Builder.setInsertPoint(VPBB); 8546 8547 if (VPBB == HeaderVPBB) 8548 RecipeBuilder.createHeaderMask(); 8549 else if (NeedsMasks) 8550 RecipeBuilder.createBlockInMask(BB); 8551 8552 // Introduce each ingredient into VPlan. 8553 // TODO: Model and preserve debug intrinsics in VPlan. 8554 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { 8555 Instruction *Instr = &I; 8556 SmallVector<VPValue *, 4> Operands; 8557 auto *Phi = dyn_cast<PHINode>(Instr); 8558 if (Phi && Phi->getParent() == HeaderBB) { 8559 Operands.push_back(Plan->getOrAddLiveIn( 8560 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8561 } else { 8562 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands()); 8563 Operands = {OpRange.begin(), OpRange.end()}; 8564 } 8565 8566 // Invariant stores inside loop will be deleted and a single store 8567 // with the final reduction value will be added to the exit block 8568 StoreInst *SI; 8569 if ((SI = dyn_cast<StoreInst>(&I)) && 8570 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8571 continue; 8572 8573 VPRecipeBase *Recipe = 8574 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB); 8575 if (!Recipe) 8576 Recipe = RecipeBuilder.handleReplication(Instr, Range); 8577 8578 RecipeBuilder.setRecipe(Instr, Recipe); 8579 if (isa<VPHeaderPHIRecipe>(Recipe)) { 8580 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. 
In 8581 // the following cases, VPHeaderPHIRecipes may be created after non-phi 8582 // recipes and need to be moved to the phi section of HeaderVPBB: 8583 // * tail-folding (non-phi recipes computing the header mask are 8584 // introduced earlier than regular header phi recipes, and should appear 8585 // after them) 8586 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 8587 8588 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || 8589 CM.foldTailByMasking() || isa<TruncInst>(Instr)) && 8590 "unexpected recipe needs moving"); 8591 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8592 } else 8593 VPBB->appendRecipe(Recipe); 8594 } 8595 8596 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8597 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8598 } 8599 8600 // After here, VPBB should not be used. 8601 VPBB = nullptr; 8602 8603 if (CM.requiresScalarEpilogue(Range)) { 8604 // No edge from the middle block to the unique exit block has been inserted 8605 // and there is nothing to fix from vector loop; phis should have incoming 8606 // from scalar loop only. 8607 } else 8608 addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan); 8609 8610 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8611 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8612 "entry block must be set to a VPRegionBlock having a non-empty entry " 8613 "VPBasicBlock"); 8614 RecipeBuilder.fixHeaderPhis(); 8615 8616 addLiveOutsForFirstOrderRecurrences(*Plan); 8617 8618 // --------------------------------------------------------------------------- 8619 // Transform initial VPlan: Apply previously taken decisions, in order, to 8620 // bring the VPlan to its final state. 8621 // --------------------------------------------------------------------------- 8622 8623 // Adjust the recipes for any inloop reductions. 8624 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); 8625 8626 // Interleave memory: for each Interleave Group we marked earlier as relevant 8627 // for this VPlan, replace the Recipes widening its memory instructions with a 8628 // single VPInterleaveRecipe at its insertion point. 
8629 for (const auto *IG : InterleaveGroups) { 8630 auto *Recipe = 8631 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos())); 8632 SmallVector<VPValue *, 4> StoredValues; 8633 for (unsigned i = 0; i < IG->getFactor(); ++i) 8634 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8635 auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI)); 8636 StoredValues.push_back(StoreR->getStoredValue()); 8637 } 8638 8639 bool NeedsMaskForGaps = 8640 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); 8641 assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) && 8642 "masked interleaved groups are not allowed."); 8643 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8644 Recipe->getMask(), NeedsMaskForGaps); 8645 VPIG->insertBefore(Recipe); 8646 unsigned J = 0; 8647 for (unsigned i = 0; i < IG->getFactor(); ++i) 8648 if (Instruction *Member = IG->getMember(i)) { 8649 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); 8650 if (!Member->getType()->isVoidTy()) { 8651 VPValue *OriginalV = MemberR->getVPSingleValue(); 8652 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8653 J++; 8654 } 8655 MemberR->eraseFromParent(); 8656 } 8657 } 8658 8659 for (ElementCount VF : Range) 8660 Plan->addVF(VF); 8661 Plan->setName("Initial VPlan"); 8662 8663 // Replace VPValues for known constant strides guaranteed by predicate scalar 8664 // evolution. 8665 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 8666 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 8667 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 8668 // Only handle constant strides for now. 8669 if (!ScevStride) 8670 continue; 8671 8672 auto *CI = Plan->getOrAddLiveIn( 8673 ConstantInt::get(Stride->getType(), ScevStride->getAPInt())); 8674 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV)) 8675 StrideVPV->replaceAllUsesWith(CI); 8676 8677 // The versioned value may not be used in the loop directly but through a 8678 // sext/zext. Add new live-ins in those cases. 8679 for (Value *U : StrideV->users()) { 8680 if (!isa<SExtInst, ZExtInst>(U)) 8681 continue; 8682 VPValue *StrideVPV = Plan->getLiveIn(U); 8683 if (!StrideVPV) 8684 continue; 8685 unsigned BW = U->getType()->getScalarSizeInBits(); 8686 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW) 8687 : ScevStride->getAPInt().zext(BW); 8688 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C)); 8689 StrideVPV->replaceAllUsesWith(CI); 8690 } 8691 } 8692 8693 VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) { 8694 return Legal->blockNeedsPredication(BB); 8695 }); 8696 8697 // Sink users of fixed-order recurrence past the recipe defining the previous 8698 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 8699 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) 8700 return nullptr; 8701 8702 if (useActiveLaneMask(Style)) { 8703 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once 8704 // TailFoldingStyle is visible there. 
8705 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); 8706 bool WithoutRuntimeCheck = 8707 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 8708 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, 8709 WithoutRuntimeCheck); 8710 } 8711 return Plan; 8712 } 8713 8714 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8715 // Outer loop handling: They may require CFG and instruction level 8716 // transformations before even evaluating whether vectorization is profitable. 8717 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8718 // the vectorization pipeline. 8719 assert(!OrigLoop->isInnermost()); 8720 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8721 8722 // Create new empty VPlan 8723 auto Plan = VPlan::createInitialVPlan( 8724 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8725 *PSE.getSE(), true, false, OrigLoop); 8726 8727 // Build hierarchical CFG 8728 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8729 HCFGBuilder.buildHierarchicalCFG(); 8730 8731 for (ElementCount VF : Range) 8732 Plan->addVF(VF); 8733 8734 VPlanTransforms::VPInstructionsToVPRecipes( 8735 Plan, 8736 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 8737 *PSE.getSE(), *TLI); 8738 8739 // Remove the existing terminator of the exiting block of the top-most region. 8740 // A BranchOnCount will be added instead when adding the canonical IV recipes. 8741 auto *Term = 8742 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); 8743 Term->eraseFromParent(); 8744 8745 // Tail folding is not supported for outer loops, so the induction increment 8746 // is guaranteed to not wrap. 8747 bool HasNUW = true; 8748 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, 8749 DebugLoc()); 8750 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); 8751 return Plan; 8752 } 8753 8754 // Adjust the recipes for reductions. For in-loop reductions the chain of 8755 // instructions leading from the loop exit instr to the phi need to be converted 8756 // to reductions, with one operand being vector and the other being the scalar 8757 // reduction chain. For other reductions, a select is introduced between the phi 8758 // and live-out recipes when folding the tail. 8759 // 8760 // A ComputeReductionResult recipe is added to the middle block, also for 8761 // in-loop reductions which compute their result in-loop, because generating 8762 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 8763 // 8764 // Adjust AnyOf reductions; replace the reduction phi for the selected value 8765 // with a boolean reduction phi node to check if the condition is true in any 8766 // iteration. The final value is selected by the final ComputeReductionResult. 8767 void LoopVectorizationPlanner::adjustRecipesForReductions( 8768 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { 8769 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); 8770 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); 8771 // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores 8772 // sank outside of the loop would keep the same order as they had in the 8773 // original loop. 
8774 SmallVector<VPReductionPHIRecipe *> ReductionPHIList; 8775 for (VPRecipeBase &R : Header->phis()) { 8776 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 8777 ReductionPHIList.emplace_back(ReductionPhi); 8778 } 8779 bool HasIntermediateStore = false; 8780 stable_sort(ReductionPHIList, 8781 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, 8782 const VPReductionPHIRecipe *R2) { 8783 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; 8784 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; 8785 HasIntermediateStore |= IS1 || IS2; 8786 8787 // If neither of the recipes has an intermediate store, keep the 8788 // order the same. 8789 if (!IS1 && !IS2) 8790 return false; 8791 8792 // If only one of the recipes has an intermediate store, then 8793 // move it towards the beginning of the list. 8794 if (IS1 && !IS2) 8795 return true; 8796 8797 if (!IS1 && IS2) 8798 return false; 8799 8800 // If both recipes have an intermediate store, then the recipe 8801 // with the later store should be processed earlier. So it 8802 // should go to the beginning of the list. 8803 return DT->dominates(IS2, IS1); 8804 }); 8805 8806 if (HasIntermediateStore && ReductionPHIList.size() > 1) 8807 for (VPRecipeBase *R : ReductionPHIList) 8808 R->moveBefore(*Header, Header->getFirstNonPhi()); 8809 8810 for (VPRecipeBase &R : Header->phis()) { 8811 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 8812 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) 8813 continue; 8814 8815 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 8816 RecurKind Kind = RdxDesc.getRecurrenceKind(); 8817 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 8818 "AnyOf reductions are not allowed for in-loop reductions"); 8819 8820 // Collect the chain of "link" recipes for the reduction starting at PhiR. 8821 SetVector<VPSingleDefRecipe *> Worklist; 8822 Worklist.insert(PhiR); 8823 for (unsigned I = 0; I != Worklist.size(); ++I) { 8824 VPSingleDefRecipe *Cur = Worklist[I]; 8825 for (VPUser *U : Cur->users()) { 8826 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U); 8827 if (!UserRecipe) { 8828 assert(isa<VPLiveOut>(U) && 8829 "U must either be a VPSingleDef or VPLiveOut"); 8830 continue; 8831 } 8832 Worklist.insert(UserRecipe); 8833 } 8834 } 8835 8836 // Visit operation "Links" along the reduction chain top-down starting from 8837 // the phi until LoopExitValue. We keep track of the previous item 8838 // (PreviousLink) to tell which of the two operands of a Link will remain 8839 // scalar and which will be reduced. For minmax by select(cmp), Link will be 8840 // the select instructions. Blend recipes of in-loop reduction phi's will 8841 // get folded to their non-phi operand, as the reduction recipe handles the 8842 // condition directly. 8843 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. 8844 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { 8845 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); 8846 8847 // Index of the first operand which holds a non-mask vector operand. 8848 unsigned IndexOfFirstOperand; 8849 // Recognize a call to the llvm.fmuladd intrinsic. 
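      // For example (illustrative), an in-loop reduction of the form
      //   acc = call float @llvm.fmuladd(float %a, float %b, float %acc)
      // is split below into an fmul of %a and %b plus a reduction fadd that
      // consumes the previous link in the chain.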
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      VPValue *VecOp;
      VPBasicBlock *LinkVPBB = CurrentLink->getParent();
      if (IsFMulAdd) {
        assert(
            RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
            "Expected instruction to be a call to the llvm.fmuladd intrinsic");
        assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
                isa<VPWidenCallRecipe>(CurrentLink)) &&
               CurrentLink->getOperand(2) == PreviousLink &&
               "expected a call where the previous link is the added operand");

        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe (multiplying the first two operands of
        // the fmuladd together) to use as the vector operand for the fadd
        // reduction.
        VPInstruction *FMulRecipe = new VPInstruction(
            Instruction::FMul,
            {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
            CurrentLinkI->getFastMathFlags());
        LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
        VecOp = FMulRecipe;
      } else {
        auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
        if (PhiR->isInLoop() && Blend) {
          assert(Blend->getNumIncomingValues() == 2 &&
                 "Blend must have 2 incoming values");
          if (Blend->getIncomingValue(0) == PhiR)
            Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
          else {
            assert(Blend->getIncomingValue(1) == PhiR &&
                   "PhiR must be an operand of the blend");
            Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
          }
          continue;
        }

        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
          if (isa<VPWidenRecipe>(CurrentLink)) {
            assert(isa<CmpInst>(CurrentLinkI) &&
                   "need to have the compare of the select");
            continue;
          }
          assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
                 "must be a select recipe");
          IndexOfFirstOperand = 1;
        } else {
          assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
                 "Expected to replace a VPWidenSC");
          IndexOfFirstOperand = 0;
        }
        // Note that for non-commutable operands (cmp-selects), the semantics of
        // the cmp-select are captured in the recurrence kind.
        unsigned VecOpId =
            CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
                ? IndexOfFirstOperand + 1
                : IndexOfFirstOperand;
        VecOp = CurrentLink->getOperand(VecOpId);
        assert(VecOp != PreviousLink &&
               CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
                                       (VecOpId - IndexOfFirstOperand)) ==
                       PreviousLink &&
               "PreviousLink must be the operand other than VecOp");
      }

      BasicBlock *BB = CurrentLinkI->getParent();
      VPValue *CondOp = nullptr;
      if (CM.blockNeedsPredicationForAnyReason(BB))
        CondOp = RecipeBuilder.getBlockInMask(BB);

      VPReductionRecipe *RedRecipe =
          new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
                                CondOp, CM.useOrderedReductions(RdxDesc));
      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of its inputs, including CondOp.
      // Note that this transformation may leave behind dead recipes (including
      // CurrentLink), which will be cleaned up by a later VPlan transform.
8927 LinkVPBB->appendRecipe(RedRecipe); 8928 CurrentLink->replaceAllUsesWith(RedRecipe); 8929 PreviousLink = RedRecipe; 8930 } 8931 } 8932 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); 8933 Builder.setInsertPoint(&*LatchVPBB->begin()); 8934 VPBasicBlock *MiddleVPBB = 8935 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()); 8936 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); 8937 for (VPRecipeBase &R : 8938 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8939 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 8940 if (!PhiR) 8941 continue; 8942 8943 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 8944 // Adjust AnyOf reductions; replace the reduction phi for the selected value 8945 // with a boolean reduction phi node to check if the condition is true in 8946 // any iteration. The final value is selected by the final 8947 // ComputeReductionResult. 8948 if (RecurrenceDescriptor::isAnyOfRecurrenceKind( 8949 RdxDesc.getRecurrenceKind())) { 8950 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) { 8951 return isa<VPWidenSelectRecipe>(U) || 8952 (isa<VPReplicateRecipe>(U) && 8953 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() == 8954 Instruction::Select); 8955 })); 8956 VPValue *Cmp = Select->getOperand(0); 8957 // If the compare is checking the reduction PHI node, adjust it to check 8958 // the start value. 8959 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) { 8960 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I) 8961 if (CmpR->getOperand(I) == PhiR) 8962 CmpR->setOperand(I, PhiR->getStartValue()); 8963 } 8964 VPBuilder::InsertPointGuard Guard(Builder); 8965 Builder.setInsertPoint(Select); 8966 8967 // If the true value of the select is the reduction phi, the new value is 8968 // selected if the negated condition is true in any iteration. 8969 if (Select->getOperand(1) == PhiR) 8970 Cmp = Builder.createNot(Cmp); 8971 VPValue *Or = Builder.createOr(PhiR, Cmp); 8972 Select->getVPSingleValue()->replaceAllUsesWith(Or); 8973 8974 // Convert the reduction phi to operate on bools. 8975 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( 8976 OrigLoop->getHeader()->getContext()))); 8977 } 8978 8979 // If tail is folded by masking, introduce selects between the phi 8980 // and the live-out instruction of each reduction, at the beginning of the 8981 // dedicated latch block. 8982 auto *OrigExitingVPV = PhiR->getBackedgeValue(); 8983 auto *NewExitingVPV = PhiR->getBackedgeValue(); 8984 if (!PhiR->isInLoop() && CM.foldTailByMasking()) { 8985 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader()); 8986 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && 8987 "reduction recipe must be defined before latch"); 8988 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); 8989 std::optional<FastMathFlags> FMFs = 8990 PhiTy->isFloatingPointTy() 8991 ? 
std::make_optional(RdxDesc.getFastMathFlags()) 8992 : std::nullopt; 8993 NewExitingVPV = 8994 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); 8995 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { 8996 return isa<VPInstruction>(&U) && 8997 cast<VPInstruction>(&U)->getOpcode() == 8998 VPInstruction::ComputeReductionResult; 8999 }); 9000 if (PreferPredicatedReductionSelect || 9001 TTI.preferPredicatedReductionSelect( 9002 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, 9003 TargetTransformInfo::ReductionFlags())) 9004 PhiR->setOperand(1, NewExitingVPV); 9005 } 9006 9007 // If the vector reduction can be performed in a smaller type, we truncate 9008 // then extend the loop exit value to enable InstCombine to evaluate the 9009 // entire expression in the smaller type. 9010 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); 9011 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() && 9012 !RecurrenceDescriptor::isAnyOfRecurrenceKind( 9013 RdxDesc.getRecurrenceKind())) { 9014 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 9015 Type *RdxTy = RdxDesc.getRecurrenceType(); 9016 auto *Trunc = 9017 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); 9018 auto *Extnd = 9019 RdxDesc.isSigned() 9020 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) 9021 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); 9022 9023 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); 9024 Extnd->insertAfter(Trunc); 9025 if (PhiR->getOperand(1) == NewExitingVPV) 9026 PhiR->setOperand(1, Extnd->getVPSingleValue()); 9027 NewExitingVPV = Extnd; 9028 } 9029 9030 // We want code in the middle block to appear to execute on the location of 9031 // the scalar loop's latch terminator because: (a) it is all compiler 9032 // generated, (b) these instructions are always executed after evaluating 9033 // the latch conditional branch, and (c) other passes may add new 9034 // predecessors which terminate on this line. This is the easiest way to 9035 // ensure we don't accidentally cause an extra step back into the loop while 9036 // debugging. 9037 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); 9038 9039 // TODO: At the moment ComputeReductionResult also drives creation of the 9040 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here 9041 // even for in-loop reductions, until the reduction resume value handling is 9042 // also modeled in VPlan. 
9043 auto *FinalReductionResult = new VPInstruction( 9044 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); 9045 FinalReductionResult->insertBefore(*MiddleVPBB, IP); 9046 OrigExitingVPV->replaceUsesWithIf( 9047 FinalReductionResult, 9048 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); }); 9049 } 9050 9051 VPlanTransforms::clearReductionWrapFlags(*Plan); 9052 } 9053 9054 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9055 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9056 "Not a pointer induction according to InductionDescriptor!"); 9057 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9058 "Unexpected type."); 9059 assert(!onlyScalarsGenerated(State.VF.isScalable()) && 9060 "Recipe should have been replaced"); 9061 9062 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9063 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true)); 9064 Type *PhiType = IndDesc.getStep()->getType(); 9065 9066 // Build a pointer phi 9067 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9068 Type *ScStValueType = ScalarStartValue->getType(); 9069 PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", 9070 CanonicalIV->getIterator()); 9071 9072 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9073 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9074 9075 // A pointer induction, performed by using a gep 9076 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); 9077 9078 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 9079 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9080 Value *NumUnrolledElems = 9081 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9082 Value *InductionGEP = GetElementPtrInst::Create( 9083 State.Builder.getInt8Ty(), NewPointerPhi, 9084 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9085 InductionLoc); 9086 // Add induction update using an incorrect block temporarily. The phi node 9087 // will be fixed after VPlan execution. Note that at this point the latch 9088 // block cannot be used, as it does not exist yet. 9089 // TODO: Model increment value in VPlan, by turning the recipe into a 9090 // multi-def and a subclass of VPHeaderPHIRecipe. 9091 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9092 9093 // Create UF many actual address geps that use the pointer 9094 // phi as base and a vectorized version of the step value 9095 // (<step*0, ..., step*N>) as offset. 9096 for (unsigned Part = 0; Part < State.UF; ++Part) { 9097 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9098 Value *StartOffsetScalar = 9099 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9100 Value *StartOffset = 9101 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9102 // Create a vector of consecutive numbers from zero to VF. 
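    // For illustration only (fixed VF of 4, Part == 1): after the add below,
    // StartOffset is <4, 5, 6, 7>, so this part's lanes address
    // NewPointerPhi + <4, 5, 6, 7> * ScalarStepValue bytes.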
9103 StartOffset = State.Builder.CreateAdd( 9104 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9105 9106 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && 9107 "scalar step must be the same across all parts"); 9108 Value *GEP = State.Builder.CreateGEP( 9109 State.Builder.getInt8Ty(), NewPointerPhi, 9110 State.Builder.CreateMul( 9111 StartOffset, 9112 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9113 "vector.gep")); 9114 State.set(this, GEP, Part); 9115 } 9116 } 9117 9118 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9119 assert(!State.Instance && "VPDerivedIVRecipe being replicated."); 9120 9121 // Fast-math-flags propagate from the original induction instruction. 9122 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9123 if (FPBinOp) 9124 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); 9125 9126 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9127 Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0)); 9128 Value *DerivedIV = emitTransformedIndex( 9129 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, 9130 Kind, cast_if_present<BinaryOperator>(FPBinOp)); 9131 DerivedIV->setName("offset.idx"); 9132 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); 9133 9134 State.set(this, DerivedIV, VPIteration(0, 0)); 9135 } 9136 9137 void VPReplicateRecipe::execute(VPTransformState &State) { 9138 Instruction *UI = getUnderlyingInstr(); 9139 if (State.Instance) { // Generate a single instance. 9140 assert((State.VF.isScalar() || !isUniform()) && 9141 "uniform recipe shouldn't be predicated"); 9142 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9143 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); 9144 // Insert scalar instance packing it into a vector. 9145 if (State.VF.isVector() && shouldPack()) { 9146 // If we're constructing lane 0, initialize to start from poison. 9147 if (State.Instance->Lane.isFirstLane()) { 9148 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9149 Value *Poison = PoisonValue::get( 9150 VectorType::get(UI->getType(), State.VF)); 9151 State.set(this, Poison, State.Instance->Part); 9152 } 9153 State.packScalarIntoVectorValue(this, *State.Instance); 9154 } 9155 return; 9156 } 9157 9158 if (IsUniform) { 9159 // If the recipe is uniform across all parts (instead of just per VF), only 9160 // generate a single instance. 9161 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) && 9162 all_of(operands(), [](VPValue *Op) { 9163 return Op->isDefinedOutsideVectorRegions(); 9164 })) { 9165 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); 9166 if (user_begin() != user_end()) { 9167 for (unsigned Part = 1; Part < State.UF; ++Part) 9168 State.set(this, State.get(this, VPIteration(0, 0)), 9169 VPIteration(Part, 0)); 9170 } 9171 return; 9172 } 9173 9174 // Uniform within VL means we need to generate lane 0 only for each 9175 // unrolled copy. 9176 for (unsigned Part = 0; Part < State.UF; ++Part) 9177 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); 9178 return; 9179 } 9180 9181 // A store of a loop varying value to a uniform address only needs the last 9182 // copy of the store. 
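  // For example (illustrative): for a loop body like '*q = a[i];' with a
  // loop-invariant pointer q, only the value stored by the final iteration is
  // observable, so a single scalar store of the last lane of the last part is
  // emitted below.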
9183 if (isa<StoreInst>(UI) && 9184 vputils::isUniformAfterVectorization(getOperand(1))) { 9185 auto Lane = VPLane::getLastLaneForVF(State.VF); 9186 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), 9187 State); 9188 return; 9189 } 9190 9191 // Generate scalar instances for all VF lanes of all UF parts. 9192 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9193 const unsigned EndLane = State.VF.getKnownMinValue(); 9194 for (unsigned Part = 0; Part < State.UF; ++Part) 9195 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9196 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); 9197 } 9198 9199 void VPWidenLoadRecipe::execute(VPTransformState &State) { 9200 auto *LI = cast<LoadInst>(&Ingredient); 9201 9202 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9203 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9204 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9205 bool CreateGather = !isConsecutive(); 9206 9207 auto &Builder = State.Builder; 9208 State.setDebugLocFrom(getDebugLoc()); 9209 for (unsigned Part = 0; Part < State.UF; ++Part) { 9210 Value *NewLI; 9211 Value *Mask = nullptr; 9212 if (auto *VPMask = getMask()) { 9213 // Mask reversal is only needed for non-all-one (null) masks, as reverse 9214 // of a null all-one mask is a null mask. 9215 Mask = State.get(VPMask, Part); 9216 if (isReverse()) 9217 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 9218 } 9219 9220 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather); 9221 if (CreateGather) { 9222 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, 9223 "wide.masked.gather"); 9224 } else if (Mask) { 9225 NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, 9226 PoisonValue::get(DataTy), 9227 "wide.masked.load"); 9228 } else { 9229 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); 9230 } 9231 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9232 State.addMetadata(NewLI, LI); 9233 if (Reverse) 9234 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9235 State.set(this, NewLI, Part); 9236 } 9237 } 9238 9239 /// Use all-true mask for reverse rather than actual mask, as it avoids a 9240 /// dependence w/o affecting the result. 
9241 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, 9242 Value *EVL, const Twine &Name) { 9243 VectorType *ValTy = cast<VectorType>(Operand->getType()); 9244 Value *AllTrueMask = 9245 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); 9246 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, 9247 {Operand, AllTrueMask, EVL}, nullptr, Name); 9248 } 9249 9250 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 9251 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " 9252 "explicit vector length."); 9253 auto *LI = cast<LoadInst>(&Ingredient); 9254 9255 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9256 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9257 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9258 bool CreateGather = !isConsecutive(); 9259 9260 auto &Builder = State.Builder; 9261 State.setDebugLocFrom(getDebugLoc()); 9262 CallInst *NewLI; 9263 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 9264 Value *Addr = State.get(getAddr(), 0, !CreateGather); 9265 Value *Mask = nullptr; 9266 if (VPValue *VPMask = getMask()) { 9267 Mask = State.get(VPMask, 0); 9268 if (isReverse()) 9269 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 9270 } else { 9271 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 9272 } 9273 9274 if (CreateGather) { 9275 NewLI = 9276 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, 9277 nullptr, "wide.masked.gather"); 9278 } else { 9279 VectorBuilder VBuilder(Builder); 9280 VBuilder.setEVL(EVL).setMask(Mask); 9281 NewLI = cast<CallInst>(VBuilder.createVectorInstruction( 9282 Instruction::Load, DataTy, Addr, "vp.op.load")); 9283 } 9284 NewLI->addParamAttr( 9285 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); 9286 State.addMetadata(NewLI, LI); 9287 Instruction *Res = NewLI; 9288 if (isReverse()) 9289 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); 9290 State.set(this, Res, 0); 9291 } 9292 9293 void VPWidenStoreRecipe::execute(VPTransformState &State) { 9294 auto *SI = cast<StoreInst>(&Ingredient); 9295 9296 VPValue *StoredVPValue = getStoredValue(); 9297 bool CreateScatter = !isConsecutive(); 9298 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9299 9300 auto &Builder = State.Builder; 9301 State.setDebugLocFrom(getDebugLoc()); 9302 9303 for (unsigned Part = 0; Part < State.UF; ++Part) { 9304 Instruction *NewSI = nullptr; 9305 Value *Mask = nullptr; 9306 if (auto *VPMask = getMask()) { 9307 // Mask reversal is only needed for non-all-one (null) masks, as reverse 9308 // of a null all-one mask is a null mask. 9309 Mask = State.get(VPMask, Part); 9310 if (isReverse()) 9311 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 9312 } 9313 9314 Value *StoredVal = State.get(StoredVPValue, Part); 9315 if (isReverse()) { 9316 // If we store to reverse consecutive memory locations, then we need 9317 // to reverse the order of elements in the stored value. 9318 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9319 // We don't want to update the value in the map as it might be used in 9320 // another expression. So don't call resetVectorValue(StoredVal). 
9321 } 9322 Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter); 9323 if (CreateScatter) 9324 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); 9325 else if (Mask) 9326 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); 9327 else 9328 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); 9329 State.addMetadata(NewSI, SI); 9330 } 9331 } 9332 9333 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { 9334 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " 9335 "explicit vector length."); 9336 auto *SI = cast<StoreInst>(&Ingredient); 9337 9338 VPValue *StoredValue = getStoredValue(); 9339 bool CreateScatter = !isConsecutive(); 9340 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9341 9342 auto &Builder = State.Builder; 9343 State.setDebugLocFrom(getDebugLoc()); 9344 9345 CallInst *NewSI = nullptr; 9346 Value *StoredVal = State.get(StoredValue, 0); 9347 Value *EVL = State.get(getEVL(), VPIteration(0, 0)); 9348 if (isReverse()) 9349 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); 9350 Value *Mask = nullptr; 9351 if (VPValue *VPMask = getMask()) { 9352 Mask = State.get(VPMask, 0); 9353 if (isReverse()) 9354 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); 9355 } else { 9356 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); 9357 } 9358 Value *Addr = State.get(getAddr(), 0, !CreateScatter); 9359 if (CreateScatter) { 9360 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), 9361 Intrinsic::vp_scatter, 9362 {StoredVal, Addr, Mask, EVL}); 9363 } else { 9364 VectorBuilder VBuilder(Builder); 9365 VBuilder.setEVL(EVL).setMask(Mask); 9366 NewSI = cast<CallInst>(VBuilder.createVectorInstruction( 9367 Instruction::Store, Type::getVoidTy(EVL->getContext()), 9368 {StoredVal, Addr})); 9369 } 9370 NewSI->addParamAttr( 9371 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); 9372 State.addMetadata(NewSI, SI); 9373 } 9374 9375 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9376 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9377 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9378 // for predication. 9379 static ScalarEpilogueLowering getScalarEpilogueLowering( 9380 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9381 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9382 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9383 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9384 // don't look at hints or options, and don't request a scalar epilogue. 9385 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9386 // LoopAccessInfo (due to code dependency and not being able to reliably get 9387 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9388 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9389 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9390 // back to the old way and vectorize with versioning when forced. See D81345.) 
9391 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9392 PGSOQueryType::IRPass) && 9393 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9394 return CM_ScalarEpilogueNotAllowedOptSize; 9395 9396 // 2) If set, obey the directives 9397 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9398 switch (PreferPredicateOverEpilogue) { 9399 case PreferPredicateTy::ScalarEpilogue: 9400 return CM_ScalarEpilogueAllowed; 9401 case PreferPredicateTy::PredicateElseScalarEpilogue: 9402 return CM_ScalarEpilogueNotNeededUsePredicate; 9403 case PreferPredicateTy::PredicateOrDontVectorize: 9404 return CM_ScalarEpilogueNotAllowedUsePredicate; 9405 }; 9406 } 9407 9408 // 3) If set, obey the hints 9409 switch (Hints.getPredicate()) { 9410 case LoopVectorizeHints::FK_Enabled: 9411 return CM_ScalarEpilogueNotNeededUsePredicate; 9412 case LoopVectorizeHints::FK_Disabled: 9413 return CM_ScalarEpilogueAllowed; 9414 }; 9415 9416 // 4) if the TTI hook indicates this is profitable, request predication. 9417 TailFoldingInfo TFI(TLI, &LVL, IAI); 9418 if (TTI->preferPredicateOverEpilogue(&TFI)) 9419 return CM_ScalarEpilogueNotNeededUsePredicate; 9420 9421 return CM_ScalarEpilogueAllowed; 9422 } 9423 9424 // Process the loop in the VPlan-native vectorization path. This path builds 9425 // VPlan upfront in the vectorization pipeline, which allows to apply 9426 // VPlan-to-VPlan transformations from the very beginning without modifying the 9427 // input LLVM IR. 9428 static bool processLoopInVPlanNativePath( 9429 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9430 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9431 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9432 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9433 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9434 LoopVectorizationRequirements &Requirements) { 9435 9436 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9437 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9438 return false; 9439 } 9440 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9441 Function *F = L->getHeader()->getParent(); 9442 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9443 9444 ScalarEpilogueLowering SEL = 9445 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 9446 9447 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9448 &Hints, IAI); 9449 // Use the planner for outer loop vectorization. 9450 // TODO: CM is not used at this point inside the planner. Turn CM into an 9451 // optional argument if we don't need it in the future. 9452 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, 9453 ORE); 9454 9455 // Get user vectorization factor. 9456 ElementCount UserVF = Hints.getWidth(); 9457 9458 CM.collectElementTypesForWidening(); 9459 9460 // Plan how to best vectorize, return the best VF and its cost. 9461 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9462 9463 // If we are stress testing VPlan builds, do not attempt to generate vector 9464 // code. Masked vector code generation support will follow soon. 9465 // Also, do not attempt to vectorize if no vector code will be produced. 
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);

  {
    bool AddBranchWeights =
        hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                             F->getDataLayout(), AddBranchWeights);
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                           VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
  }

  reportVectorization(ORE, L, VF, 1);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                       VectorizationFactor &VF,
                                       std::optional<unsigned> VScale, Loop *L,
                                       ScalarEvolution &SE,
                                       ScalarEpilogueLowering SEL) {
  InstructionCost CheckCost = Checks.getCost();
  if (!CheckCost.isValid())
    return false;

  // When interleaving only, the scalar and vector cost will be equal, which in
  // turn would lead to a divide by 0. Fall back to a hard threshold.
  if (VF.Width.isScalar()) {
    if (CheckCost > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user-specified
  // VF/IC. In those cases, runtime checks should always be generated.
  uint64_t ScalarC = *VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  //  The total cost of the scalar loop is
  //   ScalarC * TC
  //  where
  //  * TC is the actual trip count of the loop.
  //  * ScalarC is the cost of a single scalar iteration.
  //
  //  The total cost of the vector loop is
  //    RtC + VecC * (TC / VF) + EpiC
  //  where
  //  * RtC is the cost of the generated runtime checks
  //  * VecC is the cost of a single vector iteration.
  //  * TC is the actual trip count of the loop
  //  * VF is the vectorization factor
  //  * EpiC is the cost of the generated epilogue, including the cost
  //    of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the division below is rounded up, hence we get an upper estimate of the
  // minimum TC.
  unsigned IntVF = VF.Width.getKnownMinValue();
  if (VF.Width.isScalable()) {
    unsigned AssumedMinimumVscale = 1;
    if (VScale)
      AssumedMinimumVscale = *VScale;
    IntVF *= AssumedMinimumVscale;
  }
  uint64_t RtC = *CheckCost.getValue();
  uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
  uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
  uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);

  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
  // epilogue is allowed, choose the next closest multiple of VF. This should
  // partly compensate for ignoring the epilogue cost.
  uint64_t MinTC = std::max(MinTC1, MinTC2);
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = alignTo(MinTC, IntVF);
  VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
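  // For illustration only (hypothetical costs): with ScalarC = 4, VecC = 8,
  // RtC = 40 and an effective VF of 4, the first bound is
  // ceil(40 * 4 / (4 * 4 - 8)) = 20, the second is ceil(40 * 10 / 4) = 100,
  // so the runtime checks only pay off for trip counts of at least 100.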
  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
    if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
                                VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable trip count ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n");

      return false;
    }
  }
  return true;
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
                    << L->getHeader()->getParent()->getName() << "' from "
                    << L->getLocStr() << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot
  // modify the incoming IR, we need to build VPlan upfront in the
  // vectorization pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();

  // If an override option has been passed in for interleaved accesses, use it.
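  // (For example, passing -enable-interleaved-mem-accesses=true to opt forces
  // the interleaved-access analysis even on targets whose TTI hook disables
  // it, and =false suppresses it. The exact flag string is the one given in
  // the cl::opt declaration of EnableInterleavedMemAccesses earlier in this
  // file.)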
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);

  // Check the loop for a trip count threshold: vectorize loops with a tiny
  // trip count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
        LLVM_DEBUG(dbgs() << "\n");
        // Predicated, tail-folded loops are efficient even when the loop
        // iteration count is low. However, setting the epilogue policy to
        // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
        // with runtime checks. It's more effective to let
        // `areRuntimeChecksProfitable` determine if vectorization is
        // beneficial for the loop.
        if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
          SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
      } else {
        LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
                             "small to be worth vectorizing.\n");
        reportVectorizationFailure(
            "The trip count is below the minimal threshold value.",
            "loop trip count is too low, avoiding vectorization",
            "LowTripCount", ORE, L);
        Hints.emitRemarkWithHints();
        return false;
      }
    }
  }

  // Check the function attributes to see if implicit floats or vectors are
  // allowed.
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
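  // (Illustrative: -force-ordered-reductions=true allows strict, in-order FP
  // reductions to be vectorized even if the target reports them as not
  // profitable; without ordered reductions, FP reductions can only be
  // vectorized when reassociation is permitted, e.g. under fast-math flags.)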
9780 if (ForceOrderedReductions.getNumOccurrences() > 0) 9781 AllowOrderedReductions = ForceOrderedReductions; 9782 else 9783 AllowOrderedReductions = TTI->enableOrderedReductions(); 9784 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 9785 ORE->emit([&]() { 9786 auto *ExactFPMathInst = Requirements.getExactFPInst(); 9787 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 9788 ExactFPMathInst->getDebugLoc(), 9789 ExactFPMathInst->getParent()) 9790 << "loop not vectorized: cannot prove it is safe to reorder " 9791 "floating-point operations"; 9792 }); 9793 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 9794 "reorder floating-point operations\n"); 9795 Hints.emitRemarkWithHints(); 9796 return false; 9797 } 9798 9799 // Use the cost model. 9800 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9801 F, &Hints, IAI); 9802 // Use the planner for vectorization. 9803 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, 9804 ORE); 9805 9806 // Get user vectorization factor and interleave count. 9807 ElementCount UserVF = Hints.getWidth(); 9808 unsigned UserIC = Hints.getInterleave(); 9809 9810 // Plan how to best vectorize, return the best VF and its cost. 9811 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9812 9813 VectorizationFactor VF = VectorizationFactor::Disabled(); 9814 unsigned IC = 1; 9815 9816 bool AddBranchWeights = 9817 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 9818 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 9819 F->getDataLayout(), AddBranchWeights); 9820 if (MaybeVF) { 9821 VF = *MaybeVF; 9822 // Select the interleave count. 9823 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 9824 9825 unsigned SelectedIC = std::max(IC, UserIC); 9826 // Optimistically generate runtime checks if they are needed. Drop them if 9827 // they turn out to not be profitable. 9828 if (VF.Width.isVector() || SelectedIC > 1) 9829 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 9830 9831 // Check if it is profitable to vectorize with runtime checks. 9832 bool ForceVectorization = 9833 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 9834 if (!ForceVectorization && 9835 !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, 9836 *PSE.getSE(), SEL)) { 9837 ORE->emit([&]() { 9838 return OptimizationRemarkAnalysisAliasing( 9839 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 9840 L->getHeader()) 9841 << "loop not vectorized: cannot prove it is safe to reorder " 9842 "memory operations"; 9843 }); 9844 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 9845 Hints.emitRemarkWithHints(); 9846 return false; 9847 } 9848 } 9849 9850 // Identify the diagnostic messages that should be produced. 9851 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 9852 bool VectorizeLoop = true, InterleaveLoop = true; 9853 if (VF.Width.isScalar()) { 9854 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 9855 VecDiagMsg = std::make_pair( 9856 "VectorizationNotBeneficial", 9857 "the cost-model indicates that vectorization is not beneficial"); 9858 VectorizeLoop = false; 9859 } 9860 9861 if (!MaybeVF && UserIC > 1) { 9862 // Tell the user interleaving was avoided up-front, despite being explicitly 9863 // requested. 
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << L->getLocStr() << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << L->getLocStr() << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan =
          UseLegacyCostModel ?
LVP.getBestPlanFor(VF.Width) : LVP.getBestPlan(); 9945 assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) && 9946 "VPlan cost model and legacy cost model disagreed"); 9947 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 9948 9949 ORE->emit([&]() { 9950 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9951 L->getHeader()) 9952 << "interleaved loop (interleaved count: " 9953 << NV("InterleaveCount", IC) << ")"; 9954 }); 9955 } else { 9956 // If we decided that it is *legal* to vectorize the loop, then do it. 9957 9958 // Consider vectorizing the epilogue too if it's profitable. 9959 VectorizationFactor EpilogueVF = 9960 LVP.selectEpilogueVectorizationFactor(VF.Width, IC); 9961 if (EpilogueVF.Width.isVector()) { 9962 9963 // The first pass vectorizes the main loop and creates a scalar epilogue 9964 // to be vectorized by executing the plan (potentially with a different 9965 // factor) again shortly afterwards. 9966 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 9967 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 9968 EPI, &LVL, &CM, BFI, PSI, Checks); 9969 9970 std::unique_ptr<VPlan> BestMainPlan( 9971 LVP.getBestPlanFor(EPI.MainLoopVF).duplicate()); 9972 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan( 9973 EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true); 9974 ++LoopsVectorized; 9975 9976 // Second pass vectorizes the epilogue and adjusts the control flow 9977 // edges from the first pass. 9978 EPI.MainLoopVF = EPI.EpilogueVF; 9979 EPI.MainLoopUF = EPI.EpilogueUF; 9980 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 9981 ORE, EPI, &LVL, &CM, BFI, PSI, 9982 Checks); 9983 9984 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 9985 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 9986 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 9987 Header->setName("vec.epilog.vector.body"); 9988 9989 // Re-use the trip count and steps expanded for the main loop, as 9990 // skeleton creation needs it as a value that dominates both the scalar 9991 // and vector epilogue loops 9992 // TODO: This is a workaround needed for epilogue vectorization and it 9993 // should be removed once induction resume value creation is done 9994 // directly in VPlan. 9995 EpilogILV.setTripCount(MainILV.getTripCount()); 9996 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { 9997 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R); 9998 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn( 9999 ExpandedSCEVs.find(ExpandR->getSCEV())->second); 10000 ExpandR->replaceAllUsesWith(ExpandedVal); 10001 if (BestEpiPlan.getTripCount() == ExpandR) 10002 BestEpiPlan.resetTripCount(ExpandedVal); 10003 ExpandR->eraseFromParent(); 10004 } 10005 10006 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, 10007 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated 10008 // before vectorizing the epilogue loop. 10009 for (VPRecipeBase &R : Header->phis()) { 10010 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10011 continue; 10012 10013 Value *ResumeV = nullptr; 10014 // TODO: Move setting of resume values to prepareToExecute. 
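          // (Clarifying note: the resume value is the live value leaving the
          // main vector loop, e.g. the merged partial sums for a reduction phi
          // or the induction value at the main loop's exit for an induction
          // phi; it becomes the start value of the corresponding epilogue-loop
          // phi below.)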
10015 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10016 const RecurrenceDescriptor &RdxDesc = 10017 ReductionPhi->getRecurrenceDescriptor(); 10018 RecurKind RK = RdxDesc.getRecurrenceKind(); 10019 ResumeV = ReductionResumeValues.find(&RdxDesc)->second; 10020 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { 10021 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as 10022 // start value; compare the final value from the main vector loop 10023 // to the start value. 10024 IRBuilder<> Builder( 10025 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI()); 10026 ResumeV = Builder.CreateICmpNE(ResumeV, 10027 RdxDesc.getRecurrenceStartValue()); 10028 } 10029 } else { 10030 // Create induction resume values for both widened pointer and 10031 // integer/fp inductions and update the start value of the induction 10032 // recipes to use the resume value. 10033 PHINode *IndPhi = nullptr; 10034 const InductionDescriptor *ID; 10035 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) { 10036 IndPhi = cast<PHINode>(Ind->getUnderlyingValue()); 10037 ID = &Ind->getInductionDescriptor(); 10038 } else { 10039 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R); 10040 IndPhi = WidenInd->getPHINode(); 10041 ID = &WidenInd->getInductionDescriptor(); 10042 } 10043 10044 ResumeV = MainILV.createInductionResumeValue( 10045 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), 10046 {EPI.MainLoopIterationCountCheck}); 10047 } 10048 assert(ResumeV && "Must have a resume value"); 10049 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV); 10050 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10051 } 10052 10053 assert(DT->verify(DominatorTree::VerificationLevel::Fast) && 10054 "DT not preserved correctly"); 10055 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10056 DT, true, &ExpandedSCEVs); 10057 ++LoopsEpilogueVectorized; 10058 10059 if (!MainILV.areSafetyChecksAdded()) 10060 DisableRuntimeUnroll = true; 10061 } else { 10062 ElementCount Width = VF.Width; 10063 VPlan &BestPlan = 10064 UseLegacyCostModel ? LVP.getBestPlanFor(Width) : LVP.getBestPlan(); 10065 if (!UseLegacyCostModel) { 10066 assert(size(BestPlan.vectorFactors()) == 1 && 10067 "Plan should have a single VF"); 10068 Width = *BestPlan.vectorFactors().begin(); 10069 LLVM_DEBUG(dbgs() 10070 << "VF picked by VPlan cost model: " << Width << "\n"); 10071 assert(VF.Width == Width && 10072 "VPlan cost model and legacy cost model disagreed"); 10073 } 10074 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width, 10075 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10076 PSI, Checks); 10077 LVP.executePlan(Width, IC, BestPlan, LB, DT, false); 10078 ++LoopsVectorized; 10079 10080 // Add metadata to disable runtime unrolling a scalar loop when there 10081 // are no runtime checks about strides and memory. A scalar loop that is 10082 // rarely used is not worth unrolling. 10083 if (!LB.areSafetyChecksAdded()) 10084 DisableRuntimeUnroll = true; 10085 } 10086 // Report the vectorization decision. 
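      // (Illustrative: the remark emitted here reads roughly "vectorized loop
      // (vectorization width: <VF>, interleaved count: <IC>)"; the exact
      // wording comes from the reportVectorization helper defined earlier in
      // this file.)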
      reportVectorization(ORE, L, VF, IC);
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  std::optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID) {
    L->setLoopID(*RemainderLoopID);
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = BFI_;
  TLI = TLI_;
  AC = &AC_;
  LAIs = &LAIs_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt vectorization if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

    if (Changed) {
      LAIs->clear();

#ifndef NDEBUG
      if (VerifySCEV)
        SE->verify();
#endif
    }
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // If there are no loops in the function, return early before computing
  // other expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BlockFrequencyInfo *BFI = nullptr;
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<ScalarEvolutionAnalysis>();
  PA.preserve<LoopAccessAnalysis>();

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << '>';
}
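// (Usage note, illustrative: the pass is normally run through the new pass
// manager, e.g. `opt -passes=loop-vectorize -S input.ll`; with the options
// above at their defaults, printPipeline() is expected to emit
// "loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>".)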