//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
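//
// As a rough illustration of the widening described above (a sketch, not
// actual output of this pass): with a vectorization factor (VF) of 4, a loop
// such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that the vector body processes four
// consecutive elements per iteration using wide (e.g. <4 x i32>) operations
// and steps the index by VF, with the remaining iterations handled by a
// scalar epilogue loop (or folded into the vector body under predication):
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4) {  // vector body: one wide load/add/store
//     a[i + 0] = b[i + 0] + c[i + 0];
//     a[i + 1] = b[i + 1] + c[i + 1];
//     a[i + 2] = b[i + 2] + c[i + 2];
//     a[i + 3] = b[i + 3] + c[i + 3];
//   }
//   for (; i < n; ++i)            // scalar epilogue for the remainder
//     a[i] = b[i] + c[i];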
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy
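// As a concrete (illustrative) example of the trade-off: with VF = 4 and a
// trip count of 10, tail-folding executes 3 predicated vector iterations
// (with 4, 4 and 2 active lanes), whereas the scalar-epilogue strategy
// executes 2 unpredicated vector iterations followed by 2 scalar remainder
// iterations.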

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
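// For instance (illustrative, target-dependent): on common x86-64 data
// layouts x86_fp80 has an allocation size of 128 bits but a type size of only
// 80 bits, so an array of x86_fp80 contains padding and is not
// bitcast-compatible with a vector of x86_fp80; hasIrregularType returns true
// for it, whereas it returns false for types like i32 or double.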

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
                                                   Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return std::nullopt;
}

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;
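  // For instance (illustrative): with UF = 2 and VF = 4, a vectorized i32
  // value from the original loop is represented by two <4 x i32> values (one
  // per unrolled part), while a scalarized value (see ScalarParts below) is
  // represented by 2 x 4 individual scalar values.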

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

  /// Create a new phi node for the induction variable \p OrigPhi to resume
  /// iteration count in the scalar epilogue, from where the vectorized loop
  /// left off. In cases where the loop skeleton is more complicated (eg.
  /// epilogue vectorization) and the resume values can come from an additional
  /// bypass block, the \p AdditionalBypass pair provides information about the
  /// bypass block and the end value on the edge from bypass to this loop.
  PHINode *createInductionResumeValue(
      PHINode *OrigPhi, const InductionDescriptor &ID,
      ArrayRef<BasicBlock *> BypassBlocks,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(BasicBlock *InsertBlock);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;
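  // For example (illustrative): with an original trip count of 10, VF = 4 and
  // UF = 1, the vector trip count is 8 and the remaining 2 iterations run in
  // the scalar epilogue loop.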

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (ie the first pass of vplan execution).
  std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
      BasicBlock *Bypass,
      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
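// For example (illustrative): with an i64 type and Step == 2, a fixed VF of 4
// yields the constant i64 8, while a scalable VF of 4 yields a runtime value
// equivalent to 8 * vscale (built via IRBuilderBase::CreateVScale).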

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  ScalarEvolution &SE = *PSE.getSE();

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  return SE.getAddExpr(BackedgeTakenCount,
                       SE.getOne(BackedgeTakenCount->getType()));
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe or
  // VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(State.Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
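// For example (illustrative): under this ordering all fixed VFs sort before
// all scalable ones, e.g. 4 < 8 < vscale x 2 < vscale x 4, since the
// comparison key is (isScalable, known minimum value).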

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
1336 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1337 InstructionCost Cost) {
1338 assert(VF.isVector() && "Expected VF >=2");
1339 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1340 }
1341
1342 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1343 /// interleaving group \p Grp and vector width \p VF.
1344 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1345 ElementCount VF, InstWidening W,
1346 InstructionCost Cost) {
1347 assert(VF.isVector() && "Expected VF >=2");
1348 // Broadcast this decision to all instructions inside the group.
1349 // But the cost will be assigned to one instruction only.
1350 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1351 if (auto *I = Grp->getMember(i)) {
1352 if (Grp->getInsertPos() == I)
1353 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1354 else
1355 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1356 }
1357 }
1358 }
1359
1360 /// Return the cost model decision for the given instruction \p I and vector
1361 /// width \p VF. Return CM_Unknown if this instruction did not pass
1362 /// through the cost modeling.
1363 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1364 assert(VF.isVector() && "Expected VF to be a vector VF");
1365 // Cost model is not run in the VPlan-native path - return conservative
1366 // result until this changes.
1367 if (EnableVPlanNativePath)
1368 return CM_GatherScatter;
1369
1370 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1371 auto Itr = WideningDecisions.find(InstOnVF);
1372 if (Itr == WideningDecisions.end())
1373 return CM_Unknown;
1374 return Itr->second.first;
1375 }
1376
1377 /// Return the vectorization cost for the given instruction \p I and vector
1378 /// width \p VF.
1379 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1380 assert(VF.isVector() && "Expected VF >=2");
1381 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1382 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1383 "The cost is not calculated");
1384 return WideningDecisions[InstOnVF].second;
1385 }
1386
1387 /// Return True if instruction \p I is an optimizable truncate whose operand
1388 /// is an induction variable. Such a truncate will be removed by adding a new
1389 /// induction variable with the destination type.
1390 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1391 // If the instruction is not a truncate, return false.
1392 auto *Trunc = dyn_cast<TruncInst>(I);
1393 if (!Trunc)
1394 return false;
1395
1396 // Get the source and destination types of the truncate.
1397 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1398 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1399
1400 // If the truncate is free for the given types, return false. Replacing a
1401 // free truncate with an induction variable would add an induction variable
1402 // update instruction to each iteration of the loop. We exclude from this
1403 // check the primary induction variable since it will need an update
1404 // instruction regardless.
1405 Value *Op = Trunc->getOperand(0);
1406 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1407 return false;
1408
1409 // If the truncated value is not an induction variable, return false.
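// E.g. (illustrative) in a loop with an i64 induction %i, a use such as
//   %t = trunc i64 %i to i32
// is optimizable: the truncate can be removed by introducing a new i32
// induction variable with the destination type.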
1410 return Legal->isInductionPhi(Op); 1411 } 1412 1413 /// Collects the instructions to scalarize for each predicated instruction in 1414 /// the loop. 1415 void collectInstsToScalarize(ElementCount VF); 1416 1417 /// Collect Uniform and Scalar values for the given \p VF. 1418 /// The sets depend on CM decision for Load/Store instructions 1419 /// that may be vectorized as interleave, gather-scatter or scalarized. 1420 void collectUniformsAndScalars(ElementCount VF) { 1421 // Do the analysis once. 1422 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1423 return; 1424 setCostBasedWideningDecision(VF); 1425 collectLoopUniforms(VF); 1426 collectLoopScalars(VF); 1427 } 1428 1429 /// Returns true if the target machine supports masked store operation 1430 /// for the given \p DataType and kind of access to \p Ptr. 1431 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1432 return Legal->isConsecutivePtr(DataType, Ptr) && 1433 TTI.isLegalMaskedStore(DataType, Alignment); 1434 } 1435 1436 /// Returns true if the target machine supports masked load operation 1437 /// for the given \p DataType and kind of access to \p Ptr. 1438 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1439 return Legal->isConsecutivePtr(DataType, Ptr) && 1440 TTI.isLegalMaskedLoad(DataType, Alignment); 1441 } 1442 1443 /// Returns true if the target machine can represent \p V as a masked gather 1444 /// or scatter operation. 1445 bool isLegalGatherOrScatter(Value *V, 1446 ElementCount VF = ElementCount::getFixed(1)) { 1447 bool LI = isa<LoadInst>(V); 1448 bool SI = isa<StoreInst>(V); 1449 if (!LI && !SI) 1450 return false; 1451 auto *Ty = getLoadStoreType(V); 1452 Align Align = getLoadStoreAlignment(V); 1453 if (VF.isVector()) 1454 Ty = VectorType::get(Ty, VF); 1455 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1456 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1457 } 1458 1459 /// Returns true if the target machine supports all of the reduction 1460 /// variables found for the given VF. 1461 bool canVectorizeReductions(ElementCount VF) const { 1462 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1463 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1464 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1465 })); 1466 } 1467 1468 /// Given costs for both strategies, return true if the scalar predication 1469 /// lowering should be used for div/rem. This incorporates an override 1470 /// option so it is not simply a cost comparison. 1471 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1472 InstructionCost SafeDivisorCost) const { 1473 switch (ForceSafeDivisor) { 1474 case cl::BOU_UNSET: 1475 return ScalarCost < SafeDivisorCost; 1476 case cl::BOU_TRUE: 1477 return false; 1478 case cl::BOU_FALSE: 1479 return true; 1480 }; 1481 llvm_unreachable("impossible case value"); 1482 } 1483 1484 /// Returns true if \p I is an instruction which requires predication and 1485 /// for which our chosen predication strategy is scalarization (i.e. we 1486 /// don't have an alternate strategy such as masking available). 1487 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1488 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1489 1490 /// Returns true if \p I is an instruction that needs to be predicated 1491 /// at runtime. The result is independent of the predication mechanism. 1492 /// Superset of instructions that return true for isScalarWithPredication. 
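/// E.g. (illustrative):
///   for (i = 0; i < n; ++i)
///     if (b[i] != 0)
///       a[i] = a[i] / b[i]; // the division must be predicated at runtime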
1493 bool isPredicatedInst(Instruction *I) const;
1494
1495 /// Return the costs for our two available strategies for lowering a
1496 /// div/rem operation which requires speculating at least one lane.
1497 /// First result is for scalarization (will be invalid for scalable
1498 /// vectors); second is for the safe-divisor strategy.
1499 std::pair<InstructionCost, InstructionCost>
1500 getDivRemSpeculationCost(Instruction *I,
1501 ElementCount VF) const;
1502
1503 /// Returns true if \p I is a memory instruction with consecutive memory
1504 /// access that can be widened.
1505 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1506
1507 /// Returns true if \p I is a memory instruction in an interleaved-group
1508 /// of memory accesses that can be vectorized with wide vector loads/stores
1509 /// and shuffles.
1510 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1511
1512 /// Check if \p Instr belongs to any interleaved access group.
1513 bool isAccessInterleaved(Instruction *Instr) {
1514 return InterleaveInfo.isInterleaved(Instr);
1515 }
1516
1517 /// Get the interleaved access group that \p Instr belongs to.
1518 const InterleaveGroup<Instruction> *
1519 getInterleavedAccessGroup(Instruction *Instr) {
1520 return InterleaveInfo.getInterleaveGroup(Instr);
1521 }
1522
1523 /// Returns true if we're required to use a scalar epilogue for at least
1524 /// the final iteration of the original loop.
1525 bool requiresScalarEpilogue(ElementCount VF) const {
1526 if (!isScalarEpilogueAllowed())
1527 return false;
1528 // If we might exit from anywhere but the latch, we must run the exiting
1529 // iteration in scalar form.
1530 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1531 return true;
1532 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1533 }
1534
1535 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1536 /// loop hint annotation.
1537 bool isScalarEpilogueAllowed() const {
1538 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1539 }
1540
1541 /// Returns true if all loop blocks should be masked to fold the loop tail.
1542 bool foldTailByMasking() const { return FoldTailByMasking; }
1543
1544 /// Returns true if we're tail-folding and want to use the active lane mask
1545 /// for vector loop control flow.
1546 bool useActiveLaneMaskForControlFlow() const {
1547 return FoldTailByMasking &&
1548 TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1549 }
1550
1551 /// Returns true if the instructions in this block require predication
1552 /// for any reason, e.g. because tail folding now requires a predicate
1553 /// or because the block in the original loop was predicated.
1554 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1555 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1556 }
1557
1558 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1559 /// nodes to the chain of instructions representing the reductions. Uses a
1560 /// MapVector to ensure deterministic iteration order.
1561 using ReductionChainMap =
1562 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1563
1564 /// Return the chain of instructions representing an inloop reduction.
1565 const ReductionChainMap &getInLoopReductionChains() const {
1566 return InLoopReductionChains;
1567 }
1568
1569 /// Returns true if the Phi is part of an inloop reduction.
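/// E.g. (illustrative) for
///   for (i = 0; i < n; ++i) sum += a[i];
/// the phi feeding 'sum' is an in-loop reduction if collectInLoopReductions
/// chose to compute the reduction inside the loop rather than with a single
/// reduction after it.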
1570 bool isInLoopReduction(PHINode *Phi) const { 1571 return InLoopReductionChains.count(Phi); 1572 } 1573 1574 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1575 /// with factor VF. Return the cost of the instruction, including 1576 /// scalarization overhead if it's needed. 1577 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1578 1579 /// Estimate cost of a call instruction CI if it were vectorized with factor 1580 /// VF. Return the cost of the instruction, including scalarization overhead 1581 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1582 /// scalarized - 1583 /// i.e. either vector version isn't available, or is too expensive. 1584 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1585 bool &NeedToScalarize) const; 1586 1587 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1588 /// that of B. 1589 bool isMoreProfitable(const VectorizationFactor &A, 1590 const VectorizationFactor &B) const; 1591 1592 /// Invalidates decisions already taken by the cost model. 1593 void invalidateCostModelingDecisions() { 1594 WideningDecisions.clear(); 1595 Uniforms.clear(); 1596 Scalars.clear(); 1597 } 1598 1599 /// Convenience function that returns the value of vscale_range iff 1600 /// vscale_range.min == vscale_range.max or otherwise returns the value 1601 /// returned by the corresponding TLI method. 1602 std::optional<unsigned> getVScaleForTuning() const; 1603 1604 private: 1605 unsigned NumPredStores = 0; 1606 1607 /// \return An upper bound for the vectorization factors for both 1608 /// fixed and scalable vectorization, where the minimum-known number of 1609 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1610 /// disabled or unsupported, then the scalable part will be equal to 1611 /// ElementCount::getScalable(0). 1612 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1613 ElementCount UserVF, 1614 bool FoldTailByMasking); 1615 1616 /// \return the maximized element count based on the targets vector 1617 /// registers and the loop trip-count, but limited to a maximum safe VF. 1618 /// This is a helper function of computeFeasibleMaxVF. 1619 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1620 unsigned SmallestType, 1621 unsigned WidestType, 1622 ElementCount MaxSafeVF, 1623 bool FoldTailByMasking); 1624 1625 /// \return the maximum legal scalable VF, based on the safe max number 1626 /// of elements. 1627 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1628 1629 /// The vectorization cost is a combination of the cost itself and a boolean 1630 /// indicating whether any of the contributing operations will actually 1631 /// operate on vector values after type legalization in the backend. If this 1632 /// latter value is false, then all operations will be scalarized (i.e. no 1633 /// vectorization has actually taken place). 1634 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1635 1636 /// Returns the expected execution cost. The unit of the cost does 1637 /// not matter because we use the 'cost' units to compare different 1638 /// vector widths. The cost that is returned is *not* normalized by 1639 /// the factor width. If \p Invalid is not nullptr, this function 1640 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1641 /// each instruction that has an Invalid cost for the given VF. 
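/// E.g. (illustrative) the planner may compare
///   expectedCost(ElementCount::getFixed(4)).first / 4
/// against
///   expectedCost(ElementCount::getFixed(8)).first / 8
/// to decide which width is more profitable; only relative values matter.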
1642 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1643 VectorizationCostTy
1644 expectedCost(ElementCount VF,
1645 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1646
1647 /// Returns the execution time cost of an instruction for a given vector
1648 /// width. Vector width of one means scalar.
1649 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1650
1651 /// The cost-computation logic from getInstructionCost which provides
1652 /// the vector type as an output parameter.
1653 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1654 Type *&VectorTy);
1655
1656 /// Return the cost of instructions in an inloop reduction pattern, if \p I is
1657 /// part of that pattern.
1658 std::optional<InstructionCost>
1659 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1660 TTI::TargetCostKind CostKind);
1661
1662 /// Calculate vectorization cost of memory instruction \p I.
1663 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1664
1665 /// The cost computation for a scalarized memory instruction.
1666 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1667
1668 /// The cost computation for an interleaving group of memory instructions.
1669 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1670
1671 /// The cost computation for a Gather/Scatter instruction.
1672 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1673
1674 /// The cost computation for widening instruction \p I with consecutive
1675 /// memory access.
1676 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1677
1678 /// The cost calculation for Load/Store instruction \p I with a uniform pointer:
1679 /// Load: scalar load + broadcast.
1680 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1681 /// element).
1682 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1683
1684 /// Estimate the overhead of scalarizing an instruction. This is a
1685 /// convenience wrapper for the type-based getScalarizationOverhead API.
1686 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1687 TTI::TargetCostKind CostKind) const;
1688
1689 /// Returns true if an artificially high cost for emulated masked memrefs
1690 /// should be used.
1691 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1692
1693 /// Map of scalar integer values to the smallest bitwidth they can be legally
1694 /// represented as. The vector equivalents of these values should be truncated
1695 /// to this type.
1696 MapVector<Instruction *, uint64_t> MinBWs;
1697
1698 /// A type representing the costs for instructions if they were to be
1699 /// scalarized rather than vectorized. The entries are Instruction-Cost
1700 /// pairs.
1701 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1702
1703 /// Maps each VF to the set of BasicBlocks that are known to be present after
1704 /// vectorization as predicated blocks.
1705 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1706 PredicatedBBsAfterVectorization;
1707
1708 /// Records whether it is allowed to have the original scalar loop execute at
1709 /// least once. This may be needed as a fallback loop in case runtime
1710 /// aliasing/dependence checks fail, or to handle the tail/remainder
1711 /// iterations when the trip count is unknown or doesn't divide by the VF,
1712 /// or as a peel-loop to handle gaps in interleave-groups.
1713 /// Under optsize and when the trip count is very small we don't allow any 1714 /// iterations to execute in the scalar loop. 1715 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1716 1717 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1718 bool FoldTailByMasking = false; 1719 1720 /// A map holding scalar costs for different vectorization factors. The 1721 /// presence of a cost for an instruction in the mapping indicates that the 1722 /// instruction will be scalarized when vectorizing with the associated 1723 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1724 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1725 1726 /// Holds the instructions known to be uniform after vectorization. 1727 /// The data is collected per VF. 1728 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1729 1730 /// Holds the instructions known to be scalar after vectorization. 1731 /// The data is collected per VF. 1732 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1733 1734 /// Holds the instructions (address computations) that are forced to be 1735 /// scalarized. 1736 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1737 1738 /// PHINodes of the reductions that should be expanded in-loop along with 1739 /// their associated chains of reduction operations, in program order from top 1740 /// (PHI) to bottom 1741 ReductionChainMap InLoopReductionChains; 1742 1743 /// A Map of inloop reduction operations and their immediate chain operand. 1744 /// FIXME: This can be removed once reductions can be costed correctly in 1745 /// vplan. This was added to allow quick lookup to the inloop operations, 1746 /// without having to loop through InLoopReductionChains. 1747 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1748 1749 /// Returns the expected difference in cost from scalarizing the expression 1750 /// feeding a predicated instruction \p PredInst. The instructions to 1751 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1752 /// non-negative return value implies the expression will be scalarized. 1753 /// Currently, only single-use chains are considered for scalarization. 1754 InstructionCost computePredInstDiscount(Instruction *PredInst, 1755 ScalarCostsTy &ScalarCosts, 1756 ElementCount VF); 1757 1758 /// Collect the instructions that are uniform after vectorization. An 1759 /// instruction is uniform if we represent it with a single scalar value in 1760 /// the vectorized loop corresponding to each vector iteration. Examples of 1761 /// uniform instructions include pointer operands of consecutive or 1762 /// interleaved memory accesses. Note that although uniformity implies an 1763 /// instruction will be scalar, the reverse is not true. In general, a 1764 /// scalarized instruction will be represented by VF scalar values in the 1765 /// vectorized loop, each corresponding to an iteration of the original 1766 /// scalar loop. 1767 void collectLoopUniforms(ElementCount VF); 1768 1769 /// Collect the instructions that are scalar after vectorization. An 1770 /// instruction is scalar if it is known to be uniform or will be scalarized 1771 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1772 /// to the list if they are used by a load/store instruction that is marked as 1773 /// CM_Scalarize. 
Non-uniform scalarized instructions will be represented by 1774 /// VF values in the vectorized loop, each corresponding to an iteration of 1775 /// the original scalar loop. 1776 void collectLoopScalars(ElementCount VF); 1777 1778 /// Keeps cost model vectorization decision and cost for instructions. 1779 /// Right now it is used for memory instructions only. 1780 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1781 std::pair<InstWidening, InstructionCost>>; 1782 1783 DecisionList WideningDecisions; 1784 1785 /// Returns true if \p V is expected to be vectorized and it needs to be 1786 /// extracted. 1787 bool needsExtract(Value *V, ElementCount VF) const { 1788 Instruction *I = dyn_cast<Instruction>(V); 1789 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1790 TheLoop->isLoopInvariant(I)) 1791 return false; 1792 1793 // Assume we can vectorize V (and hence we need extraction) if the 1794 // scalars are not computed yet. This can happen, because it is called 1795 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1796 // the scalars are collected. That should be a safe assumption in most 1797 // cases, because we check if the operands have vectorizable types 1798 // beforehand in LoopVectorizationLegality. 1799 return Scalars.find(VF) == Scalars.end() || 1800 !isScalarAfterVectorization(I, VF); 1801 }; 1802 1803 /// Returns a range containing only operands needing to be extracted. 1804 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1805 ElementCount VF) const { 1806 return SmallVector<Value *, 4>(make_filter_range( 1807 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1808 } 1809 1810 /// Determines if we have the infrastructure to vectorize loop \p L and its 1811 /// epilogue, assuming the main loop is vectorized by \p VF. 1812 bool isCandidateForEpilogueVectorization(const Loop &L, 1813 const ElementCount VF) const; 1814 1815 /// Returns true if epilogue vectorization is considered profitable, and 1816 /// false otherwise. 1817 /// \p VF is the vectorization factor chosen for the original loop. 1818 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1819 1820 public: 1821 /// The loop that we evaluate. 1822 Loop *TheLoop; 1823 1824 /// Predicated scalar evolution analysis. 1825 PredicatedScalarEvolution &PSE; 1826 1827 /// Loop Info analysis. 1828 LoopInfo *LI; 1829 1830 /// Vectorization legality. 1831 LoopVectorizationLegality *Legal; 1832 1833 /// Vector target information. 1834 const TargetTransformInfo &TTI; 1835 1836 /// Target Library Info. 1837 const TargetLibraryInfo *TLI; 1838 1839 /// Demanded bits analysis. 1840 DemandedBits *DB; 1841 1842 /// Assumption cache. 1843 AssumptionCache *AC; 1844 1845 /// Interface to emit optimization remarks. 1846 OptimizationRemarkEmitter *ORE; 1847 1848 const Function *TheFunction; 1849 1850 /// Loop Vectorize Hint. 1851 const LoopVectorizeHints *Hints; 1852 1853 /// The interleave access information contains groups of interleaved accesses 1854 /// with the same stride and close to each other. 1855 InterleavedAccessInfo &InterleaveInfo; 1856 1857 /// Values to ignore in the cost model. 1858 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1859 1860 /// Values to ignore in the cost model when VF > 1. 1861 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1862 1863 /// All element types found in the loop. 1864 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1865 1866 /// Profitable vector factors. 
1867 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1868 };
1869 } // end namespace llvm
1870
1871 namespace {
1872 /// Helper struct to manage generating runtime checks for vectorization.
1873 ///
1874 /// The runtime checks are created up-front in temporary blocks, un-linked from
1875 /// the existing IR, to allow a better estimate of their cost. After deciding to
1876 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1877 /// temporary blocks are completely removed.
1878 class GeneratedRTChecks {
1879 /// Basic block which contains the generated SCEV checks, if any.
1880 BasicBlock *SCEVCheckBlock = nullptr;
1881
1882 /// The value representing the result of the generated SCEV checks. If it is
1883 /// nullptr, either no SCEV checks have been generated or they have been used.
1884 Value *SCEVCheckCond = nullptr;
1885
1886 /// Basic block which contains the generated memory runtime checks, if any.
1887 BasicBlock *MemCheckBlock = nullptr;
1888
1889 /// The value representing the result of the generated memory runtime checks.
1890 /// If it is nullptr, either no memory runtime checks have been generated or
1891 /// they have been used.
1892 Value *MemRuntimeCheckCond = nullptr;
1893
1894 DominatorTree *DT;
1895 LoopInfo *LI;
1896 TargetTransformInfo *TTI;
1897
1898 SCEVExpander SCEVExp;
1899 SCEVExpander MemCheckExp;
1900
1901 bool CostTooHigh = false;
1902
1903 public:
1904 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1905 TargetTransformInfo *TTI, const DataLayout &DL)
1906 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1907 MemCheckExp(SE, DL, "scev.check") {}
1908
1909 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1910 /// accurately estimate the cost of the runtime checks. The blocks are
1911 /// un-linked from the IR and are added back during vector code generation. If
1912 /// there is no vector code generation, the check blocks are removed
1913 /// completely.
1914 void Create(Loop *L, const LoopAccessInfo &LAI,
1915 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1916
1917 // Hard cutoff to limit compile-time increase in case a very large number of
1918 // runtime checks needs to be generated.
1919 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1920 // profile info.
1921 CostTooHigh =
1922 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1923 if (CostTooHigh)
1924 return;
1925
1926 BasicBlock *LoopHeader = L->getHeader();
1927 BasicBlock *Preheader = L->getLoopPreheader();
1928
1929 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1930 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1931 // may be used by SCEVExpander. The blocks will be un-linked from their
1932 // predecessors and removed from LI & DT at the end of the function.
1933 if (!UnionPred.isAlwaysTrue()) {
1934 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1935 nullptr, "vector.scevcheck");
1936
1937 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1938 &UnionPred, SCEVCheckBlock->getTerminator());
1939 }
1940
1941 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1942 if (RtPtrChecking.Need) {
1943 auto *Pred = SCEVCheckBlock ?
SCEVCheckBlock : Preheader; 1944 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1945 "vector.memcheck"); 1946 1947 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1948 if (DiffChecks) { 1949 Value *RuntimeVF = nullptr; 1950 MemRuntimeCheckCond = addDiffRuntimeChecks( 1951 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 1952 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 1953 if (!RuntimeVF) 1954 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 1955 return RuntimeVF; 1956 }, 1957 IC); 1958 } else { 1959 MemRuntimeCheckCond = 1960 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1961 RtPtrChecking.getChecks(), MemCheckExp); 1962 } 1963 assert(MemRuntimeCheckCond && 1964 "no RT checks generated although RtPtrChecking " 1965 "claimed checks are required"); 1966 } 1967 1968 if (!MemCheckBlock && !SCEVCheckBlock) 1969 return; 1970 1971 // Unhook the temporary block with the checks, update various places 1972 // accordingly. 1973 if (SCEVCheckBlock) 1974 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1975 if (MemCheckBlock) 1976 MemCheckBlock->replaceAllUsesWith(Preheader); 1977 1978 if (SCEVCheckBlock) { 1979 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1980 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1981 Preheader->getTerminator()->eraseFromParent(); 1982 } 1983 if (MemCheckBlock) { 1984 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1985 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1986 Preheader->getTerminator()->eraseFromParent(); 1987 } 1988 1989 DT->changeImmediateDominator(LoopHeader, Preheader); 1990 if (MemCheckBlock) { 1991 DT->eraseNode(MemCheckBlock); 1992 LI->removeBlock(MemCheckBlock); 1993 } 1994 if (SCEVCheckBlock) { 1995 DT->eraseNode(SCEVCheckBlock); 1996 LI->removeBlock(SCEVCheckBlock); 1997 } 1998 } 1999 2000 InstructionCost getCost() { 2001 if (SCEVCheckBlock || MemCheckBlock) 2002 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 2003 2004 if (CostTooHigh) { 2005 InstructionCost Cost; 2006 Cost.setInvalid(); 2007 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 2008 return Cost; 2009 } 2010 2011 InstructionCost RTCheckCost = 0; 2012 if (SCEVCheckBlock) 2013 for (Instruction &I : *SCEVCheckBlock) { 2014 if (SCEVCheckBlock->getTerminator() == &I) 2015 continue; 2016 InstructionCost C = 2017 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2018 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2019 RTCheckCost += C; 2020 } 2021 if (MemCheckBlock) 2022 for (Instruction &I : *MemCheckBlock) { 2023 if (MemCheckBlock->getTerminator() == &I) 2024 continue; 2025 InstructionCost C = 2026 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2027 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2028 RTCheckCost += C; 2029 } 2030 2031 if (SCEVCheckBlock || MemCheckBlock) 2032 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2033 << "\n"); 2034 2035 return RTCheckCost; 2036 } 2037 2038 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2039 /// unused. 
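/// E.g. (illustrative) if the vectorizer bails out after costing, the
/// temporary blocks created by Create() above are erased here instead of
/// being left behind, unreachable, in the function.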
2040 ~GeneratedRTChecks() { 2041 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2042 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2043 if (!SCEVCheckCond) 2044 SCEVCleaner.markResultUsed(); 2045 2046 if (!MemRuntimeCheckCond) 2047 MemCheckCleaner.markResultUsed(); 2048 2049 if (MemRuntimeCheckCond) { 2050 auto &SE = *MemCheckExp.getSE(); 2051 // Memory runtime check generation creates compares that use expanded 2052 // values. Remove them before running the SCEVExpanderCleaners. 2053 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2054 if (MemCheckExp.isInsertedInstruction(&I)) 2055 continue; 2056 SE.forgetValue(&I); 2057 I.eraseFromParent(); 2058 } 2059 } 2060 MemCheckCleaner.cleanup(); 2061 SCEVCleaner.cleanup(); 2062 2063 if (SCEVCheckCond) 2064 SCEVCheckBlock->eraseFromParent(); 2065 if (MemRuntimeCheckCond) 2066 MemCheckBlock->eraseFromParent(); 2067 } 2068 2069 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2070 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2071 /// depending on the generated condition. 2072 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2073 BasicBlock *LoopVectorPreHeader, 2074 BasicBlock *LoopExitBlock) { 2075 if (!SCEVCheckCond) 2076 return nullptr; 2077 2078 Value *Cond = SCEVCheckCond; 2079 // Mark the check as used, to prevent it from being removed during cleanup. 2080 SCEVCheckCond = nullptr; 2081 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2082 if (C->isZero()) 2083 return nullptr; 2084 2085 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2086 2087 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2088 // Create new preheader for vector loop. 2089 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2090 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2091 2092 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2093 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2094 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2095 SCEVCheckBlock); 2096 2097 DT->addNewBlock(SCEVCheckBlock, Pred); 2098 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2099 2100 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2101 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2102 return SCEVCheckBlock; 2103 } 2104 2105 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2106 /// the branches to branch to the vector preheader or \p Bypass, depending on 2107 /// the generated condition. 2108 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2109 BasicBlock *LoopVectorPreHeader) { 2110 // Check if we generated code that checks in runtime if arrays overlap. 2111 if (!MemRuntimeCheckCond) 2112 return nullptr; 2113 2114 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2115 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2116 MemCheckBlock); 2117 2118 DT->addNewBlock(MemCheckBlock, Pred); 2119 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2120 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2121 2122 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2123 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2124 2125 ReplaceInstWithInst( 2126 MemCheckBlock->getTerminator(), 2127 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2128 MemCheckBlock->getTerminator()->setDebugLoc( 2129 Pred->getTerminator()->getDebugLoc()); 2130 2131 // Mark the check as used, to prevent it from being removed during cleanup. 
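// (The destructor only erases MemCheckBlock while MemRuntimeCheckCond is
// still set, so clearing it here keeps the emitted checks alive.)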
2132 MemRuntimeCheckCond = nullptr; 2133 return MemCheckBlock; 2134 } 2135 }; 2136 } // namespace 2137 2138 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2139 // vectorization. The loop needs to be annotated with #pragma omp simd 2140 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2141 // vector length information is not provided, vectorization is not considered 2142 // explicit. Interleave hints are not allowed either. These limitations will be 2143 // relaxed in the future. 2144 // Please, note that we are currently forced to abuse the pragma 'clang 2145 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2146 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2147 // provides *explicit vectorization hints* (LV can bypass legal checks and 2148 // assume that vectorization is legal). However, both hints are implemented 2149 // using the same metadata (llvm.loop.vectorize, processed by 2150 // LoopVectorizeHints). This will be fixed in the future when the native IR 2151 // representation for pragma 'omp simd' is introduced. 2152 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2153 OptimizationRemarkEmitter *ORE) { 2154 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2155 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2156 2157 // Only outer loops with an explicit vectorization hint are supported. 2158 // Unannotated outer loops are ignored. 2159 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2160 return false; 2161 2162 Function *Fn = OuterLp->getHeader()->getParent(); 2163 if (!Hints.allowVectorization(Fn, OuterLp, 2164 true /*VectorizeOnlyWhenForced*/)) { 2165 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2166 return false; 2167 } 2168 2169 if (Hints.getInterleave() > 1) { 2170 // TODO: Interleave support is future work. 2171 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2172 "outer loops.\n"); 2173 Hints.emitRemarkWithHints(); 2174 return false; 2175 } 2176 2177 return true; 2178 } 2179 2180 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2181 OptimizationRemarkEmitter *ORE, 2182 SmallVectorImpl<Loop *> &V) { 2183 // Collect inner loops and outer loops without irreducible control flow. For 2184 // now, only collect outer loops that have explicit vectorization hints. If we 2185 // are stress testing the VPlan H-CFG construction, we collect the outermost 2186 // loop of every loop nest. 2187 if (L.isInnermost() || VPlanBuildStressTest || 2188 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2189 LoopBlocksRPO RPOT(&L); 2190 RPOT.perform(LI); 2191 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2192 V.push_back(&L); 2193 // TODO: Collect inner loops inside marked outer loops in case 2194 // vectorization fails for the outer loop. Do not invoke 2195 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2196 // already known to be reducible. We can use an inherited attribute for 2197 // that. 2198 return; 2199 } 2200 } 2201 for (Loop *InnerL : L) 2202 collectSupportedLoops(*InnerL, LI, ORE, V); 2203 } 2204 2205 namespace { 2206 2207 /// The LoopVectorize Pass. 
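/// Legacy pass-manager wrapper; the actual work is done by the
/// LoopVectorizePass instance held in Impl.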
2208 struct LoopVectorize : public FunctionPass { 2209 /// Pass identification, replacement for typeid 2210 static char ID; 2211 2212 LoopVectorizePass Impl; 2213 2214 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2215 bool VectorizeOnlyWhenForced = false) 2216 : FunctionPass(ID), 2217 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2218 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2219 } 2220 2221 bool runOnFunction(Function &F) override { 2222 if (skipFunction(F)) 2223 return false; 2224 2225 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2226 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2227 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2228 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2229 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2230 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2231 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2232 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2233 auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs(); 2234 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2235 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2236 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2237 2238 return Impl 2239 .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI) 2240 .MadeAnyChange; 2241 } 2242 2243 void getAnalysisUsage(AnalysisUsage &AU) const override { 2244 AU.addRequired<AssumptionCacheTracker>(); 2245 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2246 AU.addRequired<DominatorTreeWrapperPass>(); 2247 AU.addRequired<LoopInfoWrapperPass>(); 2248 AU.addRequired<ScalarEvolutionWrapperPass>(); 2249 AU.addRequired<TargetTransformInfoWrapperPass>(); 2250 AU.addRequired<LoopAccessLegacyAnalysis>(); 2251 AU.addRequired<DemandedBitsWrapperPass>(); 2252 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2253 AU.addRequired<InjectTLIMappingsLegacy>(); 2254 2255 // We currently do not preserve loopinfo/dominator analyses with outer loop 2256 // vectorization. Until this is addressed, mark these analyses as preserved 2257 // only for non-VPlan-native path. 2258 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2259 if (!EnableVPlanNativePath) { 2260 AU.addPreserved<LoopInfoWrapperPass>(); 2261 AU.addPreserved<DominatorTreeWrapperPass>(); 2262 } 2263 2264 AU.addPreserved<BasicAAWrapperPass>(); 2265 AU.addPreserved<GlobalsAAWrapperPass>(); 2266 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2267 } 2268 }; 2269 2270 } // end anonymous namespace 2271 2272 //===----------------------------------------------------------------------===// 2273 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2274 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2275 //===----------------------------------------------------------------------===// 2276 2277 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2278 // We need to place the broadcast of invariant variables outside the loop, 2279 // but only if it's proven safe to do so. Else, broadcast will be inside 2280 // vector loop body. 
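// E.g. (illustrative) for a loop-invariant i32 %x at VF=4, the splat created
// below is roughly:
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i64 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer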
2281 Instruction *Instr = dyn_cast<Instruction>(V); 2282 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2283 (!Instr || 2284 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2285 // Place the code for broadcasting invariant variables in the new preheader. 2286 IRBuilder<>::InsertPointGuard Guard(Builder); 2287 if (SafeToHoist) 2288 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2289 2290 // Broadcast the scalar into all locations in the vector. 2291 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2292 2293 return Shuf; 2294 } 2295 2296 /// This function adds 2297 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2298 /// to each vector element of Val. The sequence starts at StartIndex. 2299 /// \p Opcode is relevant for FP induction variable. 2300 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2301 Instruction::BinaryOps BinOp, ElementCount VF, 2302 IRBuilderBase &Builder) { 2303 assert(VF.isVector() && "only vector VFs are supported"); 2304 2305 // Create and check the types. 2306 auto *ValVTy = cast<VectorType>(Val->getType()); 2307 ElementCount VLen = ValVTy->getElementCount(); 2308 2309 Type *STy = Val->getType()->getScalarType(); 2310 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2311 "Induction Step must be an integer or FP"); 2312 assert(Step->getType() == STy && "Step has wrong type"); 2313 2314 SmallVector<Constant *, 8> Indices; 2315 2316 // Create a vector of consecutive numbers from zero to VF. 2317 VectorType *InitVecValVTy = ValVTy; 2318 if (STy->isFloatingPointTy()) { 2319 Type *InitVecValSTy = 2320 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2321 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2322 } 2323 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2324 2325 // Splat the StartIdx 2326 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2327 2328 if (STy->isIntegerTy()) { 2329 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2330 Step = Builder.CreateVectorSplat(VLen, Step); 2331 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2332 // FIXME: The newly created binary instructions should contain nsw/nuw 2333 // flags, which can be found from the original scalar operations. 2334 Step = Builder.CreateMul(InitVec, Step); 2335 return Builder.CreateAdd(Val, Step, "induction"); 2336 } 2337 2338 // Floating point induction. 2339 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2340 "Binary Opcode should be specified for FP induction"); 2341 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2342 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2343 2344 Step = Builder.CreateVectorSplat(VLen, Step); 2345 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2346 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2347 } 2348 2349 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2350 /// variable on which to base the steps, \p Step is the size of the step. 2351 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2352 const InductionDescriptor &ID, VPValue *Def, 2353 VPTransformState &State) { 2354 IRBuilderBase &Builder = State.Builder; 2355 2356 // Ensure step has the same type as that of scalar IV. 2357 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2358 if (ScalarIVTy != Step->getType()) { 2359 // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to 2360 // avoid separate truncate here. 
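// E.g. (illustrative) an i64 step is truncated below to match an i32
// scalar IV.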
2361 assert(Step->getType()->isIntegerTy() && 2362 "Truncation requires an integer step"); 2363 Step = State.Builder.CreateTrunc(Step, ScalarIVTy); 2364 } 2365 2366 // We build scalar steps for both integer and floating-point induction 2367 // variables. Here, we determine the kind of arithmetic we will perform. 2368 Instruction::BinaryOps AddOp; 2369 Instruction::BinaryOps MulOp; 2370 if (ScalarIVTy->isIntegerTy()) { 2371 AddOp = Instruction::Add; 2372 MulOp = Instruction::Mul; 2373 } else { 2374 AddOp = ID.getInductionOpcode(); 2375 MulOp = Instruction::FMul; 2376 } 2377 2378 // Determine the number of scalars we need to generate for each unroll 2379 // iteration. 2380 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2381 // Compute the scalar steps and save the results in State. 2382 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2383 ScalarIVTy->getScalarSizeInBits()); 2384 Type *VecIVTy = nullptr; 2385 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2386 if (!FirstLaneOnly && State.VF.isScalable()) { 2387 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2388 UnitStepVec = 2389 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2390 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2391 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2392 } 2393 2394 unsigned StartPart = 0; 2395 unsigned EndPart = State.UF; 2396 unsigned StartLane = 0; 2397 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2398 if (State.Instance) { 2399 StartPart = State.Instance->Part; 2400 EndPart = StartPart + 1; 2401 StartLane = State.Instance->Lane.getKnownLane(); 2402 EndLane = StartLane + 1; 2403 } 2404 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 2405 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2406 2407 if (!FirstLaneOnly && State.VF.isScalable()) { 2408 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2409 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2410 if (ScalarIVTy->isFloatingPointTy()) 2411 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2412 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2413 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2414 State.set(Def, Add, Part); 2415 // It's useful to record the lane values too for the known minimum number 2416 // of elements so we do those below. This improves the code quality when 2417 // trying to extract the first element, for example. 2418 } 2419 2420 if (ScalarIVTy->isFloatingPointTy()) 2421 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2422 2423 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 2424 Value *StartIdx = Builder.CreateBinOp( 2425 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2426 // The step returned by `createStepForVF` is a runtime-evaluated value 2427 // when VF is scalable. Otherwise, it should be folded into a Constant. 2428 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2429 "Expected StartIdx to be folded to a constant when VF is not " 2430 "scalable"); 2431 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2432 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2433 State.set(Def, Add, VPIteration(Part, Lane)); 2434 } 2435 } 2436 } 2437 2438 // Generate code for the induction step. 
Note that induction steps are 2439 // required to be loop-invariant 2440 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2441 Instruction *InsertBefore, 2442 Loop *OrigLoop = nullptr) { 2443 const DataLayout &DL = SE.getDataLayout(); 2444 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2445 "Induction step should be loop invariant"); 2446 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2447 return E->getValue(); 2448 2449 SCEVExpander Exp(SE, DL, "induction"); 2450 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2451 } 2452 2453 /// Compute the transformed value of Index at offset StartValue using step 2454 /// StepValue. 2455 /// For integer induction, returns StartValue + Index * StepValue. 2456 /// For pointer induction, returns StartValue[Index * StepValue]. 2457 /// FIXME: The newly created binary instructions should contain nsw/nuw 2458 /// flags, which can be found from the original scalar operations. 2459 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2460 Value *StartValue, Value *Step, 2461 const InductionDescriptor &ID) { 2462 Type *StepTy = Step->getType(); 2463 Value *CastedIndex = StepTy->isIntegerTy() 2464 ? B.CreateSExtOrTrunc(Index, StepTy) 2465 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2466 if (CastedIndex != Index) { 2467 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2468 Index = CastedIndex; 2469 } 2470 2471 // Note: the IR at this point is broken. We cannot use SE to create any new 2472 // SCEV and then expand it, hoping that SCEV's simplification will give us 2473 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2474 // lead to various SCEV crashes. So all we can do is to use builder and rely 2475 // on InstCombine for future simplifications. Here we handle some trivial 2476 // cases only. 2477 auto CreateAdd = [&B](Value *X, Value *Y) { 2478 assert(X->getType() == Y->getType() && "Types don't match!"); 2479 if (auto *CX = dyn_cast<ConstantInt>(X)) 2480 if (CX->isZero()) 2481 return Y; 2482 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2483 if (CY->isZero()) 2484 return X; 2485 return B.CreateAdd(X, Y); 2486 }; 2487 2488 // We allow X to be a vector type, in which case Y will potentially be 2489 // splatted into a vector with the same element count. 
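// E.g. (illustrative) with X of type <4 x i64> and Y a scalar i64 step,
// Y is splatted to <4 x i64> before the multiply.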
2490 auto CreateMul = [&B](Value *X, Value *Y) { 2491 assert(X->getType()->getScalarType() == Y->getType() && 2492 "Types don't match!"); 2493 if (auto *CX = dyn_cast<ConstantInt>(X)) 2494 if (CX->isOne()) 2495 return Y; 2496 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2497 if (CY->isOne()) 2498 return X; 2499 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2500 if (XVTy && !isa<VectorType>(Y->getType())) 2501 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2502 return B.CreateMul(X, Y); 2503 }; 2504 2505 switch (ID.getKind()) { 2506 case InductionDescriptor::IK_IntInduction: { 2507 assert(!isa<VectorType>(Index->getType()) && 2508 "Vector indices not supported for integer inductions yet"); 2509 assert(Index->getType() == StartValue->getType() && 2510 "Index type does not match StartValue type"); 2511 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2512 return B.CreateSub(StartValue, Index); 2513 auto *Offset = CreateMul(Index, Step); 2514 return CreateAdd(StartValue, Offset); 2515 } 2516 case InductionDescriptor::IK_PtrInduction: { 2517 assert(isa<Constant>(Step) && 2518 "Expected constant step for pointer induction"); 2519 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2520 } 2521 case InductionDescriptor::IK_FpInduction: { 2522 assert(!isa<VectorType>(Index->getType()) && 2523 "Vector indices not supported for FP inductions yet"); 2524 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2525 auto InductionBinOp = ID.getInductionBinOp(); 2526 assert(InductionBinOp && 2527 (InductionBinOp->getOpcode() == Instruction::FAdd || 2528 InductionBinOp->getOpcode() == Instruction::FSub) && 2529 "Original bin op should be defined for FP induction"); 2530 2531 Value *MulExp = B.CreateFMul(Step, Index); 2532 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2533 "induction"); 2534 } 2535 case InductionDescriptor::IK_NoInduction: 2536 return nullptr; 2537 } 2538 llvm_unreachable("invalid enum"); 2539 } 2540 2541 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2542 const VPIteration &Instance, 2543 VPTransformState &State) { 2544 Value *ScalarInst = State.get(Def, Instance); 2545 Value *VectorValue = State.get(Def, Instance.Part); 2546 VectorValue = Builder.CreateInsertElement( 2547 VectorValue, ScalarInst, 2548 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2549 State.set(Def, VectorValue, Instance.Part); 2550 } 2551 2552 // Return whether we allow using masked interleave-groups (for dealing with 2553 // strided loads/stores that reside in predicated blocks, or for dealing 2554 // with gaps). 2555 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2556 // If an override option has been passed in for interleaved accesses, use it. 2557 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2558 return EnableMaskedInterleavedMemAccesses; 2559 2560 return TTI.enableMaskedInterleavedAccessVectorization(); 2561 } 2562 2563 // Try to vectorize the interleave group that \p Instr belongs to. 2564 // 2565 // E.g. Translate following interleaved load group (factor = 3): 2566 // for (i = 0; i < N; i+=3) { 2567 // R = Pic[i]; // Member of index 0 2568 // G = Pic[i+1]; // Member of index 1 2569 // B = Pic[i+2]; // Member of index 2 2570 // ... 
// do something to R, G, B 2571 // } 2572 // To: 2573 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2574 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2575 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2576 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2577 // 2578 // Or translate following interleaved store group (factor = 3): 2579 // for (i = 0; i < N; i+=3) { 2580 // ... do something to R, G, B 2581 // Pic[i] = R; // Member of index 0 2582 // Pic[i+1] = G; // Member of index 1 2583 // Pic[i+2] = B; // Member of index 2 2584 // } 2585 // To: 2586 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2587 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2588 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2589 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2590 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2591 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2592 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2593 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2594 VPValue *BlockInMask) { 2595 Instruction *Instr = Group->getInsertPos(); 2596 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2597 2598 // Prepare for the vector type of the interleaved load/store. 2599 Type *ScalarTy = getLoadStoreType(Instr); 2600 unsigned InterleaveFactor = Group->getFactor(); 2601 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2602 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2603 2604 // Prepare for the new pointers. 2605 SmallVector<Value *, 2> AddrParts; 2606 unsigned Index = Group->getIndex(Instr); 2607 2608 // TODO: extend the masked interleaved-group support to reversed access. 2609 assert((!BlockInMask || !Group->isReverse()) && 2610 "Reversed masked interleave-group not supported."); 2611 2612 // If the group is reverse, adjust the index to refer to the last vector lane 2613 // instead of the first. We adjust the index from the first vector lane, 2614 // rather than directly getting the pointer for lane VF - 1, because the 2615 // pointer operand of the interleaved access is supposed to be uniform. For 2616 // uniform instructions, we're only required to generate a value for the 2617 // first vector lane in each unroll iteration. 2618 if (Group->isReverse()) 2619 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2620 2621 for (unsigned Part = 0; Part < UF; Part++) { 2622 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2623 State.setDebugLocFromInst(AddrPart); 2624 2625 // Notice current instruction could be any index. Need to adjust the address 2626 // to the member of index 0. 2627 // 2628 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2629 // b = A[i]; // Member of index 0 2630 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2631 // 2632 // E.g. A[i+1] = a; // Member of index 1 2633 // A[i] = b; // Member of index 0 2634 // A[i+2] = c; // Member of index 2 (Current instruction) 2635 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2636 2637 bool InBounds = false; 2638 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2639 InBounds = gep->isInBounds(); 2640 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2641 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2642 2643 // Cast to the vector pointer type. 
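// E.g. (illustrative) for VF=4 and an i32 group of factor 3, the pointer is
// cast to <12 x i32>*.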
2644 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2645 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2646 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2647 } 2648 2649 State.setDebugLocFromInst(Instr); 2650 Value *PoisonVec = PoisonValue::get(VecTy); 2651 2652 Value *MaskForGaps = nullptr; 2653 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2654 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2655 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2656 } 2657 2658 // Vectorize the interleaved load group. 2659 if (isa<LoadInst>(Instr)) { 2660 // For each unroll part, create a wide load for the group. 2661 SmallVector<Value *, 2> NewLoads; 2662 for (unsigned Part = 0; Part < UF; Part++) { 2663 Instruction *NewLoad; 2664 if (BlockInMask || MaskForGaps) { 2665 assert(useMaskedInterleavedAccesses(*TTI) && 2666 "masked interleaved groups are not allowed."); 2667 Value *GroupMask = MaskForGaps; 2668 if (BlockInMask) { 2669 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2670 Value *ShuffledMask = Builder.CreateShuffleVector( 2671 BlockInMaskPart, 2672 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2673 "interleaved.mask"); 2674 GroupMask = MaskForGaps 2675 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2676 MaskForGaps) 2677 : ShuffledMask; 2678 } 2679 NewLoad = 2680 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2681 GroupMask, PoisonVec, "wide.masked.vec"); 2682 } 2683 else 2684 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2685 Group->getAlign(), "wide.vec"); 2686 Group->addMetadata(NewLoad); 2687 NewLoads.push_back(NewLoad); 2688 } 2689 2690 // For each member in the group, shuffle out the appropriate data from the 2691 // wide loads. 2692 unsigned J = 0; 2693 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2694 Instruction *Member = Group->getMember(I); 2695 2696 // Skip the gaps in the group. 2697 if (!Member) 2698 continue; 2699 2700 auto StrideMask = 2701 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2702 for (unsigned Part = 0; Part < UF; Part++) { 2703 Value *StridedVec = Builder.CreateShuffleVector( 2704 NewLoads[Part], StrideMask, "strided.vec"); 2705 2706 // If this member has different type, cast the result type. 2707 if (Member->getType() != ScalarTy) { 2708 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2709 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2710 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2711 } 2712 2713 if (Group->isReverse()) 2714 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2715 2716 State.set(VPDefs[J], StridedVec, Part); 2717 } 2718 ++J; 2719 } 2720 return; 2721 } 2722 2723 // The sub vector type for current instruction. 2724 auto *SubVT = VectorType::get(ScalarTy, VF); 2725 2726 // Vectorize the interleaved store group. 2727 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2728 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2729 "masked interleaved groups are not allowed."); 2730 assert((!MaskForGaps || !VF.isScalable()) && 2731 "masking gaps for scalable vectors is not yet supported."); 2732 for (unsigned Part = 0; Part < UF; Part++) { 2733 // Collect the stored vector from each member. 
2734 SmallVector<Value *, 4> StoredVecs; 2735 unsigned StoredIdx = 0; 2736 for (unsigned i = 0; i < InterleaveFactor; i++) { 2737 assert((Group->getMember(i) || MaskForGaps) && 2738 "Fail to get a member from an interleaved store group"); 2739 Instruction *Member = Group->getMember(i); 2740 2741 // Skip the gaps in the group. 2742 if (!Member) { 2743 Value *Undef = PoisonValue::get(SubVT); 2744 StoredVecs.push_back(Undef); 2745 continue; 2746 } 2747 2748 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2749 ++StoredIdx; 2750 2751 if (Group->isReverse()) 2752 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2753 2754 // If this member has different type, cast it to a unified type. 2755 2756 if (StoredVec->getType() != SubVT) 2757 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2758 2759 StoredVecs.push_back(StoredVec); 2760 } 2761 2762 // Concatenate all vectors into a wide vector. 2763 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2764 2765 // Interleave the elements in the wide vector. 2766 Value *IVec = Builder.CreateShuffleVector( 2767 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2768 "interleaved.vec"); 2769 2770 Instruction *NewStoreInstr; 2771 if (BlockInMask || MaskForGaps) { 2772 Value *GroupMask = MaskForGaps; 2773 if (BlockInMask) { 2774 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2775 Value *ShuffledMask = Builder.CreateShuffleVector( 2776 BlockInMaskPart, 2777 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2778 "interleaved.mask"); 2779 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2780 ShuffledMask, MaskForGaps) 2781 : ShuffledMask; 2782 } 2783 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2784 Group->getAlign(), GroupMask); 2785 } else 2786 NewStoreInstr = 2787 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2788 2789 Group->addMetadata(NewStoreInstr); 2790 } 2791 } 2792 2793 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2794 VPReplicateRecipe *RepRecipe, 2795 const VPIteration &Instance, 2796 bool IfPredicateInstr, 2797 VPTransformState &State) { 2798 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2799 2800 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2801 // the first lane and part. 2802 if (isa<NoAliasScopeDeclInst>(Instr)) 2803 if (!Instance.isFirstIteration()) 2804 return; 2805 2806 // Does this instruction return a value ? 2807 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2808 2809 Instruction *Cloned = Instr->clone(); 2810 if (!IsVoidRetTy) 2811 Cloned->setName(Instr->getName() + ".cloned"); 2812 2813 // If the scalarized instruction contributes to the address computation of a 2814 // widen masked load/store which was in a basic block that needed predication 2815 // and is not predicated after vectorization, we can't propagate 2816 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2817 // instruction could feed a poison value to the base address of the widen 2818 // load/store. 2819 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2820 Cloned->dropPoisonGeneratingFlags(); 2821 2822 if (Instr->getDebugLoc()) 2823 State.setDebugLocFromInst(Instr); 2824 2825 // Replace the operands of the cloned instructions with their scalar 2826 // equivalents in the new loop. 
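// For illustration only (a sketch with a hypothetical replicated %gep): when
// generating the copy for (Part 1, Lane 2), an operand that is uniform after
// vectorization is read from (Part 1, Lane 0), while any other operand is read
// from (Part 1, Lane 2) itself.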
2827 for (const auto &I : enumerate(RepRecipe->operands())) { 2828 auto InputInstance = Instance; 2829 VPValue *Operand = I.value(); 2830 if (vputils::isUniformAfterVectorization(Operand)) 2831 InputInstance.Lane = VPLane::getFirstLane(); 2832 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2833 } 2834 State.addNewMetadata(Cloned, Instr); 2835 2836 // Place the cloned scalar in the new loop. 2837 State.Builder.Insert(Cloned); 2838 2839 State.set(RepRecipe, Cloned, Instance); 2840 2841 // If we just cloned a new assumption, add it the assumption cache. 2842 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2843 AC->registerAssumption(II); 2844 2845 // End if-block. 2846 if (IfPredicateInstr) 2847 PredicatedInstructions.push_back(Cloned); 2848 } 2849 2850 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2851 if (TripCount) 2852 return TripCount; 2853 2854 assert(InsertBlock); 2855 IRBuilder<> Builder(InsertBlock->getTerminator()); 2856 // Find the loop boundaries. 2857 Type *IdxTy = Legal->getWidestInductionType(); 2858 assert(IdxTy && "No type for induction"); 2859 const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE); 2860 2861 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2862 2863 // Expand the trip count and place the new instructions in the preheader. 2864 // Notice that the pre-header does not change, only the loop body. 2865 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2866 2867 // Count holds the overall loop count (N). 2868 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2869 InsertBlock->getTerminator()); 2870 2871 if (TripCount->getType()->isPointerTy()) 2872 TripCount = 2873 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2874 InsertBlock->getTerminator()); 2875 2876 return TripCount; 2877 } 2878 2879 Value * 2880 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2881 if (VectorTripCount) 2882 return VectorTripCount; 2883 2884 Value *TC = getOrCreateTripCount(InsertBlock); 2885 IRBuilder<> Builder(InsertBlock->getTerminator()); 2886 2887 Type *Ty = TC->getType(); 2888 // This is where we can make the step a runtime constant. 2889 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2890 2891 // If the tail is to be folded by masking, round the number of iterations N 2892 // up to a multiple of Step instead of rounding down. This is done by first 2893 // adding Step-1 and then rounding down. Note that it's ok if this addition 2894 // overflows: the vector induction variable will eventually wrap to zero given 2895 // that it starts at zero and its Step is a power of two; the loop will then 2896 // exit, with the last early-exit vector comparison also producing all-true. 2897 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2898 // is accounted for in emitIterationCountCheck that adds an overflow check. 2899 if (Cost->foldTailByMasking()) { 2900 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2901 "VF*UF must be a power of 2 when folding tail by masking"); 2902 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2903 TC = Builder.CreateAdd( 2904 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2905 } 2906 2907 // Now we need to generate the expression for the part of the loop that the 2908 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2909 // iterations are not required for correctness, or N - Step, otherwise. 
Step 2910 // is equal to the vectorization factor (number of SIMD elements) times the 2911 // unroll factor (number of SIMD instructions). 2912 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2913 2914 // There are cases where we *must* run at least one iteration in the remainder 2915 // loop. See the cost model for when this can happen. If the step evenly 2916 // divides the trip count, we set the remainder to be equal to the step. If 2917 // the step does not evenly divide the trip count, no adjustment is necessary 2918 // since there will already be scalar iterations. Note that the minimum 2919 // iterations check ensures that N >= Step. 2920 if (Cost->requiresScalarEpilogue(VF)) { 2921 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2922 R = Builder.CreateSelect(IsZero, Step, R); 2923 } 2924 2925 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2926 2927 return VectorTripCount; 2928 } 2929 2930 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2931 const DataLayout &DL) { 2932 // Verify that V is a vector type with same number of elements as DstVTy. 2933 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2934 unsigned VF = DstFVTy->getNumElements(); 2935 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2936 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2937 Type *SrcElemTy = SrcVecTy->getElementType(); 2938 Type *DstElemTy = DstFVTy->getElementType(); 2939 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2940 "Vector elements must have same size"); 2941 2942 // Do a direct cast if element types are castable. 2943 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2944 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2945 } 2946 // V cannot be directly casted to desired vector type. 2947 // May happen when V is a floating point vector but DstVTy is a vector of 2948 // pointers or vice-versa. Handle this using a two-step bitcast using an 2949 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2950 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2951 "Only one type should be a pointer type"); 2952 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2953 "Only one type should be a floating point type"); 2954 Type *IntTy = 2955 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2956 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2957 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2958 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2959 } 2960 2961 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2962 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2963 // Reuse existing vector loop preheader for TC checks. 2964 // Note that new preheader block is generated for vector loop. 2965 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2966 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2967 2968 // Generate code to check if the loop's trip count is less than VF * UF, or 2969 // equal to it in case a scalar epilogue is required; this implies that the 2970 // vector trip count is zero. This check also covers the case where adding one 2971 // to the backedge-taken count overflowed leading to an incorrect trip count 2972 // of zero. In this case we will also jump to the scalar loop. 2973 auto P = Cost->requiresScalarEpilogue(VF) ? 
ICmpInst::ICMP_ULE 2974 : ICmpInst::ICMP_ULT; 2975 2976 // If tail is to be folded, vector loop takes care of all iterations. 2977 Type *CountTy = Count->getType(); 2978 Value *CheckMinIters = Builder.getFalse(); 2979 auto CreateStep = [&]() -> Value * { 2980 // Create step with max(MinProTripCount, UF * VF). 2981 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2982 return createStepForVF(Builder, CountTy, VF, UF); 2983 2984 Value *MinProfTC = 2985 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2986 if (!VF.isScalable()) 2987 return MinProfTC; 2988 return Builder.CreateBinaryIntrinsic( 2989 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2990 }; 2991 2992 if (!Cost->foldTailByMasking()) 2993 CheckMinIters = 2994 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2995 else if (VF.isScalable()) { 2996 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2997 // an overflow to zero when updating induction variables and so an 2998 // additional overflow check is required before entering the vector loop. 2999 3000 // Get the maximum unsigned value for the type. 3001 Value *MaxUIntTripCount = 3002 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 3003 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 3004 3005 // Don't execute the vector loop if (UMax - n) < (VF * UF). 3006 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 3007 } 3008 3009 // Create new preheader for vector loop. 3010 LoopVectorPreHeader = 3011 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3012 "vector.ph"); 3013 3014 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3015 DT->getNode(Bypass)->getIDom()) && 3016 "TC check is expected to dominate Bypass"); 3017 3018 // Update dominator for Bypass & LoopExit (if needed). 3019 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3020 if (!Cost->requiresScalarEpilogue(VF)) 3021 // If there is an epilogue which must run, there's no edge from the 3022 // middle block to exit blocks and thus no need to update the immediate 3023 // dominator of the exit blocks. 3024 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3025 3026 ReplaceInstWithInst( 3027 TCCheckBlock->getTerminator(), 3028 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3029 LoopBypassBlocks.push_back(TCCheckBlock); 3030 } 3031 3032 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 3033 BasicBlock *const SCEVCheckBlock = 3034 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3035 if (!SCEVCheckBlock) 3036 return nullptr; 3037 3038 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3039 (OptForSizeBasedOnProfile && 3040 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3041 "Cannot SCEV check stride or overflow when optimizing for size"); 3042 3043 3044 // Update dominator only if this is first RT check. 3045 if (LoopBypassBlocks.empty()) { 3046 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3047 if (!Cost->requiresScalarEpilogue(VF)) 3048 // If there is an epilogue which must run, there's no edge from the 3049 // middle block to exit blocks and thus no need to update the immediate 3050 // dominator of the exit blocks. 
3051 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3052 }
3053
3054 LoopBypassBlocks.push_back(SCEVCheckBlock);
3055 AddedSafetyChecks = true;
3056 return SCEVCheckBlock;
3057 }
3058
3059 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3060 // VPlan-native path does not do any analysis for runtime checks currently.
3061 if (EnableVPlanNativePath)
3062 return nullptr;
3063
3064 BasicBlock *const MemCheckBlock =
3065 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3066
3067 // Check if we generated code that checks at runtime whether arrays overlap. We
3068 // put the checks into a separate block to make the more common case of few
3069 // elements faster.
3070 if (!MemCheckBlock)
3071 return nullptr;
3072
3073 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3074 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3075 "Cannot emit memory checks when optimizing for size, unless forced "
3076 "to vectorize.");
3077 ORE->emit([&]() {
3078 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3079 OrigLoop->getStartLoc(),
3080 OrigLoop->getHeader())
3081 << "Code-size may be reduced by not forcing "
3082 "vectorization, or by source-code modifications "
3083 "eliminating the need for runtime checks "
3084 "(e.g., adding 'restrict').";
3085 });
3086 }
3087
3088 LoopBypassBlocks.push_back(MemCheckBlock);
3089
3090 AddedSafetyChecks = true;
3091
3092 return MemCheckBlock;
3093 }
3094
3095 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3096 LoopScalarBody = OrigLoop->getHeader();
3097 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3098 assert(LoopVectorPreHeader && "Invalid loop structure");
3099 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3100 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3101 "multiple exit loop without required epilogue?");
3102
3103 LoopMiddleBlock =
3104 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3105 LI, nullptr, Twine(Prefix) + "middle.block");
3106 LoopScalarPreHeader =
3107 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3108 nullptr, Twine(Prefix) + "scalar.ph");
3109
3110 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3111
3112 // Set up the middle block terminator. Two cases:
3113 // 1) If we know that we must execute the scalar epilogue, emit an
3114 // unconditional branch.
3115 // 2) Otherwise, we must have a single unique exit block (due to how we
3116 // implement the multiple exit case). In this case, set up a conditional
3117 // branch from the middle block to the loop scalar preheader, and the
3118 // exit block. completeLoopSkeleton will update the condition to use an
3119 // iteration check, if required to decide whether to execute the remainder.
3120 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3121 BranchInst::Create(LoopScalarPreHeader) :
3122 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3123 Builder.getTrue());
3124 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3125 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3126
3127 // Update dominator for loop exit. During skeleton creation, only the vector
3128 // pre-header and the middle block are created. The vector loop is entirely
3129 // created during VPlan execution.
3130 if (!Cost->requiresScalarEpilogue(VF)) 3131 // If there is an epilogue which must run, there's no edge from the 3132 // middle block to exit blocks and thus no need to update the immediate 3133 // dominator of the exit blocks. 3134 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3135 } 3136 3137 PHINode *InnerLoopVectorizer::createInductionResumeValue( 3138 PHINode *OrigPhi, const InductionDescriptor &II, 3139 ArrayRef<BasicBlock *> BypassBlocks, 3140 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3141 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3142 assert(VectorTripCount && "Expected valid arguments"); 3143 3144 Instruction *OldInduction = Legal->getPrimaryInduction(); 3145 Value *&EndValue = IVEndValues[OrigPhi]; 3146 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3147 if (OrigPhi == OldInduction) { 3148 // We know what the end value is. 3149 EndValue = VectorTripCount; 3150 } else { 3151 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3152 3153 // Fast-math-flags propagate from the original induction instruction. 3154 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3155 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3156 3157 Value *Step = 3158 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3159 EndValue = 3160 emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II); 3161 EndValue->setName("ind.end"); 3162 3163 // Compute the end value for the additional bypass (if applicable). 3164 if (AdditionalBypass.first) { 3165 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3166 Value *Step = 3167 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3168 EndValueFromAdditionalBypass = emitTransformedIndex( 3169 B, AdditionalBypass.second, II.getStartValue(), Step, II); 3170 EndValueFromAdditionalBypass->setName("ind.end"); 3171 } 3172 } 3173 3174 // Create phi nodes to merge from the backedge-taken check block. 3175 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3176 LoopScalarPreHeader->getTerminator()); 3177 // Copy original phi DL over to the new one. 3178 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3179 3180 // The new PHI merges the original incoming value, in case of a bypass, 3181 // or the value at the end of the vectorized loop. 3182 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3183 3184 // Fix the scalar body counter (PHI node). 3185 // The old induction's phi node in the scalar body needs the truncated 3186 // value. 3187 for (BasicBlock *BB : BypassBlocks) 3188 BCResumeVal->addIncoming(II.getStartValue(), BB); 3189 3190 if (AdditionalBypass.first) 3191 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3192 EndValueFromAdditionalBypass); 3193 return BCResumeVal; 3194 } 3195 3196 void InnerLoopVectorizer::createInductionResumeValues( 3197 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3198 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3199 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3200 "Inconsistent information about additional bypass."); 3201 // We are going to resume the execution of the scalar loop. 3202 // Go over all of the induction variables that we found and fix the 3203 // PHIs that are left in the scalar version of the loop. 3204 // The starting values of PHI nodes depend on the counter of the last 3205 // iteration in the vectorized loop. 
3206 // If we come from a bypass edge then we need to start from the original 3207 // start value. 3208 for (const auto &InductionEntry : Legal->getInductionVars()) { 3209 PHINode *OrigPhi = InductionEntry.first; 3210 const InductionDescriptor &II = InductionEntry.second; 3211 PHINode *BCResumeVal = createInductionResumeValue( 3212 OrigPhi, II, LoopBypassBlocks, AdditionalBypass); 3213 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3214 } 3215 } 3216 3217 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { 3218 // The trip counts should be cached by now. 3219 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3220 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3221 3222 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3223 3224 // Add a check in the middle block to see if we have completed 3225 // all of the iterations in the first vector loop. Three cases: 3226 // 1) If we require a scalar epilogue, there is no conditional branch as 3227 // we unconditionally branch to the scalar preheader. Do nothing. 3228 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3229 // Thus if tail is to be folded, we know we don't need to run the 3230 // remainder and we can use the previous value for the condition (true). 3231 // 3) Otherwise, construct a runtime check. 3232 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3233 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3234 Count, VectorTripCount, "cmp.n", 3235 LoopMiddleBlock->getTerminator()); 3236 3237 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3238 // of the corresponding compare because they may have ended up with 3239 // different line numbers and we want to avoid awkward line stepping while 3240 // debugging. Eg. if the compare has got a line number inside the loop. 3241 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3242 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3243 } 3244 3245 #ifdef EXPENSIVE_CHECKS 3246 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3247 #endif 3248 3249 return LoopVectorPreHeader; 3250 } 3251 3252 std::pair<BasicBlock *, Value *> 3253 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3254 /* 3255 In this function we generate a new loop. The new loop will contain 3256 the vectorized instructions while the old loop will continue to run the 3257 scalar remainder. 3258 3259 [ ] <-- loop iteration number check. 3260 / | 3261 / v 3262 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3263 | / | 3264 | / v 3265 || [ ] <-- vector pre header. 3266 |/ | 3267 | v 3268 | [ ] \ 3269 | [ ]_| <-- vector loop (created during VPlan execution). 3270 | | 3271 | v 3272 \ -[ ] <--- middle-block. 3273 \/ | 3274 /\ v 3275 | ->[ ] <--- new preheader. 3276 | | 3277 (opt) v <-- edge from middle to exit iff epilogue is not required. 3278 | [ ] \ 3279 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3280 \ | 3281 \ v 3282 >[ ] <-- exit block(s). 3283 ... 3284 */ 3285 3286 // Create an empty vector loop, and prepare basic blocks for the runtime 3287 // checks. 3288 createVectorLoopSkeleton(""); 3289 3290 // Now, compare the new count to zero. If it is zero skip the vector loop and 3291 // jump to the scalar loop. This check also covers the case where the 3292 // backedge-taken count is uint##_max: adding one to it will overflow leading 3293 // to an incorrect trip count of zero. 
In this (rare) case we will also jump
3294 // to the scalar loop.
3295 emitIterationCountCheck(LoopScalarPreHeader);
3296
3297 // Generate the code to check any assumptions that we've made for SCEV
3298 // expressions.
3299 emitSCEVChecks(LoopScalarPreHeader);
3300
3301 // Generate the code that checks at runtime whether arrays overlap. We put the
3302 // checks into a separate block to make the more common case of few elements
3303 // faster.
3304 emitMemRuntimeChecks(LoopScalarPreHeader);
3305
3306 // Emit phis for the new starting index of the scalar loop.
3307 createInductionResumeValues();
3308
3309 return {completeLoopSkeleton(), nullptr};
3310 }
3311
3312 // Fix up external users of the induction variable. At this point, we are
3313 // in LCSSA form, with all external PHIs that use the IV having one input value,
3314 // coming from the remainder loop. We need those PHIs to also have a correct
3315 // value for the IV when arriving directly from the middle block.
3316 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3317 const InductionDescriptor &II,
3318 Value *VectorTripCount, Value *EndValue,
3319 BasicBlock *MiddleBlock,
3320 BasicBlock *VectorHeader, VPlan &Plan) {
3321 // There are two kinds of external IV usages - those that use the value
3322 // computed in the last iteration (the PHI) and those that use the penultimate
3323 // value (the value that feeds into the phi from the loop latch).
3324 // We allow both, but they, obviously, have different values.
3325
3326 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3327
3328 DenseMap<Value *, Value *> MissingVals;
3329
3330 // An external user of the last iteration's value should see the value that
3331 // the remainder loop uses to initialize its own IV.
3332 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3333 for (User *U : PostInc->users()) {
3334 Instruction *UI = cast<Instruction>(U);
3335 if (!OrigLoop->contains(UI)) {
3336 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3337 MissingVals[UI] = EndValue;
3338 }
3339 }
3340
3341 // An external user of the penultimate value needs to see EndValue - Step.
3342 // The simplest way to get this is to recompute it from the constituent SCEVs,
3343 // that is Start + (Step * (CRD - 1)).
3344 for (User *U : OrigPhi->users()) {
3345 auto *UI = cast<Instruction>(U);
3346 if (!OrigLoop->contains(UI)) {
3347 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3348
3349 IRBuilder<> B(MiddleBlock->getTerminator());
3350
3351 // Fast-math-flags propagate from the original induction instruction.
3352 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3353 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3354
3355 Value *CountMinusOne = B.CreateSub(
3356 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3357 CountMinusOne->setName("cmo");
3358 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3359 VectorHeader->getTerminator());
3360 Value *Escape =
3361 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
3362 Escape->setName("ind.escape");
3363 MissingVals[UI] = Escape;
3364 }
3365 }
3366
3367 for (auto &I : MissingVals) {
3368 PHINode *PHI = cast<PHINode>(I.first);
3369 // One corner case we have to handle is two IVs "chasing" each other,
3370 // that is %IV2 = phi [...], [ %IV1, %latch ]
3371 // In this case, if IV1 has an external use, we need to avoid adding both
3372 // "last value of IV1" and "penultimate value of IV2".
So, verify that we 3373 // don't already have an incoming value for the middle block. 3374 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3375 PHI->addIncoming(I.second, MiddleBlock); 3376 Plan.removeLiveOut(PHI); 3377 } 3378 } 3379 } 3380 3381 namespace { 3382 3383 struct CSEDenseMapInfo { 3384 static bool canHandle(const Instruction *I) { 3385 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3386 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3387 } 3388 3389 static inline Instruction *getEmptyKey() { 3390 return DenseMapInfo<Instruction *>::getEmptyKey(); 3391 } 3392 3393 static inline Instruction *getTombstoneKey() { 3394 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3395 } 3396 3397 static unsigned getHashValue(const Instruction *I) { 3398 assert(canHandle(I) && "Unknown instruction!"); 3399 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3400 I->value_op_end())); 3401 } 3402 3403 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3404 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3405 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3406 return LHS == RHS; 3407 return LHS->isIdenticalTo(RHS); 3408 } 3409 }; 3410 3411 } // end anonymous namespace 3412 3413 ///Perform cse of induction variable instructions. 3414 static void cse(BasicBlock *BB) { 3415 // Perform simple cse. 3416 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3417 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3418 if (!CSEDenseMapInfo::canHandle(&In)) 3419 continue; 3420 3421 // Check if we can replace this instruction with any of the 3422 // visited instructions. 3423 if (Instruction *V = CSEMap.lookup(&In)) { 3424 In.replaceAllUsesWith(V); 3425 In.eraseFromParent(); 3426 continue; 3427 } 3428 3429 CSEMap[&In] = &In; 3430 } 3431 } 3432 3433 InstructionCost 3434 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3435 bool &NeedToScalarize) const { 3436 Function *F = CI->getCalledFunction(); 3437 Type *ScalarRetTy = CI->getType(); 3438 SmallVector<Type *, 4> Tys, ScalarTys; 3439 for (auto &ArgOp : CI->args()) 3440 ScalarTys.push_back(ArgOp->getType()); 3441 3442 // Estimate cost of scalarized vector call. The source operands are assumed 3443 // to be vectors, so we need to extract individual elements from there, 3444 // execute VF scalar calls, and then gather the result into the vector return 3445 // value. 3446 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3447 InstructionCost ScalarCallCost = 3448 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind); 3449 if (VF.isScalar()) 3450 return ScalarCallCost; 3451 3452 // Compute corresponding vector type for return value and arguments. 3453 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3454 for (Type *ScalarTy : ScalarTys) 3455 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3456 3457 // Compute costs of unpacking argument values for the scalar calls and 3458 // packing the return values to a vector. 3459 InstructionCost ScalarizationCost = 3460 getScalarizationOverhead(CI, VF, CostKind); 3461 3462 InstructionCost Cost = 3463 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3464 3465 // If we can't emit a vector call for this function, then the currently found 3466 // cost is the cost we need to return. 
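// For example (purely illustrative numbers): with VF = 4, a scalar call cost
// of 10 and a scalarization overhead of 12, the scalarized estimate above is
// 10 * 4 + 12 = 52; if a vectorized variant of the call costs, say, 20, the
// code below prefers it and clears NeedToScalarize.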
3467 NeedToScalarize = true; 3468 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3469 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3470 3471 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3472 return Cost; 3473 3474 // If the corresponding vector cost is cheaper, return its cost. 3475 InstructionCost VectorCallCost = 3476 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind); 3477 if (VectorCallCost < Cost) { 3478 NeedToScalarize = false; 3479 Cost = VectorCallCost; 3480 } 3481 return Cost; 3482 } 3483 3484 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3485 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3486 return Elt; 3487 return VectorType::get(Elt, VF); 3488 } 3489 3490 InstructionCost 3491 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3492 ElementCount VF) const { 3493 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3494 assert(ID && "Expected intrinsic call!"); 3495 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3496 FastMathFlags FMF; 3497 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3498 FMF = FPMO->getFastMathFlags(); 3499 3500 SmallVector<const Value *> Arguments(CI->args()); 3501 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3502 SmallVector<Type *> ParamTys; 3503 std::transform(FTy->param_begin(), FTy->param_end(), 3504 std::back_inserter(ParamTys), 3505 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3506 3507 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3508 dyn_cast<IntrinsicInst>(CI)); 3509 return TTI.getIntrinsicInstrCost(CostAttrs, 3510 TargetTransformInfo::TCK_RecipThroughput); 3511 } 3512 3513 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3514 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3515 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3516 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3517 } 3518 3519 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3520 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3521 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3522 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3523 } 3524 3525 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3526 // For every instruction `I` in MinBWs, truncate the operands, create a 3527 // truncated version of `I` and reextend its result. InstCombine runs 3528 // later and will remove any ext/trunc pairs. 3529 SmallPtrSet<Value *, 4> Erased; 3530 for (const auto &KV : Cost->getMinimalBitwidths()) { 3531 // If the value wasn't vectorized, we must maintain the original scalar 3532 // type. The absence of the value from State indicates that it 3533 // wasn't vectorized. 3534 // FIXME: Should not rely on getVPValue at this point. 
3535 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3536 if (!State.hasAnyVectorValue(Def)) 3537 continue; 3538 for (unsigned Part = 0; Part < UF; ++Part) { 3539 Value *I = State.get(Def, Part); 3540 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3541 continue; 3542 Type *OriginalTy = I->getType(); 3543 Type *ScalarTruncatedTy = 3544 IntegerType::get(OriginalTy->getContext(), KV.second); 3545 auto *TruncatedTy = VectorType::get( 3546 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3547 if (TruncatedTy == OriginalTy) 3548 continue; 3549 3550 IRBuilder<> B(cast<Instruction>(I)); 3551 auto ShrinkOperand = [&](Value *V) -> Value * { 3552 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3553 if (ZI->getSrcTy() == TruncatedTy) 3554 return ZI->getOperand(0); 3555 return B.CreateZExtOrTrunc(V, TruncatedTy); 3556 }; 3557 3558 // The actual instruction modification depends on the instruction type, 3559 // unfortunately. 3560 Value *NewI = nullptr; 3561 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3562 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3563 ShrinkOperand(BO->getOperand(1))); 3564 3565 // Any wrapping introduced by shrinking this operation shouldn't be 3566 // considered undefined behavior. So, we can't unconditionally copy 3567 // arithmetic wrapping flags to NewI. 3568 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3569 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3570 NewI = 3571 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3572 ShrinkOperand(CI->getOperand(1))); 3573 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3574 NewI = B.CreateSelect(SI->getCondition(), 3575 ShrinkOperand(SI->getTrueValue()), 3576 ShrinkOperand(SI->getFalseValue())); 3577 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3578 switch (CI->getOpcode()) { 3579 default: 3580 llvm_unreachable("Unhandled cast!"); 3581 case Instruction::Trunc: 3582 NewI = ShrinkOperand(CI->getOperand(0)); 3583 break; 3584 case Instruction::SExt: 3585 NewI = B.CreateSExtOrTrunc( 3586 CI->getOperand(0), 3587 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3588 break; 3589 case Instruction::ZExt: 3590 NewI = B.CreateZExtOrTrunc( 3591 CI->getOperand(0), 3592 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3593 break; 3594 } 3595 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3596 auto Elements0 = 3597 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3598 auto *O0 = B.CreateZExtOrTrunc( 3599 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3600 auto Elements1 = 3601 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3602 auto *O1 = B.CreateZExtOrTrunc( 3603 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3604 3605 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3606 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3607 // Don't do anything with the operands, just extend the result. 
3608 continue; 3609 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3610 auto Elements = 3611 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3612 auto *O0 = B.CreateZExtOrTrunc( 3613 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3614 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3615 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3616 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3617 auto Elements = 3618 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3619 auto *O0 = B.CreateZExtOrTrunc( 3620 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3621 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3622 } else { 3623 // If we don't know what to do, be conservative and don't do anything. 3624 continue; 3625 } 3626 3627 // Lastly, extend the result. 3628 NewI->takeName(cast<Instruction>(I)); 3629 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3630 I->replaceAllUsesWith(Res); 3631 cast<Instruction>(I)->eraseFromParent(); 3632 Erased.insert(I); 3633 State.reset(Def, Res, Part); 3634 } 3635 } 3636 3637 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3638 for (const auto &KV : Cost->getMinimalBitwidths()) { 3639 // If the value wasn't vectorized, we must maintain the original scalar 3640 // type. The absence of the value from State indicates that it 3641 // wasn't vectorized. 3642 // FIXME: Should not rely on getVPValue at this point. 3643 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3644 if (!State.hasAnyVectorValue(Def)) 3645 continue; 3646 for (unsigned Part = 0; Part < UF; ++Part) { 3647 Value *I = State.get(Def, Part); 3648 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3649 if (Inst && Inst->use_empty()) { 3650 Value *NewI = Inst->getOperand(0); 3651 Inst->eraseFromParent(); 3652 State.reset(Def, NewI, Part); 3653 } 3654 } 3655 } 3656 } 3657 3658 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3659 VPlan &Plan) { 3660 // Insert truncates and extends for any truncated instructions as hints to 3661 // InstCombine. 3662 if (VF.isVector()) 3663 truncateToMinimalBitwidths(State); 3664 3665 // Fix widened non-induction PHIs by setting up the PHI operands. 3666 if (EnableVPlanNativePath) 3667 fixNonInductionPHIs(Plan, State); 3668 3669 // At this point every instruction in the original loop is widened to a 3670 // vector form. Now we need to fix the recurrences in the loop. These PHI 3671 // nodes are currently empty because we did not want to introduce cycles. 3672 // This is the second stage of vectorizing recurrences. 3673 fixCrossIterationPHIs(State); 3674 3675 // Forget the original basic block. 3676 PSE.getSE()->forgetLoop(OrigLoop); 3677 3678 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3679 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3680 if (Cost->requiresScalarEpilogue(VF)) { 3681 // No edge from the middle block to the unique exit block has been inserted 3682 // and there is nothing to fix from vector loop; phis should have incoming 3683 // from scalar loop only. 3684 Plan.clearLiveOuts(); 3685 } else { 3686 // If we inserted an edge from the middle block to the unique exit block, 3687 // update uses outside the loop (phis) to account for the newly inserted 3688 // edge. 3689 3690 // Fix-up external users of the induction variables. 
3691 for (const auto &Entry : Legal->getInductionVars())
3692 fixupIVUsers(Entry.first, Entry.second,
3693 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3694 IVEndValues[Entry.first], LoopMiddleBlock,
3695 VectorLoop->getHeader(), Plan);
3696 }
3697
3698 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3699 // in the exit block, so update the builder.
3700 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3701 for (const auto &KV : Plan.getLiveOuts())
3702 KV.second->fixPhi(Plan, State);
3703
3704 for (Instruction *PI : PredicatedInstructions)
3705 sinkScalarOperands(&*PI);
3706
3707 // Remove redundant induction instructions.
3708 cse(VectorLoop->getHeader());
3709
3710 // Set/update profile weights for the vector and remainder loops as original
3711 // loop iterations are now distributed among them. Note that the original loop,
3712 // represented by LoopScalarBody, becomes the remainder loop after vectorization.
3713 //
3714 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3715 // end up getting a slightly roughened result, but that should be OK since the
3716 // profile is not inherently precise anyway. Note also that a possible bypass of
3717 // vector code caused by legality checks is ignored, optimistically assigning all
3718 // the weight to the vector loop.
3719 //
3720 // For scalable vectorization we can't know at compile time how many iterations
3721 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3722 // vscale of '1'.
3723 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3724 LI->getLoopFor(LoopScalarBody),
3725 VF.getKnownMinValue() * UF);
3726 }
3727
3728 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3729 // In order to support recurrences we need to be able to vectorize Phi nodes.
3730 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3731 // stage #2: We now need to fix the recurrences by adding incoming edges to
3732 // the currently empty PHI nodes. At this point every instruction in the
3733 // original loop is widened to a vector form so we can use them to construct
3734 // the incoming edges.
3735 VPBasicBlock *Header =
3736 State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3737 for (VPRecipeBase &R : Header->phis()) {
3738 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3739 fixReduction(ReductionPhi, State);
3740 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3741 fixFixedOrderRecurrence(FOR, State);
3742 }
3743 }
3744
3745 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3746 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3747 // This is the second phase of vectorizing first-order recurrences. An
3748 // overview of the transformation is described below. Suppose we have the
3749 // following loop.
3750 //
3751 // for (int i = 0; i < n; ++i)
3752 // b[i] = a[i] - a[i - 1];
3753 //
3754 // There is a first-order recurrence on "a". For this loop, the shorthand
3755 // scalar IR looks like:
3756 //
3757 // scalar.ph:
3758 // s_init = a[-1]
3759 // br scalar.body
3760 //
3761 // scalar.body:
3762 // i = phi [0, scalar.ph], [i+1, scalar.body]
3763 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3764 // s2 = a[i]
3765 // b[i] = s2 - s1
3766 // br cond, scalar.body, ...
3767 //
3768 // In this example, s1 is a recurrence because its value depends on the
3769 // previous iteration.
In the first phase of vectorization, we created a 3770 // vector phi v1 for s1. We now complete the vectorization and produce the 3771 // shorthand vector IR shown below (for VF = 4, UF = 1). 3772 // 3773 // vector.ph: 3774 // v_init = vector(..., ..., ..., a[-1]) 3775 // br vector.body 3776 // 3777 // vector.body 3778 // i = phi [0, vector.ph], [i+4, vector.body] 3779 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3780 // v2 = a[i, i+1, i+2, i+3]; 3781 // v3 = vector(v1(3), v2(0, 1, 2)) 3782 // b[i, i+1, i+2, i+3] = v2 - v3 3783 // br cond, vector.body, middle.block 3784 // 3785 // middle.block: 3786 // x = v2(3) 3787 // br scalar.ph 3788 // 3789 // scalar.ph: 3790 // s_init = phi [x, middle.block], [a[-1], otherwise] 3791 // br scalar.body 3792 // 3793 // After execution completes the vector loop, we extract the next value of 3794 // the recurrence (x) to use as the initial value in the scalar loop. 3795 3796 // Extract the last vector element in the middle block. This will be the 3797 // initial value for the recurrence when jumping to the scalar loop. 3798 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3799 Value *Incoming = State.get(PreviousDef, UF - 1); 3800 auto *ExtractForScalar = Incoming; 3801 auto *IdxTy = Builder.getInt32Ty(); 3802 if (VF.isVector()) { 3803 auto *One = ConstantInt::get(IdxTy, 1); 3804 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3805 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3806 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3807 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 3808 "vector.recur.extract"); 3809 } 3810 // Extract the second last element in the middle block if the 3811 // Phi is used outside the loop. We need to extract the phi itself 3812 // and not the last element (the phi update in the current iteration). This 3813 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3814 // when the scalar loop is not run at all. 3815 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3816 if (VF.isVector()) { 3817 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3818 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3819 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3820 Incoming, Idx, "vector.recur.extract.for.phi"); 3821 } else if (UF > 1) 3822 // When loop is unrolled without vectorizing, initialize 3823 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 3824 // of `Incoming`. This is analogous to the vectorized case above: extracting 3825 // the second last element when VF > 1. 3826 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3827 3828 // Fix the initial value of the original recurrence in the scalar loop. 3829 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3830 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3831 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3832 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3833 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3834 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3835 Start->addIncoming(Incoming, BB); 3836 } 3837 3838 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3839 Phi->setName("scalar.recur"); 3840 3841 // Finally, fix users of the recurrence outside the loop. The users will need 3842 // either the last value of the scalar recurrence or the last value of the 3843 // vector recurrence we extracted in the middle block. 
Since the loop is in
3844 // LCSSA form, we just need to find all the phi nodes for the original scalar
3845 // recurrence in the exit block, and then add an edge for the middle block.
3846 // Note that LCSSA does not imply single entry when the original scalar loop
3847 // had multiple exiting edges (as we always run the last iteration in the
3848 // scalar epilogue); in that case, there is no edge from the middle block to
3849 // the exit block, and thus no phis need to be updated.
3850 if (!Cost->requiresScalarEpilogue(VF))
3851 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3852 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3853 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3854 State.Plan->removeLiveOut(&LCSSAPhi);
3855 }
3856 }
3857
3858 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3859 VPTransformState &State) {
3860 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3861 // Get its reduction variable descriptor.
3862 assert(Legal->isReductionVariable(OrigPhi) &&
3863 "Unable to find the reduction variable");
3864 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3865
3866 RecurKind RK = RdxDesc.getRecurrenceKind();
3867 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3868 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3869 State.setDebugLocFromInst(ReductionStartValue);
3870
3871 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3872 // This is the vector-clone of the value that leaves the loop.
3873 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3874
3875 // Wrap flags are in general invalid after vectorization, clear them.
3876 clearReductionWrapFlags(PhiR, State);
3877
3878 // Before each round, move the insertion point right between
3879 // the PHIs and the values we are going to write.
3880 // This allows us to write both PHINodes and the extractelement
3881 // instructions.
3882 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3883
3884 State.setDebugLocFromInst(LoopExitInst);
3885
3886 Type *PhiTy = OrigPhi->getType();
3887
3888 VPBasicBlock *LatchVPBB =
3889 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3890 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3891 // If the tail is folded by masking, the vector value to leave the loop should be
3892 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3893 // instead of the former. For an inloop reduction the reduction will already
3894 // be predicated, and does not need to be handled here.
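// For illustration only (a sketch assuming VF = 4 and an i32 add reduction;
// value names are hypothetical), the loop body then contains something like:
//   %rdx.add = add <4 x i32> %rdx.phi, %val
//   %rdx.sel = select <4 x i1> %mask, <4 x i32> %rdx.add, <4 x i32> %rdx.phi
// and %rdx.sel, rather than %rdx.add, is treated as the value leaving the loop.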
3895 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3896 for (unsigned Part = 0; Part < UF; ++Part) { 3897 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3898 SelectInst *Sel = nullptr; 3899 for (User *U : VecLoopExitInst->users()) { 3900 if (isa<SelectInst>(U)) { 3901 assert(!Sel && "Reduction exit feeding two selects"); 3902 Sel = cast<SelectInst>(U); 3903 } else 3904 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3905 } 3906 assert(Sel && "Reduction exit feeds no select"); 3907 State.reset(LoopExitInstDef, Sel, Part); 3908 3909 if (isa<FPMathOperator>(Sel)) 3910 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3911 3912 // If the target can create a predicated operator for the reduction at no 3913 // extra cost in the loop (for example a predicated vadd), it can be 3914 // cheaper for the select to remain in the loop than be sunk out of it, 3915 // and so use the select value for the phi instead of the old 3916 // LoopExitValue. 3917 if (PreferPredicatedReductionSelect || 3918 TTI->preferPredicatedReductionSelect( 3919 RdxDesc.getOpcode(), PhiTy, 3920 TargetTransformInfo::ReductionFlags())) { 3921 auto *VecRdxPhi = 3922 cast<PHINode>(State.get(PhiR, Part)); 3923 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3924 } 3925 } 3926 } 3927 3928 // If the vector reduction can be performed in a smaller type, we truncate 3929 // then extend the loop exit value to enable InstCombine to evaluate the 3930 // entire expression in the smaller type. 3931 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3932 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3933 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3934 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3935 VectorParts RdxParts(UF); 3936 for (unsigned Part = 0; Part < UF; ++Part) { 3937 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3938 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3939 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3940 : Builder.CreateZExt(Trunc, VecTy); 3941 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3942 if (U != Trunc) { 3943 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3944 RdxParts[Part] = Extnd; 3945 } 3946 } 3947 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3948 for (unsigned Part = 0; Part < UF; ++Part) { 3949 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3950 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3951 } 3952 } 3953 3954 // Reduce all of the unrolled parts into a single vector. 3955 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3956 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3957 3958 // The middle block terminator has already been assigned a DebugLoc here (the 3959 // OrigLoop's single latch terminator). We want the whole middle block to 3960 // appear to execute on this line because: (a) it is all compiler generated, 3961 // (b) these instructions are always executed after evaluating the latch 3962 // conditional branch, and (c) other passes may add new predecessors which 3963 // terminate on this line. This is the easiest way to ensure we don't 3964 // accidentally cause an extra step back into the loop while debugging. 3965 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3966 if (PhiR->isOrdered()) 3967 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3968 else { 3969 // Floating-point operations should have some FMF to enable the reduction. 
3970 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3971 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3972 for (unsigned Part = 1; Part < UF; ++Part) { 3973 Value *RdxPart = State.get(LoopExitInstDef, Part); 3974 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 3975 ReducedPartRdx = Builder.CreateBinOp( 3976 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 3977 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 3978 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 3979 ReducedPartRdx, RdxPart); 3980 else 3981 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 3982 } 3983 } 3984 3985 // Create the reduction after the loop. Note that inloop reductions create the 3986 // target reduction in the loop using a Reduction recipe. 3987 if (VF.isVector() && !PhiR->isInLoop()) { 3988 ReducedPartRdx = 3989 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 3990 // If the reduction can be performed in a smaller type, we need to extend 3991 // the reduction to the wider type before we branch to the original loop. 3992 if (PhiTy != RdxDesc.getRecurrenceType()) 3993 ReducedPartRdx = RdxDesc.isSigned() 3994 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 3995 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 3996 } 3997 3998 PHINode *ResumePhi = 3999 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4000 4001 // Create a phi node that merges control-flow from the backedge-taken check 4002 // block and the middle block. 4003 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4004 LoopScalarPreHeader->getTerminator()); 4005 4006 // If we are fixing reductions in the epilogue loop then we should already 4007 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4008 // we carry over the incoming values correctly. 4009 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4010 if (Incoming == LoopMiddleBlock) 4011 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4012 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4013 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4014 Incoming); 4015 else 4016 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4017 } 4018 4019 // Set the resume value for this reduction 4020 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4021 4022 // If there were stores of the reduction value to a uniform memory address 4023 // inside the loop, create the final store here. 4024 if (StoreInst *SI = RdxDesc.IntermediateStore) { 4025 StoreInst *NewSI = 4026 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); 4027 propagateMetadata(NewSI, SI); 4028 4029 // If the reduction value is used in other places, 4030 // then let the code below create PHI's for that. 4031 } 4032 4033 // Now, we need to fix the users of the reduction variable 4034 // inside and outside of the scalar remainder loop. 4035 4036 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4037 // in the exit blocks. See comment on analogous loop in 4038 // fixFixedOrderRecurrence for a more complete explaination of the logic. 
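// For example (illustrative): an LCSSA phi in the exit block such as
//   %res.lcssa = phi i32 [ %res, %for.body ]
// gains a second incoming value from the middle block,
//   %res.lcssa = phi i32 [ %res, %for.body ], [ %rdx, %middle.block ]
// where %rdx stands for the ReducedPartRdx computed above.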
4039 if (!Cost->requiresScalarEpilogue(VF)) 4040 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4041 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { 4042 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4043 State.Plan->removeLiveOut(&LCSSAPhi); 4044 } 4045 4046 // Fix the scalar loop reduction variable with the incoming reduction sum 4047 // from the vector body and from the backedge value. 4048 int IncomingEdgeBlockIdx = 4049 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4050 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4051 // Pick the other block. 4052 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4053 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4054 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4055 } 4056 4057 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 4058 VPTransformState &State) { 4059 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4060 RecurKind RK = RdxDesc.getRecurrenceKind(); 4061 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4062 return; 4063 4064 SmallVector<VPValue *, 8> Worklist; 4065 SmallPtrSet<VPValue *, 8> Visited; 4066 Worklist.push_back(PhiR); 4067 Visited.insert(PhiR); 4068 4069 while (!Worklist.empty()) { 4070 VPValue *Cur = Worklist.pop_back_val(); 4071 for (unsigned Part = 0; Part < UF; ++Part) { 4072 Value *V = State.get(Cur, Part); 4073 if (!isa<OverflowingBinaryOperator>(V)) 4074 break; 4075 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4076 } 4077 4078 for (VPUser *U : Cur->users()) { 4079 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 4080 if (!UserRecipe) 4081 continue; 4082 for (VPValue *V : UserRecipe->definedValues()) 4083 if (Visited.insert(V).second) 4084 Worklist.push_back(V); 4085 } 4086 } 4087 } 4088 4089 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4090 // The basic block and loop containing the predicated instruction. 4091 auto *PredBB = PredInst->getParent(); 4092 auto *VectorLoop = LI->getLoopFor(PredBB); 4093 4094 // Initialize a worklist with the operands of the predicated instruction. 4095 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4096 4097 // Holds instructions that we need to analyze again. An instruction may be 4098 // reanalyzed if we don't yet know if we can sink it or not. 4099 SmallVector<Instruction *, 8> InstsToReanalyze; 4100 4101 // Returns true if a given use occurs in the predicated block. Phi nodes use 4102 // their operands in their corresponding predecessor blocks. 4103 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4104 auto *I = cast<Instruction>(U.getUser()); 4105 BasicBlock *BB = I->getParent(); 4106 if (auto *Phi = dyn_cast<PHINode>(I)) 4107 BB = Phi->getIncomingBlock( 4108 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4109 return BB == PredBB; 4110 }; 4111 4112 // Iteratively sink the scalarized operands of the predicated instruction 4113 // into the block we created for it. When an instruction is sunk, it's 4114 // operands are then added to the worklist. The algorithm ends after one pass 4115 // through the worklist doesn't sink a single instruction. 4116 bool Changed; 4117 do { 4118 // Add the instructions that need to be reanalyzed to the worklist, and 4119 // reset the changed indicator. 
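// Reanalysis example (illustrative; %a, %m and %div are placeholder names):
//   %a   = add i64 %x, 1      ; used by %m and by the predicated %div
//   %m   = mul i64 %a, 3      ; used only by the predicated %div
//   %div = sdiv i64 %a, %m    ; already resides in PredBB
// If %a is visited before %m has been sunk, not all of %a's uses are in
// PredBB yet, so %a is deferred to InstsToReanalyze; once %m is sunk during
// this pass, %a becomes sinkable on the next iteration of the outer loop.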
4120 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4121 InstsToReanalyze.clear(); 4122 Changed = false; 4123 4124 while (!Worklist.empty()) { 4125 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4126 4127 // We can't sink an instruction if it is a phi node, is not in the loop, 4128 // or may have side effects. 4129 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4130 I->mayHaveSideEffects()) 4131 continue; 4132 4133 // If the instruction is already in PredBB, check if we can sink its 4134 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4135 // sinking the scalar instruction I, hence it appears in PredBB; but it 4136 // may have failed to sink I's operands (recursively), which we try 4137 // (again) here. 4138 if (I->getParent() == PredBB) { 4139 Worklist.insert(I->op_begin(), I->op_end()); 4140 continue; 4141 } 4142 4143 // It's legal to sink the instruction if all its uses occur in the 4144 // predicated block. Otherwise, there's nothing to do yet, and we may 4145 // need to reanalyze the instruction. 4146 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4147 InstsToReanalyze.push_back(I); 4148 continue; 4149 } 4150 4151 // Move the instruction to the beginning of the predicated block, and add 4152 // it's operands to the worklist. 4153 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4154 Worklist.insert(I->op_begin(), I->op_end()); 4155 4156 // The sinking may have enabled other instructions to be sunk, so we will 4157 // need to iterate. 4158 Changed = true; 4159 } 4160 } while (Changed); 4161 } 4162 4163 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4164 VPTransformState &State) { 4165 auto Iter = vp_depth_first_deep(Plan.getEntry()); 4166 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4167 for (VPRecipeBase &P : VPBB->phis()) { 4168 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4169 if (!VPPhi) 4170 continue; 4171 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4172 // Make sure the builder has a valid insert point. 4173 Builder.SetInsertPoint(NewPhi); 4174 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4175 VPValue *Inc = VPPhi->getIncomingValue(i); 4176 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4177 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4178 } 4179 } 4180 } 4181 } 4182 4183 bool InnerLoopVectorizer::useOrderedReductions( 4184 const RecurrenceDescriptor &RdxDesc) { 4185 return Cost->useOrderedReductions(RdxDesc); 4186 } 4187 4188 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4189 // We should not collect Scalars more than once per VF. Right now, this 4190 // function is called from collectUniformsAndScalars(), which already does 4191 // this check. Collecting Scalars for VF=1 does not make any sense. 4192 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4193 "This function should not be visited twice for the same VF"); 4194 4195 // This avoids any chances of creating a REPLICATE recipe during planning 4196 // since that would result in generation of scalarized code during execution, 4197 // which is not supported for scalable vectors. 4198 if (VF.isScalable()) { 4199 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4200 return; 4201 } 4202 4203 SmallSetVector<Instruction *, 8> Worklist; 4204 4205 // These sets are used to seed the analysis with pointers used by memory 4206 // accesses that will remain scalar. 
4207 SmallSetVector<Instruction *, 8> ScalarPtrs; 4208 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4209 auto *Latch = TheLoop->getLoopLatch(); 4210 4211 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4212 // The pointer operands of loads and stores will be scalar as long as the 4213 // memory access is not a gather or scatter operation. The value operand of a 4214 // store will remain scalar if the store is scalarized. 4215 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4216 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4217 assert(WideningDecision != CM_Unknown && 4218 "Widening decision should be ready at this moment"); 4219 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4220 if (Ptr == Store->getValueOperand()) 4221 return WideningDecision == CM_Scalarize; 4222 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4223 "Ptr is neither a value or pointer operand"); 4224 return WideningDecision != CM_GatherScatter; 4225 }; 4226 4227 // A helper that returns true if the given value is a bitcast or 4228 // getelementptr instruction contained in the loop. 4229 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4230 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4231 isa<GetElementPtrInst>(V)) && 4232 !TheLoop->isLoopInvariant(V); 4233 }; 4234 4235 // A helper that evaluates a memory access's use of a pointer. If the use will 4236 // be a scalar use and the pointer is only used by memory accesses, we place 4237 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4238 // PossibleNonScalarPtrs. 4239 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4240 // We only care about bitcast and getelementptr instructions contained in 4241 // the loop. 4242 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4243 return; 4244 4245 // If the pointer has already been identified as scalar (e.g., if it was 4246 // also identified as uniform), there's nothing to do. 4247 auto *I = cast<Instruction>(Ptr); 4248 if (Worklist.count(I)) 4249 return; 4250 4251 // If the use of the pointer will be a scalar use, and all users of the 4252 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4253 // place the pointer in PossibleNonScalarPtrs. 4254 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4255 return isa<LoadInst>(U) || isa<StoreInst>(U); 4256 })) 4257 ScalarPtrs.insert(I); 4258 else 4259 PossibleNonScalarPtrs.insert(I); 4260 }; 4261 4262 // We seed the scalars analysis with three classes of instructions: (1) 4263 // instructions marked uniform-after-vectorization and (2) bitcast, 4264 // getelementptr and (pointer) phi instructions used by memory accesses 4265 // requiring a scalar use. 4266 // 4267 // (1) Add to the worklist all instructions that have been identified as 4268 // uniform-after-vectorization. 4269 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4270 4271 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4272 // memory accesses requiring a scalar use. The pointer operands of loads and 4273 // stores will be scalar as long as the memory accesses is not a gather or 4274 // scatter operation. The value operand of a store will remain scalar if the 4275 // store is scalarized. 
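// For example (illustrative):
//   %gep = getelementptr inbounds i32, ptr %A, i64 %i
//   %x   = load i32, ptr %gep     ; decided to become a consecutive wide load
// keeps a scalar use of %gep (only the lane-0 address is needed), so %gep is
// a candidate for ScalarPtrs; had the access been a gather/scatter, the
// pointer would instead land in PossibleNonScalarPtrs when evaluatePtrUse
// runs in the loop below.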
4276 for (auto *BB : TheLoop->blocks()) 4277 for (auto &I : *BB) { 4278 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4279 evaluatePtrUse(Load, Load->getPointerOperand()); 4280 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4281 evaluatePtrUse(Store, Store->getPointerOperand()); 4282 evaluatePtrUse(Store, Store->getValueOperand()); 4283 } 4284 } 4285 for (auto *I : ScalarPtrs) 4286 if (!PossibleNonScalarPtrs.count(I)) { 4287 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4288 Worklist.insert(I); 4289 } 4290 4291 // Insert the forced scalars. 4292 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4293 // induction variable when the PHI user is scalarized. 4294 auto ForcedScalar = ForcedScalars.find(VF); 4295 if (ForcedScalar != ForcedScalars.end()) 4296 for (auto *I : ForcedScalar->second) { 4297 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 4298 Worklist.insert(I); 4299 } 4300 4301 // Expand the worklist by looking through any bitcasts and getelementptr 4302 // instructions we've already identified as scalar. This is similar to the 4303 // expansion step in collectLoopUniforms(); however, here we're only 4304 // expanding to include additional bitcasts and getelementptr instructions. 4305 unsigned Idx = 0; 4306 while (Idx != Worklist.size()) { 4307 Instruction *Dst = Worklist[Idx++]; 4308 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4309 continue; 4310 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4311 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4312 auto *J = cast<Instruction>(U); 4313 return !TheLoop->contains(J) || Worklist.count(J) || 4314 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4315 isScalarUse(J, Src)); 4316 })) { 4317 Worklist.insert(Src); 4318 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4319 } 4320 } 4321 4322 // An induction variable will remain scalar if all users of the induction 4323 // variable and induction variable update remain scalar. 4324 for (const auto &Induction : Legal->getInductionVars()) { 4325 auto *Ind = Induction.first; 4326 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4327 4328 // If tail-folding is applied, the primary induction variable will be used 4329 // to feed a vector compare. 4330 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4331 continue; 4332 4333 // Returns true if \p Indvar is a pointer induction that is used directly by 4334 // load/store instruction \p I. 4335 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4336 Instruction *I) { 4337 return Induction.second.getKind() == 4338 InductionDescriptor::IK_PtrInduction && 4339 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4340 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4341 }; 4342 4343 // Determine if all users of the induction variable are scalar after 4344 // vectorization. 4345 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4346 auto *I = cast<Instruction>(U); 4347 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4348 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4349 }); 4350 if (!ScalarInd) 4351 continue; 4352 4353 // Determine if all users of the induction variable update instruction are 4354 // scalar after vectorization. 
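// For example (illustrative), for a canonical induction
//   %i      = phi i64 [ 0, %ph ], [ %i.next, %latch ]
//   %i.next = add nuw nsw i64 %i, 1
// the pair stays scalar when every other user of %i and %i.next is outside
// the loop, already in the worklist (e.g. seeded from the uniforms set), or a
// direct load/store use of a pointer induction; a user that gets widened
// keeps the pair out of the scalar set.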
4355 auto ScalarIndUpdate = 4356 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4357 auto *I = cast<Instruction>(U); 4358 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4359 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4360 }); 4361 if (!ScalarIndUpdate) 4362 continue; 4363 4364 // The induction variable and its update instruction will remain scalar. 4365 Worklist.insert(Ind); 4366 Worklist.insert(IndUpdate); 4367 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4368 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4369 << "\n"); 4370 } 4371 4372 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4373 } 4374 4375 bool LoopVectorizationCostModel::isScalarWithPredication( 4376 Instruction *I, ElementCount VF) const { 4377 if (!isPredicatedInst(I)) 4378 return false; 4379 4380 // Do we have a non-scalar lowering for this predicated 4381 // instruction? No - it is scalar with predication. 4382 switch(I->getOpcode()) { 4383 default: 4384 return true; 4385 case Instruction::Load: 4386 case Instruction::Store: { 4387 auto *Ptr = getLoadStorePointerOperand(I); 4388 auto *Ty = getLoadStoreType(I); 4389 Type *VTy = Ty; 4390 if (VF.isVector()) 4391 VTy = VectorType::get(Ty, VF); 4392 const Align Alignment = getLoadStoreAlignment(I); 4393 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4394 TTI.isLegalMaskedGather(VTy, Alignment)) 4395 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4396 TTI.isLegalMaskedScatter(VTy, Alignment)); 4397 } 4398 case Instruction::UDiv: 4399 case Instruction::SDiv: 4400 case Instruction::SRem: 4401 case Instruction::URem: { 4402 // We have the option to use the safe-divisor idiom to avoid predication. 4403 // The cost based decision here will always select safe-divisor for 4404 // scalable vectors as scalarization isn't legal. 4405 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 4406 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); 4407 } 4408 } 4409 } 4410 4411 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { 4412 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4413 return false; 4414 4415 // Can we prove this instruction is safe to unconditionally execute? 4416 // If not, we must use some form of predication. 4417 switch(I->getOpcode()) { 4418 default: 4419 return false; 4420 case Instruction::Load: 4421 case Instruction::Store: { 4422 if (!Legal->isMaskRequired(I)) 4423 return false; 4424 // When we know the load's address is loop invariant and the instruction 4425 // in the original scalar loop was unconditionally executed then we 4426 // don't need to mark it as a predicated instruction. Tail folding may 4427 // introduce additional predication, but we're guaranteed to always have 4428 // at least one active lane. We call Legal->blockNeedsPredication here 4429 // because it doesn't query tail-folding. For stores, we need to prove 4430 // both speculation safety (which follows from the same argument as loads), 4431 // but also must prove the value being stored is correct. The easiest 4432 // form of the later is to require that all values stored are the same. 
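// For example (illustrative):
//   for (i = 0; i < n; ++i) {
//     sum += a[i];
//     *flag = 1;   // loop-invariant address and loop-invariant stored value
//   }
// Even when the tail is folded, every vector iteration has at least one
// active lane, so the unconditional uniform load/store above needs no mask.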
4433 if (Legal->isUniformMemOp(*I) && 4434 (isa<LoadInst>(I) || 4435 (isa<StoreInst>(I) && 4436 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && 4437 !Legal->blockNeedsPredication(I->getParent())) 4438 return false; 4439 return true; 4440 } 4441 case Instruction::UDiv: 4442 case Instruction::SDiv: 4443 case Instruction::SRem: 4444 case Instruction::URem: 4445 // TODO: We can use the loop-preheader as context point here and get 4446 // context sensitive reasoning 4447 return !isSafeToSpeculativelyExecute(I); 4448 } 4449 } 4450 4451 std::pair<InstructionCost, InstructionCost> 4452 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 4453 ElementCount VF) const { 4454 assert(I->getOpcode() == Instruction::UDiv || 4455 I->getOpcode() == Instruction::SDiv || 4456 I->getOpcode() == Instruction::SRem || 4457 I->getOpcode() == Instruction::URem); 4458 assert(!isSafeToSpeculativelyExecute(I)); 4459 4460 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 4461 4462 // Scalarization isn't legal for scalable vector types 4463 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 4464 if (!VF.isScalable()) { 4465 // Get the scalarization cost and scale this amount by the probability of 4466 // executing the predicated block. If the instruction is not predicated, 4467 // we fall through to the next case. 4468 ScalarizationCost = 0; 4469 4470 // These instructions have a non-void type, so account for the phi nodes 4471 // that we will create. This cost is likely to be zero. The phi node 4472 // cost, if any, should be scaled by the block probability because it 4473 // models a copy at the end of each predicated block. 4474 ScalarizationCost += VF.getKnownMinValue() * 4475 TTI.getCFInstrCost(Instruction::PHI, CostKind); 4476 4477 // The cost of the non-predicated instruction. 4478 ScalarizationCost += VF.getKnownMinValue() * 4479 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 4480 4481 // The cost of insertelement and extractelement instructions needed for 4482 // scalarization. 4483 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); 4484 4485 // Scale the cost by the probability of executing the predicated blocks. 4486 // This assumes the predicated block for each vector lane is equally 4487 // likely. 4488 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 4489 } 4490 InstructionCost SafeDivisorCost = 0; 4491 4492 auto *VecTy = ToVectorTy(I->getType(), VF); 4493 4494 // The cost of the select guard to ensure all lanes are well defined 4495 // after we speculate above any internal control flow. 4496 SafeDivisorCost += TTI.getCmpSelInstrCost( 4497 Instruction::Select, VecTy, 4498 ToVectorTy(Type::getInt1Ty(I->getContext()), VF), 4499 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4500 4501 // Certain instructions can be cheaper to vectorize if they have a constant 4502 // second vector operand. One example of this are shifts on x86. 
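// For reference, the safe-divisor lowering whose cost is accumulated here
// looks roughly like this (illustrative, VF=4):
//   %d.safe = select <4 x i1> %mask, <4 x i32> %d, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %quot   = udiv <4 x i32> %x, %d.safe
// i.e. one select that gives inactive lanes a well-defined divisor, plus the
// unpredicated vector divide, whose cost is refined below using the divisor's
// operand info.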
4503 Value *Op2 = I->getOperand(1); 4504 auto Op2Info = TTI.getOperandInfo(Op2); 4505 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 4506 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 4507 4508 SmallVector<const Value *, 4> Operands(I->operand_values()); 4509 SafeDivisorCost += TTI.getArithmeticInstrCost( 4510 I->getOpcode(), VecTy, CostKind, 4511 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 4512 Op2Info, Operands, I); 4513 return {ScalarizationCost, SafeDivisorCost}; 4514 } 4515 4516 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4517 Instruction *I, ElementCount VF) { 4518 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4519 assert(getWideningDecision(I, VF) == CM_Unknown && 4520 "Decision should not be set yet."); 4521 auto *Group = getInterleavedAccessGroup(I); 4522 assert(Group && "Must have a group."); 4523 4524 // If the instruction's allocated size doesn't equal it's type size, it 4525 // requires padding and will be scalarized. 4526 auto &DL = I->getModule()->getDataLayout(); 4527 auto *ScalarTy = getLoadStoreType(I); 4528 if (hasIrregularType(ScalarTy, DL)) 4529 return false; 4530 4531 // If the group involves a non-integral pointer, we may not be able to 4532 // losslessly cast all values to a common type. 4533 unsigned InterleaveFactor = Group->getFactor(); 4534 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4535 for (unsigned i = 0; i < InterleaveFactor; i++) { 4536 Instruction *Member = Group->getMember(i); 4537 if (!Member) 4538 continue; 4539 auto *MemberTy = getLoadStoreType(Member); 4540 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4541 // Don't coerce non-integral pointers to integers or vice versa. 4542 if (MemberNI != ScalarNI) { 4543 // TODO: Consider adding special nullptr value case here 4544 return false; 4545 } else if (MemberNI && ScalarNI && 4546 ScalarTy->getPointerAddressSpace() != 4547 MemberTy->getPointerAddressSpace()) { 4548 return false; 4549 } 4550 } 4551 4552 // Check if masking is required. 4553 // A Group may need masking for one of two reasons: it resides in a block that 4554 // needs predication, or it was decided to use masking to deal with gaps 4555 // (either a gap at the end of a load-access that may result in a speculative 4556 // load, or any gaps in a store-access). 4557 bool PredicatedAccessRequiresMasking = 4558 blockNeedsPredicationForAnyReason(I->getParent()) && 4559 Legal->isMaskRequired(I); 4560 bool LoadAccessWithGapsRequiresEpilogMasking = 4561 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4562 !isScalarEpilogueAllowed(); 4563 bool StoreAccessWithGapsRequiresMasking = 4564 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4565 if (!PredicatedAccessRequiresMasking && 4566 !LoadAccessWithGapsRequiresEpilogMasking && 4567 !StoreAccessWithGapsRequiresMasking) 4568 return true; 4569 4570 // If masked interleaving is required, we expect that the user/target had 4571 // enabled it, because otherwise it either wouldn't have been created or 4572 // it should have been invalidated by the CostModel. 4573 assert(useMaskedInterleavedAccesses(TTI) && 4574 "Masked interleave-groups for predicated accesses are not enabled."); 4575 4576 if (Group->isReverse()) 4577 return false; 4578 4579 auto *Ty = getLoadStoreType(I); 4580 const Align Alignment = getLoadStoreAlignment(I); 4581 return isa<LoadInst>(I) ? 
TTI.isLegalMaskedLoad(Ty, Alignment) 4582 : TTI.isLegalMaskedStore(Ty, Alignment); 4583 } 4584 4585 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4586 Instruction *I, ElementCount VF) { 4587 // Get and ensure we have a valid memory instruction. 4588 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4589 4590 auto *Ptr = getLoadStorePointerOperand(I); 4591 auto *ScalarTy = getLoadStoreType(I); 4592 4593 // In order to be widened, the pointer should be consecutive, first of all. 4594 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4595 return false; 4596 4597 // If the instruction is a store located in a predicated block, it will be 4598 // scalarized. 4599 if (isScalarWithPredication(I, VF)) 4600 return false; 4601 4602 // If the instruction's allocated size doesn't equal it's type size, it 4603 // requires padding and will be scalarized. 4604 auto &DL = I->getModule()->getDataLayout(); 4605 if (hasIrregularType(ScalarTy, DL)) 4606 return false; 4607 4608 return true; 4609 } 4610 4611 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4612 // We should not collect Uniforms more than once per VF. Right now, 4613 // this function is called from collectUniformsAndScalars(), which 4614 // already does this check. Collecting Uniforms for VF=1 does not make any 4615 // sense. 4616 4617 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4618 "This function should not be visited twice for the same VF"); 4619 4620 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4621 // not analyze again. Uniforms.count(VF) will return 1. 4622 Uniforms[VF].clear(); 4623 4624 // We now know that the loop is vectorizable! 4625 // Collect instructions inside the loop that will remain uniform after 4626 // vectorization. 4627 4628 // Global values, params and instructions outside of current loop are out of 4629 // scope. 4630 auto isOutOfScope = [&](Value *V) -> bool { 4631 Instruction *I = dyn_cast<Instruction>(V); 4632 return (!I || !TheLoop->contains(I)); 4633 }; 4634 4635 // Worklist containing uniform instructions demanding lane 0. 4636 SetVector<Instruction *> Worklist; 4637 BasicBlock *Latch = TheLoop->getLoopLatch(); 4638 4639 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4640 // that are scalar with predication must not be considered uniform after 4641 // vectorization, because that would create an erroneous replicating region 4642 // where only a single instance out of VF should be formed. 4643 // TODO: optimize such seldom cases if found important, see PR40816. 4644 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4645 if (isOutOfScope(I)) { 4646 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4647 << *I << "\n"); 4648 return; 4649 } 4650 if (isScalarWithPredication(I, VF)) { 4651 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4652 << *I << "\n"); 4653 return; 4654 } 4655 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4656 Worklist.insert(I); 4657 }; 4658 4659 // Start with the conditional branch. If the branch condition is an 4660 // instruction contained in the loop that is only used by the branch, it is 4661 // uniform. 4662 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4663 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4664 addToWorklistIfAllowed(Cmp); 4665 4666 // Return true if all lanes perform the same memory operation, and we can 4667 // thus chose to execute only one. 
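// For example (illustrative), with %p and %c loop-invariant:
//   %v = load i32, ptr %p    ; every lane would load the same value
//   store i32 %c, ptr %p     ; every lane would store the same value
// so emitting a single scalar instance per vector iteration is sufficient.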
4668 auto isUniformMemOpUse = [&](Instruction *I) { 4669 if (!Legal->isUniformMemOp(*I)) 4670 return false; 4671 if (isa<LoadInst>(I)) 4672 // Loading the same address always produces the same result - at least 4673 // assuming aliasing and ordering which have already been checked. 4674 return true; 4675 // Storing the same value on every iteration. 4676 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 4677 }; 4678 4679 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4680 InstWidening WideningDecision = getWideningDecision(I, VF); 4681 assert(WideningDecision != CM_Unknown && 4682 "Widening decision should be ready at this moment"); 4683 4684 if (isUniformMemOpUse(I)) 4685 return true; 4686 4687 return (WideningDecision == CM_Widen || 4688 WideningDecision == CM_Widen_Reverse || 4689 WideningDecision == CM_Interleave); 4690 }; 4691 4692 // Returns true if Ptr is the pointer operand of a memory access instruction 4693 // I, I is known to not require scalarization, and the pointer is not also 4694 // stored. 4695 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4696 auto GetStoredValue = [I]() -> Value * { 4697 if (!isa<StoreInst>(I)) 4698 return nullptr; 4699 return I->getOperand(0); 4700 }; 4701 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF) && 4702 GetStoredValue() != Ptr; 4703 }; 4704 4705 // Holds a list of values which are known to have at least one uniform use. 4706 // Note that there may be other uses which aren't uniform. A "uniform use" 4707 // here is something which only demands lane 0 of the unrolled iterations; 4708 // it does not imply that all lanes produce the same value (e.g. this is not 4709 // the usual meaning of uniform) 4710 SetVector<Value *> HasUniformUse; 4711 4712 // Scan the loop for instructions which are either a) known to have only 4713 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4714 for (auto *BB : TheLoop->blocks()) 4715 for (auto &I : *BB) { 4716 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4717 switch (II->getIntrinsicID()) { 4718 case Intrinsic::sideeffect: 4719 case Intrinsic::experimental_noalias_scope_decl: 4720 case Intrinsic::assume: 4721 case Intrinsic::lifetime_start: 4722 case Intrinsic::lifetime_end: 4723 if (TheLoop->hasLoopInvariantOperands(&I)) 4724 addToWorklistIfAllowed(&I); 4725 break; 4726 default: 4727 break; 4728 } 4729 } 4730 4731 // ExtractValue instructions must be uniform, because the operands are 4732 // known to be loop-invariant. 4733 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4734 assert(isOutOfScope(EVI->getAggregateOperand()) && 4735 "Expected aggregate value to be loop invariant"); 4736 addToWorklistIfAllowed(EVI); 4737 continue; 4738 } 4739 4740 // If there's no pointer operand, there's nothing to do. 4741 auto *Ptr = getLoadStorePointerOperand(&I); 4742 if (!Ptr) 4743 continue; 4744 4745 if (isUniformMemOpUse(&I)) 4746 addToWorklistIfAllowed(&I); 4747 4748 if (isVectorizedMemAccessUse(&I, Ptr)) { 4749 assert(isUniformDecision(&I, VF) && "consistency check"); 4750 HasUniformUse.insert(Ptr); 4751 } 4752 } 4753 4754 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4755 // demanding) users. Since loops are assumed to be in LCSSA form, this 4756 // disallows uses outside the loop as well. 
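// For example (illustrative): a pointer
//   %gep = getelementptr inbounds i32, ptr %A, i64 %i
// whose only use is a load/store with a CM_Widen decision has a uniform use:
// the widened access is formed from the lane-0 address alone, so %gep is a
// candidate for the worklist populated below.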
4757 for (auto *V : HasUniformUse) { 4758 if (isOutOfScope(V)) 4759 continue; 4760 auto *I = cast<Instruction>(V); 4761 auto UsersAreMemAccesses = 4762 llvm::all_of(I->users(), [&](User *U) -> bool { 4763 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4764 }); 4765 if (UsersAreMemAccesses) 4766 addToWorklistIfAllowed(I); 4767 } 4768 4769 // Expand Worklist in topological order: whenever a new instruction 4770 // is added , its users should be already inside Worklist. It ensures 4771 // a uniform instruction will only be used by uniform instructions. 4772 unsigned idx = 0; 4773 while (idx != Worklist.size()) { 4774 Instruction *I = Worklist[idx++]; 4775 4776 for (auto *OV : I->operand_values()) { 4777 // isOutOfScope operands cannot be uniform instructions. 4778 if (isOutOfScope(OV)) 4779 continue; 4780 // First order recurrence Phi's should typically be considered 4781 // non-uniform. 4782 auto *OP = dyn_cast<PHINode>(OV); 4783 if (OP && Legal->isFixedOrderRecurrence(OP)) 4784 continue; 4785 // If all the users of the operand are uniform, then add the 4786 // operand into the uniform worklist. 4787 auto *OI = cast<Instruction>(OV); 4788 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4789 auto *J = cast<Instruction>(U); 4790 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4791 })) 4792 addToWorklistIfAllowed(OI); 4793 } 4794 } 4795 4796 // For an instruction to be added into Worklist above, all its users inside 4797 // the loop should also be in Worklist. However, this condition cannot be 4798 // true for phi nodes that form a cyclic dependence. We must process phi 4799 // nodes separately. An induction variable will remain uniform if all users 4800 // of the induction variable and induction variable update remain uniform. 4801 // The code below handles both pointer and non-pointer induction variables. 4802 for (const auto &Induction : Legal->getInductionVars()) { 4803 auto *Ind = Induction.first; 4804 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4805 4806 // Determine if all users of the induction variable are uniform after 4807 // vectorization. 4808 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4809 auto *I = cast<Instruction>(U); 4810 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4811 isVectorizedMemAccessUse(I, Ind); 4812 }); 4813 if (!UniformInd) 4814 continue; 4815 4816 // Determine if all users of the induction variable update instruction are 4817 // uniform after vectorization. 4818 auto UniformIndUpdate = 4819 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4820 auto *I = cast<Instruction>(U); 4821 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4822 isVectorizedMemAccessUse(I, IndUpdate); 4823 }); 4824 if (!UniformIndUpdate) 4825 continue; 4826 4827 // The induction variable and its update instruction will remain uniform. 4828 addToWorklistIfAllowed(Ind); 4829 addToWorklistIfAllowed(IndUpdate); 4830 } 4831 4832 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4833 } 4834 4835 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4836 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4837 4838 if (Legal->getRuntimePointerChecking()->Need) { 4839 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4840 "runtime pointer checks needed. 
Enable vectorization of this " 4841 "loop with '#pragma clang loop vectorize(enable)' when " 4842 "compiling with -Os/-Oz", 4843 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4844 return true; 4845 } 4846 4847 if (!PSE.getPredicate().isAlwaysTrue()) { 4848 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4849 "runtime SCEV checks needed. Enable vectorization of this " 4850 "loop with '#pragma clang loop vectorize(enable)' when " 4851 "compiling with -Os/-Oz", 4852 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4853 return true; 4854 } 4855 4856 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4857 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4858 reportVectorizationFailure("Runtime stride check for small trip count", 4859 "runtime stride == 1 checks needed. Enable vectorization of " 4860 "this loop without such check by compiling with -Os/-Oz", 4861 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4862 return true; 4863 } 4864 4865 return false; 4866 } 4867 4868 ElementCount 4869 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4870 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4871 return ElementCount::getScalable(0); 4872 4873 if (Hints->isScalableVectorizationDisabled()) { 4874 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4875 "ScalableVectorizationDisabled", ORE, TheLoop); 4876 return ElementCount::getScalable(0); 4877 } 4878 4879 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4880 4881 auto MaxScalableVF = ElementCount::getScalable( 4882 std::numeric_limits<ElementCount::ScalarTy>::max()); 4883 4884 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4885 // FIXME: While for scalable vectors this is currently sufficient, this should 4886 // be replaced by a more detailed mechanism that filters out specific VFs, 4887 // instead of invalidating vectorization for a whole set of VFs based on the 4888 // MaxVF. 4889 4890 // Disable scalable vectorization if the loop contains unsupported reductions. 4891 if (!canVectorizeReductions(MaxScalableVF)) { 4892 reportVectorizationInfo( 4893 "Scalable vectorization not supported for the reduction " 4894 "operations found in this loop.", 4895 "ScalableVFUnfeasible", ORE, TheLoop); 4896 return ElementCount::getScalable(0); 4897 } 4898 4899 // Disable scalable vectorization if the loop contains any instructions 4900 // with element types not supported for scalable vectors. 4901 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4902 return !Ty->isVoidTy() && 4903 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4904 })) { 4905 reportVectorizationInfo("Scalable vectorization is not supported " 4906 "for all element types found in this loop.", 4907 "ScalableVFUnfeasible", ORE, TheLoop); 4908 return ElementCount::getScalable(0); 4909 } 4910 4911 if (Legal->isSafeForAnyVectorWidth()) 4912 return MaxScalableVF; 4913 4914 // Limit MaxScalableVF by the maximum safe dependence distance. 4915 std::optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4916 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4917 MaxVScale = 4918 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4919 MaxScalableVF = 4920 ElementCount::getScalable(MaxVScale ? 
(MaxSafeElements / *MaxVScale) : 0); 4921 if (!MaxScalableVF) 4922 reportVectorizationInfo( 4923 "Max legal vector width too small, scalable vectorization " 4924 "unfeasible.", 4925 "ScalableVFUnfeasible", ORE, TheLoop); 4926 4927 return MaxScalableVF; 4928 } 4929 4930 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4931 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4932 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4933 unsigned SmallestType, WidestType; 4934 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4935 4936 // Get the maximum safe dependence distance in bits computed by LAA. 4937 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4938 // the memory accesses that is most restrictive (involved in the smallest 4939 // dependence distance). 4940 unsigned MaxSafeElements = 4941 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4942 4943 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4944 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4945 4946 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4947 << ".\n"); 4948 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4949 << ".\n"); 4950 4951 // First analyze the UserVF, fall back if the UserVF should be ignored. 4952 if (UserVF) { 4953 auto MaxSafeUserVF = 4954 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4955 4956 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4957 // If `VF=vscale x N` is safe, then so is `VF=N` 4958 if (UserVF.isScalable()) 4959 return FixedScalableVFPair( 4960 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4961 else 4962 return UserVF; 4963 } 4964 4965 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4966 4967 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4968 // is better to ignore the hint and let the compiler choose a suitable VF. 4969 if (!UserVF.isScalable()) { 4970 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4971 << " is unsafe, clamping to max safe VF=" 4972 << MaxSafeFixedVF << ".\n"); 4973 ORE->emit([&]() { 4974 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4975 TheLoop->getStartLoc(), 4976 TheLoop->getHeader()) 4977 << "User-specified vectorization factor " 4978 << ore::NV("UserVectorizationFactor", UserVF) 4979 << " is unsafe, clamping to maximum safe vectorization factor " 4980 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4981 }); 4982 return MaxSafeFixedVF; 4983 } 4984 4985 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4986 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4987 << " is ignored because scalable vectors are not " 4988 "available.\n"); 4989 ORE->emit([&]() { 4990 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4991 TheLoop->getStartLoc(), 4992 TheLoop->getHeader()) 4993 << "User-specified vectorization factor " 4994 << ore::NV("UserVectorizationFactor", UserVF) 4995 << " is ignored because the target does not support scalable " 4996 "vectors. The compiler will pick a more suitable value."; 4997 }); 4998 } else { 4999 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5000 << " is unsafe. 
Ignoring scalable UserVF.\n"); 5001 ORE->emit([&]() { 5002 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5003 TheLoop->getStartLoc(), 5004 TheLoop->getHeader()) 5005 << "User-specified vectorization factor " 5006 << ore::NV("UserVectorizationFactor", UserVF) 5007 << " is unsafe. Ignoring the hint to let the compiler pick a " 5008 "more suitable value."; 5009 }); 5010 } 5011 } 5012 5013 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5014 << " / " << WidestType << " bits.\n"); 5015 5016 FixedScalableVFPair Result(ElementCount::getFixed(1), 5017 ElementCount::getScalable(0)); 5018 if (auto MaxVF = 5019 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5020 MaxSafeFixedVF, FoldTailByMasking)) 5021 Result.FixedVF = MaxVF; 5022 5023 if (auto MaxVF = 5024 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5025 MaxSafeScalableVF, FoldTailByMasking)) 5026 if (MaxVF.isScalable()) { 5027 Result.ScalableVF = MaxVF; 5028 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5029 << "\n"); 5030 } 5031 5032 return Result; 5033 } 5034 5035 FixedScalableVFPair 5036 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5037 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5038 // TODO: It may by useful to do since it's still likely to be dynamically 5039 // uniform if the target can skip. 5040 reportVectorizationFailure( 5041 "Not inserting runtime ptr check for divergent target", 5042 "runtime pointer checks needed. Not enabled for divergent target", 5043 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5044 return FixedScalableVFPair::getNone(); 5045 } 5046 5047 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5048 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5049 if (TC == 1) { 5050 reportVectorizationFailure("Single iteration (non) loop", 5051 "loop trip count is one, irrelevant for vectorization", 5052 "SingleIterationLoop", ORE, TheLoop); 5053 return FixedScalableVFPair::getNone(); 5054 } 5055 5056 switch (ScalarEpilogueStatus) { 5057 case CM_ScalarEpilogueAllowed: 5058 return computeFeasibleMaxVF(TC, UserVF, false); 5059 case CM_ScalarEpilogueNotAllowedUsePredicate: 5060 [[fallthrough]]; 5061 case CM_ScalarEpilogueNotNeededUsePredicate: 5062 LLVM_DEBUG( 5063 dbgs() << "LV: vector predicate hint/switch found.\n" 5064 << "LV: Not allowing scalar epilogue, creating predicated " 5065 << "vector loop.\n"); 5066 break; 5067 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5068 // fallthrough as a special case of OptForSize 5069 case CM_ScalarEpilogueNotAllowedOptSize: 5070 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5071 LLVM_DEBUG( 5072 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5073 else 5074 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5075 << "count.\n"); 5076 5077 // Bail if runtime checks are required, which are not good when optimising 5078 // for size. 5079 if (runtimeChecksRequired()) 5080 return FixedScalableVFPair::getNone(); 5081 5082 break; 5083 } 5084 5085 // The only loops we can vectorize without a scalar epilogue, are loops with 5086 // a bottom-test and a single exiting block. We'd have to handle the fact 5087 // that not every instruction executes on the last iteration. This will 5088 // require a lane mask which varies through the vector loop body. 
(TODO)
5089 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5090 // If there was a tail-folding hint/switch, but we can't fold the tail by
5091 // masking, fall back to a vectorization with a scalar epilogue.
5092 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5093 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5094 "scalar epilogue instead.\n");
5095 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5096 return computeFeasibleMaxVF(TC, UserVF, false);
5097 }
5098 return FixedScalableVFPair::getNone();
5099 }
5100
5101 // Now try the tail folding.
5102
5103 // Invalidate interleave groups that require an epilogue if we can't mask
5104 // the interleave-group.
5105 if (!useMaskedInterleavedAccesses(TTI)) {
5106 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5107 "No decisions should have been taken at this point");
5108 // Note: There is no need to invalidate any cost modeling decisions here, as
5109 // none were taken so far.
5110 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5111 }
5112
5113 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5114 // Avoid tail folding if the trip count is known to be a multiple of any VF
5115 // we choose.
5116 // FIXME: The condition below pessimises the case for fixed-width vectors,
5117 // when scalable VFs are also candidates for vectorization.
5118 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5119 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5120 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5121 "MaxFixedVF must be a power of 2");
5122 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5123 : MaxFixedVF.getFixedValue();
5124 ScalarEvolution *SE = PSE.getSE();
5125 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5126 const SCEV *ExitCount = SE->getAddExpr(
5127 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5128 const SCEV *Rem = SE->getURemExpr(
5129 SE->applyLoopGuards(ExitCount, TheLoop),
5130 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5131 if (Rem->isZero()) {
5132 // Accept MaxFixedVF if we do not have a tail.
5133 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5134 return MaxFactors;
5135 }
5136 }
5137
5138 // If we don't know the precise trip count, or if the trip count that we
5139 // found modulo the vectorization factor is not zero, try to fold the tail
5140 // by masking.
5141 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5142 if (Legal->prepareToFoldTailByMasking()) {
5143 FoldTailByMasking = true;
5144 return MaxFactors;
5145 }
5146
5147 // If there was a tail-folding hint/switch, but we can't fold the tail by
5148 // masking, fall back to a vectorization with a scalar epilogue.
5149 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5150 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5151 "scalar epilogue instead.\n"); 5152 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5153 return MaxFactors; 5154 } 5155 5156 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5157 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5158 return FixedScalableVFPair::getNone(); 5159 } 5160 5161 if (TC == 0) { 5162 reportVectorizationFailure( 5163 "Unable to calculate the loop count due to complex control flow", 5164 "unable to calculate the loop count due to complex control flow", 5165 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5166 return FixedScalableVFPair::getNone(); 5167 } 5168 5169 reportVectorizationFailure( 5170 "Cannot optimize for size and vectorize at the same time.", 5171 "cannot optimize for size and vectorize at the same time. " 5172 "Enable vectorization of this loop with '#pragma clang loop " 5173 "vectorize(enable)' when compiling with -Os/-Oz", 5174 "NoTailLoopWithOptForSize", ORE, TheLoop); 5175 return FixedScalableVFPair::getNone(); 5176 } 5177 5178 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5179 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5180 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5181 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5182 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 5183 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5184 : TargetTransformInfo::RGK_FixedWidthVector); 5185 5186 // Convenience function to return the minimum of two ElementCounts. 5187 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5188 assert((LHS.isScalable() == RHS.isScalable()) && 5189 "Scalable flags must match"); 5190 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5191 }; 5192 5193 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5194 // Note that both WidestRegister and WidestType may not be a powers of 2. 5195 auto MaxVectorElementCount = ElementCount::get( 5196 PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType), 5197 ComputeScalableMaxVF); 5198 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5199 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5200 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5201 5202 if (!MaxVectorElementCount) { 5203 LLVM_DEBUG(dbgs() << "LV: The target has no " 5204 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5205 << " vector registers.\n"); 5206 return ElementCount::getFixed(1); 5207 } 5208 5209 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 5210 if (MaxVectorElementCount.isScalable() && 5211 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5212 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5213 auto Min = Attr.getVScaleRangeMin(); 5214 WidestRegisterMinEC *= Min; 5215 } 5216 if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && 5217 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5218 // If loop trip count (TC) is known at compile time there is no point in 5219 // choosing VF greater than TC (as done in the loop below). Select maximum 5220 // power of two which doesn't exceed TC. 5221 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5222 // when the TC is less than or equal to the known number of lanes. 
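// For example (illustrative): with a constant trip count of 12, a target
// whose widest register holds 16 of the widest elements, and no tail folding,
// the VF is clamped to the fixed value 8, the largest power of two that does
// not exceed the trip count.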
5223 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5224 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5225 "exceeding the constant trip count: " 5226 << ClampedConstTripCount << "\n"); 5227 return ElementCount::getFixed(ClampedConstTripCount); 5228 } 5229 5230 TargetTransformInfo::RegisterKind RegKind = 5231 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5232 : TargetTransformInfo::RGK_FixedWidthVector; 5233 ElementCount MaxVF = MaxVectorElementCount; 5234 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5235 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5236 auto MaxVectorElementCountMaxBW = ElementCount::get( 5237 PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType), 5238 ComputeScalableMaxVF); 5239 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5240 5241 // Collect all viable vectorization factors larger than the default MaxVF 5242 // (i.e. MaxVectorElementCount). 5243 SmallVector<ElementCount, 8> VFs; 5244 for (ElementCount VS = MaxVectorElementCount * 2; 5245 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5246 VFs.push_back(VS); 5247 5248 // For each VF calculate its register usage. 5249 auto RUs = calculateRegisterUsage(VFs); 5250 5251 // Select the largest VF which doesn't require more registers than existing 5252 // ones. 5253 for (int i = RUs.size() - 1; i >= 0; --i) { 5254 bool Selected = true; 5255 for (auto &pair : RUs[i].MaxLocalUsers) { 5256 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5257 if (pair.second > TargetNumRegisters) 5258 Selected = false; 5259 } 5260 if (Selected) { 5261 MaxVF = VFs[i]; 5262 break; 5263 } 5264 } 5265 if (ElementCount MinVF = 5266 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5267 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5268 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5269 << ") with target's minimum: " << MinVF << '\n'); 5270 MaxVF = MinVF; 5271 } 5272 } 5273 5274 // Invalidate any widening decisions we might have made, in case the loop 5275 // requires prediction (decided later), but we have already made some 5276 // load/store widening decisions. 5277 invalidateCostModelingDecisions(); 5278 } 5279 return MaxVF; 5280 } 5281 5282 std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5283 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5284 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5285 auto Min = Attr.getVScaleRangeMin(); 5286 auto Max = Attr.getVScaleRangeMax(); 5287 if (Max && Min == Max) 5288 return Max; 5289 } 5290 5291 return TTI.getVScaleForTuning(); 5292 } 5293 5294 bool LoopVectorizationCostModel::isMoreProfitable( 5295 const VectorizationFactor &A, const VectorizationFactor &B) const { 5296 InstructionCost CostA = A.Cost; 5297 InstructionCost CostB = B.Cost; 5298 5299 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5300 5301 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5302 MaxTripCount) { 5303 // If we are folding the tail and the trip count is a known (possibly small) 5304 // constant, the trip count will be rounded up to an integer number of 5305 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5306 // which we compare directly. 
When not folding the tail, the total cost will 5307 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5308 // approximated with the per-lane cost below instead of using the tripcount 5309 // as here. 5310 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5311 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5312 return RTCostA < RTCostB; 5313 } 5314 5315 // Improve estimate for the vector width if it is scalable. 5316 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5317 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5318 if (std::optional<unsigned> VScale = getVScaleForTuning()) { 5319 if (A.Width.isScalable()) 5320 EstimatedWidthA *= *VScale; 5321 if (B.Width.isScalable()) 5322 EstimatedWidthB *= *VScale; 5323 } 5324 5325 // Assume vscale may be larger than 1 (or the value being tuned for), 5326 // so that scalable vectorization is slightly favorable over fixed-width 5327 // vectorization. 5328 if (A.Width.isScalable() && !B.Width.isScalable()) 5329 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5330 5331 // To avoid the need for FP division: 5332 // (CostA / A.Width) < (CostB / B.Width) 5333 // <=> (CostA * B.Width) < (CostB * A.Width) 5334 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5335 } 5336 5337 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5338 const ElementCountSet &VFCandidates) { 5339 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5340 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5341 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5342 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5343 "Expected Scalar VF to be a candidate"); 5344 5345 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5346 ExpectedCost); 5347 VectorizationFactor ChosenFactor = ScalarCost; 5348 5349 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5350 if (ForceVectorization && VFCandidates.size() > 1) { 5351 // Ignore scalar width, because the user explicitly wants vectorization. 5352 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5353 // evaluation. 5354 ChosenFactor.Cost = InstructionCost::getMax(); 5355 } 5356 5357 SmallVector<InstructionVFPair> InvalidCosts; 5358 for (const auto &i : VFCandidates) { 5359 // The cost for scalar VF=1 is already calculated, so ignore it. 5360 if (i.isScalar()) 5361 continue; 5362 5363 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5364 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5365 5366 #ifndef NDEBUG 5367 unsigned AssumedMinimumVscale = 1; 5368 if (std::optional<unsigned> VScale = getVScaleForTuning()) 5369 AssumedMinimumVscale = *VScale; 5370 unsigned Width = 5371 Candidate.Width.isScalable() 5372 ? 
Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5373 : Candidate.Width.getFixedValue(); 5374 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5375 << " costs: " << (Candidate.Cost / Width)); 5376 if (i.isScalable()) 5377 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5378 << AssumedMinimumVscale << ")"); 5379 LLVM_DEBUG(dbgs() << ".\n"); 5380 #endif 5381 5382 if (!C.second && !ForceVectorization) { 5383 LLVM_DEBUG( 5384 dbgs() << "LV: Not considering vector loop of width " << i 5385 << " because it will not generate any vector instructions.\n"); 5386 continue; 5387 } 5388 5389 // If profitable, add it to the ProfitableVFs list. 5390 if (isMoreProfitable(Candidate, ScalarCost)) 5391 ProfitableVFs.push_back(Candidate); 5392 5393 if (isMoreProfitable(Candidate, ChosenFactor)) 5394 ChosenFactor = Candidate; 5395 } 5396 5397 // Emit a report of VFs with invalid costs in the loop. 5398 if (!InvalidCosts.empty()) { 5399 // Group the remarks per instruction, keeping the instruction order from 5400 // InvalidCosts. 5401 std::map<Instruction *, unsigned> Numbering; 5402 unsigned I = 0; 5403 for (auto &Pair : InvalidCosts) 5404 if (!Numbering.count(Pair.first)) 5405 Numbering[Pair.first] = I++; 5406 5407 // Sort the list, first on instruction (number) then on VF. 5408 llvm::sort(InvalidCosts, 5409 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5410 if (Numbering[A.first] != Numbering[B.first]) 5411 return Numbering[A.first] < Numbering[B.first]; 5412 ElementCountComparator ECC; 5413 return ECC(A.second, B.second); 5414 }); 5415 5416 // For a list of ordered instruction-vf pairs: 5417 // [(load, vf1), (load, vf2), (store, vf1)] 5418 // Group the instructions together to emit separate remarks for: 5419 // load (vf1, vf2) 5420 // store (vf1) 5421 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5422 auto Subset = ArrayRef<InstructionVFPair>(); 5423 do { 5424 if (Subset.empty()) 5425 Subset = Tail.take_front(1); 5426 5427 Instruction *I = Subset.front().first; 5428 5429 // If the next instruction is different, or if there are no other pairs, 5430 // emit a remark for the collated subset. e.g. 5431 // [(load, vf1), (load, vf2)] 5432 // to emit: 5433 // remark: invalid costs for 'load' at VF=(vf1, vf2) 5434 if (Subset == Tail || Tail[Subset.size()].first != I) { 5435 std::string OutString; 5436 raw_string_ostream OS(OutString); 5437 assert(!Subset.empty() && "Unexpected empty range"); 5438 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5439 for (const auto &Pair : Subset) 5440 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5441 << Pair.second; 5442 OS << "):"; 5443 if (auto *CI = dyn_cast<CallInst>(I)) 5444 OS << " call to " << CI->getCalledFunction()->getName(); 5445 else 5446 OS << " " << I->getOpcodeName(); 5447 OS.flush(); 5448 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5449 Tail = Tail.drop_front(Subset.size()); 5450 Subset = {}; 5451 } else 5452 // Grow the subset by one element 5453 Subset = Tail.take_front(Subset.size() + 1); 5454 } while (!Tail.empty()); 5455 } 5456 5457 if (!EnableCondStoresVectorization && NumPredStores) { 5458 reportVectorizationFailure("There are conditional stores.", 5459 "store that is conditionally executed prevents vectorization", 5460 "ConditionalStore", ORE, TheLoop); 5461 ChosenFactor = ScalarCost; 5462 } 5463 5464 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5465 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() 5466 << "LV: Vectorization seems to be not beneficial, " 5467 << "but was forced by a user.\n"); 5468 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5469 return ChosenFactor; 5470 } 5471 5472 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5473 const Loop &L, ElementCount VF) const { 5474 // Cross iteration phis such as reductions need special handling and are 5475 // currently unsupported. 5476 if (any_of(L.getHeader()->phis(), 5477 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) 5478 return false; 5479 5480 // Phis with uses outside of the loop require special handling and are 5481 // currently unsupported. 5482 for (const auto &Entry : Legal->getInductionVars()) { 5483 // Look for uses of the value of the induction at the last iteration. 5484 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5485 for (User *U : PostInc->users()) 5486 if (!L.contains(cast<Instruction>(U))) 5487 return false; 5488 // Look for uses of the penultimate value of the induction. 5489 for (User *U : Entry.first->users()) 5490 if (!L.contains(cast<Instruction>(U))) 5491 return false; 5492 } 5493 5494 // Epilogue vectorization code has not been audited to ensure it handles 5495 // non-latch exits properly. It may be fine, but it needs to be audited and 5496 // tested. 5497 if (L.getExitingBlock() != L.getLoopLatch()) 5498 return false; 5499 5500 return true; 5501 } 5502 5503 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5504 const ElementCount VF) const { 5505 // FIXME: We need a much better cost-model to take different parameters such 5506 // as register pressure, code size increase and cost of extra branches into 5507 // account. For now we apply a very crude heuristic and only consider loops 5508 // with vectorization factors larger than a certain value. 5509 5510 // Allow the target to opt out entirely. 5511 if (!TTI.preferEpilogueVectorization()) 5512 return false; 5513 5514 // We also consider epilogue vectorization unprofitable for targets that don't 5515 // consider interleaving beneficial (e.g. MVE). 5516 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5517 return false; 5518 // FIXME: We should consider changing the threshold for scalable 5519 // vectors to take VScaleForTuning into account.
5520 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5521 return true; 5522 return false; 5523 } 5524 5525 VectorizationFactor 5526 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5527 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5528 VectorizationFactor Result = VectorizationFactor::Disabled(); 5529 if (!EnableEpilogueVectorization) { 5530 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5531 return Result; 5532 } 5533 5534 if (!isScalarEpilogueAllowed()) { 5535 LLVM_DEBUG( 5536 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5537 "allowed.\n";); 5538 return Result; 5539 } 5540 5541 // Not really a cost consideration, but check for unsupported cases here to 5542 // simplify the logic. 5543 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5544 LLVM_DEBUG( 5545 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5546 "not a supported candidate.\n";); 5547 return Result; 5548 } 5549 5550 if (EpilogueVectorizationForceVF > 1) { 5551 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5552 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5553 if (LVP.hasPlanWithVF(ForcedEC)) 5554 return {ForcedEC, 0, 0}; 5555 else { 5556 LLVM_DEBUG( 5557 dbgs() 5558 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5559 return Result; 5560 } 5561 } 5562 5563 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5564 TheLoop->getHeader()->getParent()->hasMinSize()) { 5565 LLVM_DEBUG( 5566 dbgs() 5567 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5568 return Result; 5569 } 5570 5571 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5572 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5573 "this loop\n"); 5574 return Result; 5575 } 5576 5577 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5578 // the main loop handles 8 lanes per iteration. We could still benefit from 5579 // vectorizing the epilogue loop with VF=4. 5580 ElementCount EstimatedRuntimeVF = MainLoopVF; 5581 if (MainLoopVF.isScalable()) { 5582 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5583 if (std::optional<unsigned> VScale = getVScaleForTuning()) 5584 EstimatedRuntimeVF *= *VScale; 5585 } 5586 5587 for (auto &NextVF : ProfitableVFs) 5588 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5589 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5590 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5591 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5592 LVP.hasPlanWithVF(NextVF.Width)) 5593 Result = NextVF; 5594 5595 if (Result != VectorizationFactor::Disabled()) 5596 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5597 << Result.Width << "\n";); 5598 return Result; 5599 } 5600 5601 std::pair<unsigned, unsigned> 5602 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5603 unsigned MinWidth = -1U; 5604 unsigned MaxWidth = 8; 5605 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5606 // For in-loop reductions, no element types are added to ElementTypesInLoop 5607 // if there are no loads/stores in the loop. In this case, check through the 5608 // reduction variables to determine the maximum width. 
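// For illustration (hypothetical loop, not derived from this code): a loop
// with no loads or stores whose only recurrence is an in-loop i16 add
// reduction fed by values extended from i8 leaves ElementTypesInLoop empty;
// the branch below then uses min(min cast width, recurrence width) =
// min(8, 16) = 8 as the width contributed by that recurrence.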
5609 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5610 // Reset MaxWidth so that we can find the smallest type used by recurrences 5611 // in the loop. 5612 MaxWidth = -1U; 5613 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 5614 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5615 // When finding the min width used by the recurrence we need to account 5616 // for casts on the input operands of the recurrence. 5617 MaxWidth = std::min<unsigned>( 5618 MaxWidth, std::min<unsigned>( 5619 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5620 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5621 } 5622 } else { 5623 for (Type *T : ElementTypesInLoop) { 5624 MinWidth = std::min<unsigned>( 5625 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5626 MaxWidth = std::max<unsigned>( 5627 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5628 } 5629 } 5630 return {MinWidth, MaxWidth}; 5631 } 5632 5633 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5634 ElementTypesInLoop.clear(); 5635 // For each block. 5636 for (BasicBlock *BB : TheLoop->blocks()) { 5637 // For each instruction in the loop. 5638 for (Instruction &I : BB->instructionsWithoutDebug()) { 5639 Type *T = I.getType(); 5640 5641 // Skip ignored values. 5642 if (ValuesToIgnore.count(&I)) 5643 continue; 5644 5645 // Only examine Loads, Stores and PHINodes. 5646 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5647 continue; 5648 5649 // Examine PHI nodes that are reduction variables. Update the type to 5650 // account for the recurrence type. 5651 if (auto *PN = dyn_cast<PHINode>(&I)) { 5652 if (!Legal->isReductionVariable(PN)) 5653 continue; 5654 const RecurrenceDescriptor &RdxDesc = 5655 Legal->getReductionVars().find(PN)->second; 5656 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5657 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5658 RdxDesc.getRecurrenceType(), 5659 TargetTransformInfo::ReductionFlags())) 5660 continue; 5661 T = RdxDesc.getRecurrenceType(); 5662 } 5663 5664 // Examine the stored values. 5665 if (auto *ST = dyn_cast<StoreInst>(&I)) 5666 T = ST->getValueOperand()->getType(); 5667 5668 assert(T->isSized() && 5669 "Expected the load/store/recurrence type to be sized"); 5670 5671 ElementTypesInLoop.insert(T); 5672 } 5673 } 5674 } 5675 5676 unsigned 5677 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5678 InstructionCost LoopCost) { 5679 // -- The interleave heuristics -- 5680 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5681 // There are many micro-architectural considerations that we can't predict 5682 // at this level. For example, frontend pressure (on decode or fetch) due to 5683 // code size, or the number and capabilities of the execution ports. 5684 // 5685 // We use the following heuristics to select the interleave count: 5686 // 1. If the code has reductions, then we interleave to break the cross 5687 // iteration dependency. 5688 // 2. If the loop is really small, then we interleave to reduce the loop 5689 // overhead. 5690 // 3. We don't interleave if we think that we will spill registers to memory 5691 // due to the increased register pressure. 5692 5693 if (!isScalarEpilogueAllowed()) 5694 return 1; 5695 5696 // We used the distance for the interleave count. 
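// For illustration (not from the original comments): a finite maximum safe
// dependence distance means the chosen VF already accounts for how far apart
// dependent accesses may safely execute, so the check below conservatively
// refuses to interleave (returns an interleave count of 1) rather than risk
// exceeding that distance.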
5697 if (Legal->getMaxSafeDepDistBytes() != -1U) 5698 return 1; 5699 5700 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5701 const bool HasReductions = !Legal->getReductionVars().empty(); 5702 // Do not interleave loops with a relatively small known or estimated trip 5703 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5704 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5705 // because with the above conditions interleaving can expose ILP and break 5706 // cross iteration dependences for reductions. 5707 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5708 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5709 return 1; 5710 5711 // If we did not calculate the cost for VF (because the user selected the VF) 5712 // then we calculate the cost of VF here. 5713 if (LoopCost == 0) { 5714 LoopCost = expectedCost(VF).first; 5715 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 5716 5717 // Loop body is free and there is no need for interleaving. 5718 if (LoopCost == 0) 5719 return 1; 5720 } 5721 5722 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5723 // We divide by these constants so assume that we have at least one 5724 // instruction that uses at least one register. 5725 for (auto& pair : R.MaxLocalUsers) { 5726 pair.second = std::max(pair.second, 1U); 5727 } 5728 5729 // We calculate the interleave count using the following formula. 5730 // Subtract the number of loop invariants from the number of available 5731 // registers. These registers are used by all of the interleaved instances. 5732 // Next, divide the remaining registers by the number of registers that is 5733 // required by the loop, in order to estimate how many parallel instances 5734 // fit without causing spills. All of this is rounded down if necessary to be 5735 // a power of two. We want power of two interleave count to simplify any 5736 // addressing operations or alignment considerations. 5737 // We also want power of two interleave counts to ensure that the induction 5738 // variable of the vector loop wraps to zero, when tail is folded by masking; 5739 // this currently happens when OptForSize, in which case IC is set to 1 above. 5740 unsigned IC = UINT_MAX; 5741 5742 for (auto& pair : R.MaxLocalUsers) { 5743 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5744 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5745 << " registers of " 5746 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5747 if (VF.isScalar()) { 5748 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5749 TargetNumRegisters = ForceTargetNumScalarRegs; 5750 } else { 5751 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5752 TargetNumRegisters = ForceTargetNumVectorRegs; 5753 } 5754 unsigned MaxLocalUsers = pair.second; 5755 unsigned LoopInvariantRegs = 0; 5756 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5757 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5758 5759 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5760 // Don't count the induction variable as interleaved. 5761 if (EnableIndVarRegisterHeur) { 5762 TmpIC = 5763 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5764 std::max(1U, (MaxLocalUsers - 1))); 5765 } 5766 5767 IC = std::min(IC, TmpIC); 5768 } 5769 5770 // Clamp the interleave ranges to reasonable counts. 
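// Worked example (assumed numbers, not taken from any target): for a register
// class with 32 registers, 4 loop-invariant values and 7 maximum local users,
// the estimate above is PowerOf2Floor((32 - 4) / 7) = 4; with
// EnableIndVarRegisterHeur the induction variable is discounted and the
// estimate becomes PowerOf2Floor((32 - 4 - 1) / max(1, 7 - 1)) = 4. The
// smallest estimate across all register classes becomes IC, which is then
// clamped against the target's maximum interleave factor below.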
5771 unsigned MaxInterleaveCount = 5772 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5773 5774 // Check if the user has overridden the max. 5775 if (VF.isScalar()) { 5776 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5777 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5778 } else { 5779 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5780 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5781 } 5782 5783 // If trip count is known or estimated compile time constant, limit the 5784 // interleave count to be less than the trip count divided by VF, provided it 5785 // is at least 1. 5786 // 5787 // For scalable vectors we can't know if interleaving is beneficial. It may 5788 // not be beneficial for small loops if none of the lanes in the second vector 5789 // iterations is enabled. However, for larger loops, there is likely to be a 5790 // similar benefit as for fixed-width vectors. For now, we choose to leave 5791 // the InterleaveCount as if vscale is '1', although if some information about 5792 // the vector is known (e.g. min vector size), we can make a better decision. 5793 if (BestKnownTC) { 5794 MaxInterleaveCount = 5795 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5796 // Make sure MaxInterleaveCount is greater than 0. 5797 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5798 } 5799 5800 assert(MaxInterleaveCount > 0 && 5801 "Maximum interleave count must be greater than 0"); 5802 5803 // Clamp the calculated IC to be between the 1 and the max interleave count 5804 // that the target and trip count allows. 5805 if (IC > MaxInterleaveCount) 5806 IC = MaxInterleaveCount; 5807 else 5808 // Make sure IC is greater than 0. 5809 IC = std::max(1u, IC); 5810 5811 assert(IC > 0 && "Interleave count must be greater than 0."); 5812 5813 // Interleave if we vectorized this loop and there is a reduction that could 5814 // benefit from interleaving. 5815 if (VF.isVector() && HasReductions) { 5816 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5817 return IC; 5818 } 5819 5820 // For any scalar loop that either requires runtime checks or predication we 5821 // are better off leaving this to the unroller. Note that if we've already 5822 // vectorized the loop we will have done the runtime check and so interleaving 5823 // won't require further checks. 5824 bool ScalarInterleavingRequiresPredication = 5825 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5826 return Legal->blockNeedsPredication(BB); 5827 })); 5828 bool ScalarInterleavingRequiresRuntimePointerCheck = 5829 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5830 5831 // We want to interleave small loops in order to reduce the loop overhead and 5832 // potentially expose ILP opportunities. 5833 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5834 << "LV: IC is " << IC << '\n' 5835 << "LV: VF is " << VF << '\n'); 5836 const bool AggressivelyInterleaveReductions = 5837 TTI.enableAggressiveInterleaving(HasReductions); 5838 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5839 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5840 // We assume that the cost overhead is 1 and we use the cost model 5841 // to estimate the cost of the loop and interleave until the cost of the 5842 // loop overhead is about 5% of the cost of the loop. 
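// Worked example (assumed numbers): with SmallLoopCost = 20 and a
// per-iteration loop cost of 3, the computation below yields
// SmallIC = min(IC, PowerOf2Floor(20 / 3)) = min(IC, 4), i.e. enough copies
// that the assumed single unit of loop overhead stays a small fraction of
// the interleaved body cost.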
5843 unsigned SmallIC = std::min( 5844 IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue())); 5845 5846 // Interleave until store/load ports (estimated by max interleave count) are 5847 // saturated. 5848 unsigned NumStores = Legal->getNumStores(); 5849 unsigned NumLoads = Legal->getNumLoads(); 5850 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5851 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5852 5853 // There is little point in interleaving for reductions containing selects 5854 // and compares when VF=1 since it may just create more overhead than it's 5855 // worth for loops with small trip counts. This is because we still have to 5856 // do the final reduction after the loop. 5857 bool HasSelectCmpReductions = 5858 HasReductions && 5859 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5860 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5861 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5862 RdxDesc.getRecurrenceKind()); 5863 }); 5864 if (HasSelectCmpReductions) { 5865 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5866 return 1; 5867 } 5868 5869 // If we have a scalar reduction (vector reductions are already dealt with 5870 // by this point), we can increase the critical path length if the loop 5871 // we're interleaving is inside another loop. For tree-wise reductions 5872 // set the limit to 2, and for ordered reductions it's best to disable 5873 // interleaving entirely. 5874 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5875 bool HasOrderedReductions = 5876 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5877 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5878 return RdxDesc.isOrdered(); 5879 }); 5880 if (HasOrderedReductions) { 5881 LLVM_DEBUG( 5882 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5883 return 1; 5884 } 5885 5886 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5887 SmallIC = std::min(SmallIC, F); 5888 StoresIC = std::min(StoresIC, F); 5889 LoadsIC = std::min(LoadsIC, F); 5890 } 5891 5892 if (EnableLoadStoreRuntimeInterleave && 5893 std::max(StoresIC, LoadsIC) > SmallIC) { 5894 LLVM_DEBUG( 5895 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5896 return std::max(StoresIC, LoadsIC); 5897 } 5898 5899 // If there are scalar reductions and TTI has enabled aggressive 5900 // interleaving for reductions, we will interleave to expose ILP. 5901 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5902 AggressivelyInterleaveReductions) { 5903 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5904 // Interleave no less than SmallIC but not as aggressive as the normal IC 5905 // to satisfy the rare situation when resources are too limited. 5906 return std::max(IC / 2, SmallIC); 5907 } else { 5908 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5909 return SmallIC; 5910 } 5911 } 5912 5913 // Interleave if this is a large loop (small loops are already dealt with by 5914 // this point) that could benefit from interleaving. 
5915 if (AggressivelyInterleaveReductions) { 5916 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5917 return IC; 5918 } 5919 5920 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5921 return 1; 5922 } 5923 5924 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5925 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5926 // This function calculates the register usage by measuring the highest number 5927 // of values that are alive at a single location. Obviously, this is a very 5928 // rough estimation. We scan the loop in topological order and 5929 // assign a number to each instruction. We use RPO to ensure that defs are 5930 // met before their users. We assume that each instruction that has in-loop 5931 // users starts an interval. We record every time that an in-loop value is 5932 // used, so we have a list of the first and last occurrences of each 5933 // instruction. Next, we transpose this data structure into a multi map that 5934 // holds the list of intervals that *end* at a specific location. This multi 5935 // map allows us to perform a linear search. We scan the instructions linearly 5936 // and record each time that a new interval starts, by placing it in a set. 5937 // If we find this value in the multi-map then we remove it from the set. 5938 // The max register usage is the maximum size of the set. 5939 // We also search for instructions that are defined outside the loop, but are 5940 // used inside the loop. We need this number separately from the max-interval 5941 // usage number because when we unroll, loop-invariant values do not take 5942 // more registers. 5943 LoopBlocksDFS DFS(TheLoop); 5944 DFS.perform(LI); 5945 5946 RegisterUsage RU; 5947 5948 // Each 'key' in the map opens a new interval. The values 5949 // of the map are the index of the 'last seen' usage of the 5950 // instruction that is the key. 5951 using IntervalMap = DenseMap<Instruction *, unsigned>; 5952 5953 // Maps an index to its instruction. 5954 SmallVector<Instruction *, 64> IdxToInstr; 5955 // Marks the end of each interval. 5956 IntervalMap EndPoint; 5957 // Saves the set of instructions that are used in the loop. 5958 SmallPtrSet<Instruction *, 8> Ends; 5959 // Saves the list of values that are used in the loop but are defined outside 5960 // the loop (not including non-instruction values such as arguments and 5961 // constants). 5962 SmallPtrSet<Instruction *, 8> LoopInvariants; 5963 5964 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5965 for (Instruction &I : BB->instructionsWithoutDebug()) { 5966 IdxToInstr.push_back(&I); 5967 5968 // Save the end location of each USE. 5969 for (Value *U : I.operands()) { 5970 auto *Instr = dyn_cast<Instruction>(U); 5971 5972 // Ignore non-instruction values such as arguments, constants, etc. 5973 // FIXME: Might need some motivation why these values are ignored. If 5974 // for example an argument is used inside the loop it will increase the 5975 // register pressure (so shouldn't we add it to LoopInvariants?). 5976 if (!Instr) 5977 continue; 5978 5979 // If this instruction is outside the loop then record it and continue. 5980 if (!TheLoop->contains(Instr)) { 5981 LoopInvariants.insert(Instr); 5982 continue; 5983 } 5984 5985 // Overwrite previous end points. 5986 EndPoint[Instr] = IdxToInstr.size(); 5987 Ends.insert(Instr); 5988 } 5989 } 5990 } 5991 5992 // Saves the list of intervals that end with the index in 'key'.
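// For illustration (hypothetical values): if EndPoint records {A -> 4, B -> 4,
// C -> 7}, the transposition below produces TransposeEnds = {4 -> [A, B],
// 7 -> [C]}, so that while scanning instruction index i every interval ending
// at i can be closed with a single lookup.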
5993 using InstrList = SmallVector<Instruction *, 2>; 5994 DenseMap<unsigned, InstrList> TransposeEnds; 5995 5996 // Transpose the EndPoints to a list of values that end at each index. 5997 for (auto &Interval : EndPoint) 5998 TransposeEnds[Interval.second].push_back(Interval.first); 5999 6000 SmallPtrSet<Instruction *, 8> OpenIntervals; 6001 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6002 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6003 6004 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6005 6006 const auto &TTICapture = TTI; 6007 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6008 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6009 return 0; 6010 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6011 }; 6012 6013 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6014 Instruction *I = IdxToInstr[i]; 6015 6016 // Remove all of the instructions that end at this location. 6017 InstrList &List = TransposeEnds[i]; 6018 for (Instruction *ToRemove : List) 6019 OpenIntervals.erase(ToRemove); 6020 6021 // Ignore instructions that are never used within the loop. 6022 if (!Ends.count(I)) 6023 continue; 6024 6025 // Skip ignored values. 6026 if (ValuesToIgnore.count(I)) 6027 continue; 6028 6029 // For each VF find the maximum usage of registers. 6030 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6031 // Count the number of registers used, per register class, given all open 6032 // intervals. 6033 // Note that elements in this SmallMapVector will be default constructed 6034 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 6035 // there is no previous entry for ClassID. 6036 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6037 6038 if (VFs[j].isScalar()) { 6039 for (auto *Inst : OpenIntervals) { 6040 unsigned ClassID = 6041 TTI.getRegisterClassForType(false, Inst->getType()); 6042 // FIXME: The target might use more than one register for the type 6043 // even in the scalar case. 6044 RegUsage[ClassID] += 1; 6045 } 6046 } else { 6047 collectUniformsAndScalars(VFs[j]); 6048 for (auto *Inst : OpenIntervals) { 6049 // Skip ignored values for VF > 1. 6050 if (VecValuesToIgnore.count(Inst)) 6051 continue; 6052 if (isScalarAfterVectorization(Inst, VFs[j])) { 6053 unsigned ClassID = 6054 TTI.getRegisterClassForType(false, Inst->getType()); 6055 // FIXME: The target might use more than one register for the type 6056 // even in the scalar case. 6057 RegUsage[ClassID] += 1; 6058 } else { 6059 unsigned ClassID = 6060 TTI.getRegisterClassForType(true, Inst->getType()); 6061 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6062 } 6063 } 6064 } 6065 6066 for (auto& pair : RegUsage) { 6067 auto &Entry = MaxUsages[j][pair.first]; 6068 Entry = std::max(Entry, pair.second); 6069 } 6070 } 6071 6072 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6073 << OpenIntervals.size() << '\n'); 6074 6075 // Add the current instruction to the list of open intervals. 6076 OpenIntervals.insert(I); 6077 } 6078 6079 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6080 // Note that elements in this SmallMapVector will be default constructed 6081 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 6082 // there is no previous entry for ClassID. 
6083 SmallMapVector<unsigned, unsigned, 4> Invariant; 6084 6085 for (auto *Inst : LoopInvariants) { 6086 // FIXME: The target might use more than one register for the type 6087 // even in the scalar case. 6088 bool IsScalar = all_of(Inst->users(), [&](User *U) { 6089 auto *I = cast<Instruction>(U); 6090 return TheLoop != LI->getLoopFor(I->getParent()) || 6091 isScalarAfterVectorization(I, VFs[i]); 6092 }); 6093 6094 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; 6095 unsigned ClassID = 6096 TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); 6097 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); 6098 } 6099 6100 LLVM_DEBUG({ 6101 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6102 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6103 << " item\n"; 6104 for (const auto &pair : MaxUsages[i]) { 6105 dbgs() << "LV(REG): RegisterClass: " 6106 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6107 << " registers\n"; 6108 } 6109 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6110 << " item\n"; 6111 for (const auto &pair : Invariant) { 6112 dbgs() << "LV(REG): RegisterClass: " 6113 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6114 << " registers\n"; 6115 } 6116 }); 6117 6118 RU.LoopInvariantRegs = Invariant; 6119 RU.MaxLocalUsers = MaxUsages[i]; 6120 RUs[i] = RU; 6121 } 6122 6123 return RUs; 6124 } 6125 6126 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6127 ElementCount VF) { 6128 // TODO: Cost model for emulated masked load/store is completely 6129 // broken. This hack guides the cost model to use an artificially 6130 // high enough value to practically disable vectorization with such 6131 // operations, except where previously deployed legality hack allowed 6132 // using very low cost values. This is to avoid regressions coming simply 6133 // from moving "masked load/store" check from legality to cost model. 6134 // Masked Load/Gather emulation was previously never allowed. 6135 // Limited number of Masked Store/Scatter emulation was allowed. 6136 assert((isPredicatedInst(I)) && 6137 "Expecting a scalar emulated instruction"); 6138 return isa<LoadInst>(I) || 6139 (isa<StoreInst>(I) && 6140 NumPredStores > NumberOfStoresToPredicate); 6141 } 6142 6143 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6144 // If we aren't vectorizing the loop, or if we've already collected the 6145 // instructions to scalarize, there's nothing to do. Collection may already 6146 // have occurred if we have a user-selected VF and are now computing the 6147 // expected cost for interleaving. 6148 if (VF.isScalar() || VF.isZero() || 6149 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6150 return; 6151 6152 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6153 // not profitable to scalarize any instructions, the presence of VF in the 6154 // map will indicate that we've analyzed it already. 6155 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6156 6157 PredicatedBBsAfterVectorization[VF].clear(); 6158 6159 // Find all the instructions that are scalar with predication in the loop and 6160 // determine if it would be better to not if-convert the blocks they are in. 6161 // If so, we also record the instructions to scalarize. 
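// For instance (hypothetical case, not from the original comments): a divide
// that only executes under a loop-variant condition is scalar with
// predication; the walk below asks whether keeping it, together with the
// single-use chain feeding it, scalar inside a predicated block is cheaper
// than widening it and paying the predication/scalarization overhead, and
// records the chain's costs if so.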
6162 for (BasicBlock *BB : TheLoop->blocks()) { 6163 if (!blockNeedsPredicationForAnyReason(BB)) 6164 continue; 6165 for (Instruction &I : *BB) 6166 if (isScalarWithPredication(&I, VF)) { 6167 ScalarCostsTy ScalarCosts; 6168 // Do not apply discount if scalable, because that would lead to 6169 // invalid scalarization costs. 6170 // Do not apply discount logic if hacked cost is needed 6171 // for emulated masked memrefs. 6172 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6173 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6174 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6175 // Remember that BB will remain after vectorization. 6176 PredicatedBBsAfterVectorization[VF].insert(BB); 6177 } 6178 } 6179 } 6180 6181 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 6182 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6183 assert(!isUniformAfterVectorization(PredInst, VF) && 6184 "Instruction marked uniform-after-vectorization will be predicated"); 6185 6186 // Initialize the discount to zero, meaning that the scalar version and the 6187 // vector version cost the same. 6188 InstructionCost Discount = 0; 6189 6190 // Holds instructions to analyze. The instructions we visit are mapped in 6191 // ScalarCosts. Those instructions are the ones that would be scalarized if 6192 // we find that the scalar version costs less. 6193 SmallVector<Instruction *, 8> Worklist; 6194 6195 // Returns true if the given instruction can be scalarized. 6196 auto canBeScalarized = [&](Instruction *I) -> bool { 6197 // We only attempt to scalarize instructions forming a single-use chain 6198 // from the original predicated block that would otherwise be vectorized. 6199 // Although not strictly necessary, we give up on instructions we know will 6200 // already be scalar to avoid traversing chains that are unlikely to be 6201 // beneficial. 6202 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6203 isScalarAfterVectorization(I, VF)) 6204 return false; 6205 6206 // If the instruction is scalar with predication, it will be analyzed 6207 // separately. We ignore it within the context of PredInst. 6208 if (isScalarWithPredication(I, VF)) 6209 return false; 6210 6211 // If any of the instruction's operands are uniform after vectorization, 6212 // the instruction cannot be scalarized. This prevents, for example, a 6213 // masked load from being scalarized. 6214 // 6215 // We assume we will only emit a value for lane zero of an instruction 6216 // marked uniform after vectorization, rather than VF identical values. 6217 // Thus, if we scalarize an instruction that uses a uniform, we would 6218 // create uses of values corresponding to the lanes we aren't emitting code 6219 // for. This behavior can be changed by allowing getScalarValue to clone 6220 // the lane zero values for uniforms rather than asserting. 6221 for (Use &U : I->operands()) 6222 if (auto *J = dyn_cast<Instruction>(U.get())) 6223 if (isUniformAfterVectorization(J, VF)) 6224 return false; 6225 6226 // Otherwise, we can scalarize the instruction. 6227 return true; 6228 }; 6229 6230 // Compute the expected cost discount from scalarizing the entire expression 6231 // feeding the predicated instruction. We currently only consider expressions 6232 // that are single-use instruction chains. 
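// Worked example (assumed costs): if the widened forms of a chain accumulate
// VectorCost = 10 and the scalarized forms, after scaling by the
// predicated-block probability, accumulate ScalarCost = 7, the loop below
// finishes with Discount = 10 - 7 = 3; a non-negative discount tells the
// caller that scalarizing the chain is the cheaper option.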
6233 Worklist.push_back(PredInst); 6234 while (!Worklist.empty()) { 6235 Instruction *I = Worklist.pop_back_val(); 6236 6237 // If we've already analyzed the instruction, there's nothing to do. 6238 if (ScalarCosts.find(I) != ScalarCosts.end()) 6239 continue; 6240 6241 // Compute the cost of the vector instruction. Note that this cost already 6242 // includes the scalarization overhead of the predicated instruction. 6243 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6244 6245 // Compute the cost of the scalarized instruction. This cost is the cost of 6246 // the instruction as if it wasn't if-converted and instead remained in the 6247 // predicated block. We will scale this cost by block probability after 6248 // computing the scalarization overhead. 6249 InstructionCost ScalarCost = 6250 VF.getFixedValue() * 6251 getInstructionCost(I, ElementCount::getFixed(1)).first; 6252 6253 // Compute the scalarization overhead of needed insertelement instructions 6254 // and phi nodes. 6255 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6256 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6257 ScalarCost += TTI.getScalarizationOverhead( 6258 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6259 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 6260 /*Extract*/ false, CostKind); 6261 ScalarCost += 6262 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6263 } 6264 6265 // Compute the scalarization overhead of needed extractelement 6266 // instructions. For each of the instruction's operands, if the operand can 6267 // be scalarized, add it to the worklist; otherwise, account for the 6268 // overhead. 6269 for (Use &U : I->operands()) 6270 if (auto *J = dyn_cast<Instruction>(U.get())) { 6271 assert(VectorType::isValidElementType(J->getType()) && 6272 "Instruction has non-scalar type"); 6273 if (canBeScalarized(J)) 6274 Worklist.push_back(J); 6275 else if (needsExtract(J, VF)) { 6276 ScalarCost += TTI.getScalarizationOverhead( 6277 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6278 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 6279 /*Extract*/ true, CostKind); 6280 } 6281 } 6282 6283 // Scale the total scalar cost by block probability. 6284 ScalarCost /= getReciprocalPredBlockProb(); 6285 6286 // Compute the discount. A non-negative discount means the vector version 6287 // of the instruction costs more, and scalarizing would be beneficial. 6288 Discount += VectorCost - ScalarCost; 6289 ScalarCosts[I] = ScalarCost; 6290 } 6291 6292 return Discount; 6293 } 6294 6295 LoopVectorizationCostModel::VectorizationCostTy 6296 LoopVectorizationCostModel::expectedCost( 6297 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6298 VectorizationCostTy Cost; 6299 6300 // For each block. 6301 for (BasicBlock *BB : TheLoop->blocks()) { 6302 VectorizationCostTy BlockCost; 6303 6304 // For each instruction in the old loop. 6305 for (Instruction &I : BB->instructionsWithoutDebug()) { 6306 // Skip ignored values. 6307 if (ValuesToIgnore.count(&I) || 6308 (VF.isVector() && VecValuesToIgnore.count(&I))) 6309 continue; 6310 6311 VectorizationCostTy C = getInstructionCost(&I, VF); 6312 6313 // Check if we should override the cost. 6314 if (C.first.isValid() && 6315 ForceTargetInstructionCost.getNumOccurrences() > 0) 6316 C.first = InstructionCost(ForceTargetInstructionCost); 6317 6318 // Keep a list of instructions with invalid costs. 
6319 if (Invalid && !C.first.isValid()) 6320 Invalid->emplace_back(&I, VF); 6321 6322 BlockCost.first += C.first; 6323 BlockCost.second |= C.second; 6324 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6325 << " for VF " << VF << " For instruction: " << I 6326 << '\n'); 6327 } 6328 6329 // If we are vectorizing a predicated block, it will have been 6330 // if-converted. This means that the block's instructions (aside from 6331 // stores and instructions that may divide by zero) will now be 6332 // unconditionally executed. For the scalar case, we may not always execute 6333 // the predicated block, if it is an if-else block. Thus, scale the block's 6334 // cost by the probability of executing it. blockNeedsPredication from 6335 // Legal is used so as to not include all blocks in tail folded loops. 6336 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6337 BlockCost.first /= getReciprocalPredBlockProb(); 6338 6339 Cost.first += BlockCost.first; 6340 Cost.second |= BlockCost.second; 6341 } 6342 6343 return Cost; 6344 } 6345 6346 /// Gets Address Access SCEV after verifying that the access pattern 6347 /// is loop invariant except the induction variable dependence. 6348 /// 6349 /// This SCEV can be sent to the Target in order to estimate the address 6350 /// calculation cost. 6351 static const SCEV *getAddressAccessSCEV( 6352 Value *Ptr, 6353 LoopVectorizationLegality *Legal, 6354 PredicatedScalarEvolution &PSE, 6355 const Loop *TheLoop) { 6356 6357 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6358 if (!Gep) 6359 return nullptr; 6360 6361 // We are looking for a gep with all loop invariant indices except for one 6362 // which should be an induction variable. 6363 auto SE = PSE.getSE(); 6364 unsigned NumOperands = Gep->getNumOperands(); 6365 for (unsigned i = 1; i < NumOperands; ++i) { 6366 Value *Opd = Gep->getOperand(i); 6367 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6368 !Legal->isInductionVariable(Opd)) 6369 return nullptr; 6370 } 6371 6372 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6373 return PSE.getSCEV(Ptr); 6374 } 6375 6376 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6377 return Legal->hasStride(I->getOperand(0)) || 6378 Legal->hasStride(I->getOperand(1)); 6379 } 6380 6381 InstructionCost 6382 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6383 ElementCount VF) { 6384 assert(VF.isVector() && 6385 "Scalarization cost of instruction implies vectorization."); 6386 if (VF.isScalable()) 6387 return InstructionCost::getInvalid(); 6388 6389 Type *ValTy = getLoadStoreType(I); 6390 auto SE = PSE.getSE(); 6391 6392 unsigned AS = getLoadStoreAddressSpace(I); 6393 Value *Ptr = getLoadStorePointerOperand(I); 6394 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6395 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6396 // that it is being called from this specific place. 6397 6398 // Figure out whether the access is strided and get the stride value 6399 // if it's known in compile time 6400 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6401 6402 // Get the cost of the scalar memory instruction and address computation. 6403 InstructionCost Cost = 6404 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6405 6406 // Don't pass *I here, since it is scalar but will actually be part of a 6407 // vectorized loop where the user of it is a vectorized instruction. 
6408 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6409 const Align Alignment = getLoadStoreAlignment(I); 6410 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 6411 ValTy->getScalarType(), 6412 Alignment, AS, CostKind); 6413 6414 // Get the overhead of the extractelement and insertelement instructions 6415 // we might create due to scalarization. 6416 Cost += getScalarizationOverhead(I, VF, CostKind); 6417 6418 // If we have a predicated load/store, it will need extra i1 extracts and 6419 // conditional branches, but may not be executed for each vector lane. Scale 6420 // the cost by the probability of executing the predicated block. 6421 if (isPredicatedInst(I)) { 6422 Cost /= getReciprocalPredBlockProb(); 6423 6424 // Add the cost of an i1 extract and a branch 6425 auto *Vec_i1Ty = 6426 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6427 Cost += TTI.getScalarizationOverhead( 6428 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6429 /*Insert=*/false, /*Extract=*/true, CostKind); 6430 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 6431 6432 if (useEmulatedMaskMemRefHack(I, VF)) 6433 // Artificially setting to a high enough value to practically disable 6434 // vectorization with such operations. 6435 Cost = 3000000; 6436 } 6437 6438 return Cost; 6439 } 6440 6441 InstructionCost 6442 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6443 ElementCount VF) { 6444 Type *ValTy = getLoadStoreType(I); 6445 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6446 Value *Ptr = getLoadStorePointerOperand(I); 6447 unsigned AS = getLoadStoreAddressSpace(I); 6448 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6449 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6450 6451 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6452 "Stride should be 1 or -1 for consecutive memory access"); 6453 const Align Alignment = getLoadStoreAlignment(I); 6454 InstructionCost Cost = 0; 6455 if (Legal->isMaskRequired(I)) { 6456 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6457 CostKind); 6458 } else { 6459 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6460 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6461 CostKind, OpInfo, I); 6462 } 6463 6464 bool Reverse = ConsecutiveStride < 0; 6465 if (Reverse) 6466 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6467 std::nullopt, CostKind, 0); 6468 return Cost; 6469 } 6470 6471 InstructionCost 6472 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6473 ElementCount VF) { 6474 assert(Legal->isUniformMemOp(*I)); 6475 6476 Type *ValTy = getLoadStoreType(I); 6477 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6478 const Align Alignment = getLoadStoreAlignment(I); 6479 unsigned AS = getLoadStoreAddressSpace(I); 6480 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6481 if (isa<LoadInst>(I)) { 6482 return TTI.getAddressComputationCost(ValTy) + 6483 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6484 CostKind) + 6485 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6486 } 6487 StoreInst *SI = cast<StoreInst>(I); 6488 6489 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6490 return TTI.getAddressComputationCost(ValTy) + 6491 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6492 CostKind) + 6493 (isLoopInvariantStoreValue 6494 ? 
0 6495 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6496 CostKind, VF.getKnownMinValue() - 1)); 6497 } 6498 6499 InstructionCost 6500 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6501 ElementCount VF) { 6502 Type *ValTy = getLoadStoreType(I); 6503 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6504 const Align Alignment = getLoadStoreAlignment(I); 6505 const Value *Ptr = getLoadStorePointerOperand(I); 6506 6507 return TTI.getAddressComputationCost(VectorTy) + 6508 TTI.getGatherScatterOpCost( 6509 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6510 TargetTransformInfo::TCK_RecipThroughput, I); 6511 } 6512 6513 InstructionCost 6514 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6515 ElementCount VF) { 6516 // TODO: Once we have support for interleaving with scalable vectors 6517 // we can calculate the cost properly here. 6518 if (VF.isScalable()) 6519 return InstructionCost::getInvalid(); 6520 6521 Type *ValTy = getLoadStoreType(I); 6522 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6523 unsigned AS = getLoadStoreAddressSpace(I); 6524 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6525 6526 auto Group = getInterleavedAccessGroup(I); 6527 assert(Group && "Fail to get an interleaved access group."); 6528 6529 unsigned InterleaveFactor = Group->getFactor(); 6530 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6531 6532 // Holds the indices of existing members in the interleaved group. 6533 SmallVector<unsigned, 4> Indices; 6534 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6535 if (Group->getMember(IF)) 6536 Indices.push_back(IF); 6537 6538 // Calculate the cost of the whole interleaved group. 6539 bool UseMaskForGaps = 6540 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6541 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6542 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6543 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6544 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); 6545 6546 if (Group->isReverse()) { 6547 // TODO: Add support for reversed masked interleaved access. 6548 assert(!Legal->isMaskRequired(I) && 6549 "Reverse masked interleaved access not supported."); 6550 Cost += Group->getNumMembers() * 6551 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6552 std::nullopt, CostKind, 0); 6553 } 6554 return Cost; 6555 } 6556 6557 std::optional<InstructionCost> 6558 LoopVectorizationCostModel::getReductionPatternCost( 6559 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6560 using namespace llvm::PatternMatch; 6561 // Early exit for no inloop reductions 6562 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6563 return std::nullopt; 6564 auto *VectorTy = cast<VectorType>(Ty); 6565 6566 // We are looking for a pattern of, and finding the minimal acceptable cost: 6567 // reduce(mul(ext(A), ext(B))) or 6568 // reduce(mul(A, B)) or 6569 // reduce(ext(A)) or 6570 // reduce(A). 6571 // The basic idea is that we walk down the tree to do that, finding the root 6572 // reduction instruction in InLoopReductionImmediateChains. From there we find 6573 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6574 // of the components. If the reduction cost is lower then we return it for the 6575 // reduction instruction and 0 for the other instructions in the pattern. 
If 6576 // it is not, we return an invalid cost specifying that the original cost 6577 // method should be used. 6578 Instruction *RetI = I; 6579 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6580 if (!RetI->hasOneUser()) 6581 return std::nullopt; 6582 RetI = RetI->user_back(); 6583 } 6584 6585 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 6586 RetI->user_back()->getOpcode() == Instruction::Add) { 6587 RetI = RetI->user_back(); 6588 } 6589 6590 // Test if the found instruction is a reduction, and if not, return an invalid 6591 // cost specifying that the parent should use the original cost modelling. 6592 if (!InLoopReductionImmediateChains.count(RetI)) 6593 return std::nullopt; 6594 6595 // Find the reduction this chain is a part of and calculate the basic cost of 6596 // the reduction on its own. 6597 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6598 Instruction *ReductionPhi = LastChain; 6599 while (!isa<PHINode>(ReductionPhi)) 6600 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6601 6602 const RecurrenceDescriptor &RdxDesc = 6603 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6604 6605 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6606 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6607 6608 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6609 // normal fmul instruction to the cost of the fadd reduction. 6610 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6611 BaseCost += 6612 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6613 6614 // If we're using ordered reductions then we can just return the base cost 6615 // here, since getArithmeticReductionCost calculates the full ordered 6616 // reduction cost when FP reassociation is not allowed. 6617 if (useOrderedReductions(RdxDesc)) 6618 return BaseCost; 6619 6620 // Get the operand that was not the reduction chain and match it to one of the 6621 // patterns, returning the better cost if it is found. 6622 Instruction *RedOp = RetI->getOperand(1) == LastChain 6623 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6624 : dyn_cast<Instruction>(RetI->getOperand(1)); 6625 6626 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6627 6628 Instruction *Op0, *Op1; 6629 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6630 match(RedOp, 6631 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6632 match(Op0, m_ZExtOrSExt(m_Value())) && 6633 Op0->getOpcode() == Op1->getOpcode() && 6634 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6635 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6636 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6637 6638 // Matched reduce.add(ext(mul(ext(A), ext(B)))) 6639 // Note that the extend opcodes need to all match, or if A==B they will have 6640 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6641 // which is equally fine.
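// Worked comparison (assumed costs): with ExtCost = 1, MulCost = 2,
// Ext2Cost = 1 and BaseCost = 4, the unfused pattern below prices at
// 1 * 2 + 2 + 1 + 4 = 9, so a fused multiply-accumulate reduction cost
// (RedCost) of, say, 6 is preferred and is returned for the reduction
// instruction, with 0 reported for the other instructions in the pattern.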
6642 bool IsUnsigned = isa<ZExtInst>(Op0); 6643 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6644 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6645 6646 InstructionCost ExtCost = 6647 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6648 TTI::CastContextHint::None, CostKind, Op0); 6649 InstructionCost MulCost = 6650 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6651 InstructionCost Ext2Cost = 6652 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6653 TTI::CastContextHint::None, CostKind, RedOp); 6654 6655 InstructionCost RedCost = TTI.getMulAccReductionCost( 6656 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6657 6658 if (RedCost.isValid() && 6659 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6660 return I == RetI ? RedCost : 0; 6661 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6662 !TheLoop->isLoopInvariant(RedOp)) { 6663 // Matched reduce(ext(A)) 6664 bool IsUnsigned = isa<ZExtInst>(RedOp); 6665 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6666 InstructionCost RedCost = TTI.getExtendedReductionCost( 6667 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6668 RdxDesc.getFastMathFlags(), CostKind); 6669 6670 InstructionCost ExtCost = 6671 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6672 TTI::CastContextHint::None, CostKind, RedOp); 6673 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6674 return I == RetI ? RedCost : 0; 6675 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6676 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6677 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6678 Op0->getOpcode() == Op1->getOpcode() && 6679 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6680 bool IsUnsigned = isa<ZExtInst>(Op0); 6681 Type *Op0Ty = Op0->getOperand(0)->getType(); 6682 Type *Op1Ty = Op1->getOperand(0)->getType(); 6683 Type *LargestOpTy = 6684 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6685 : Op0Ty; 6686 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6687 6688 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 6689 // different sizes. We take the largest type as the ext to reduce, and add 6690 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6691 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6692 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6693 TTI::CastContextHint::None, CostKind, Op0); 6694 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6695 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6696 TTI::CastContextHint::None, CostKind, Op1); 6697 InstructionCost MulCost = 6698 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6699 6700 InstructionCost RedCost = TTI.getMulAccReductionCost( 6701 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6702 InstructionCost ExtraExtCost = 0; 6703 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6704 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6705 ExtraExtCost = TTI.getCastInstrCost( 6706 ExtraExtOp->getOpcode(), ExtType, 6707 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6708 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6709 } 6710 6711 if (RedCost.isValid() && 6712 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6713 return I == RetI ? 
RedCost : 0; 6714 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6715 // Matched reduce.add(mul()) 6716 InstructionCost MulCost = 6717 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6718 6719 InstructionCost RedCost = TTI.getMulAccReductionCost( 6720 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 6721 6722 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6723 return I == RetI ? RedCost : 0; 6724 } 6725 } 6726 6727 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 6728 } 6729 6730 InstructionCost 6731 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6732 ElementCount VF) { 6733 // Calculate scalar cost only. Vectorization cost should be ready at this 6734 // moment. 6735 if (VF.isScalar()) { 6736 Type *ValTy = getLoadStoreType(I); 6737 const Align Alignment = getLoadStoreAlignment(I); 6738 unsigned AS = getLoadStoreAddressSpace(I); 6739 6740 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6741 return TTI.getAddressComputationCost(ValTy) + 6742 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6743 TTI::TCK_RecipThroughput, OpInfo, I); 6744 } 6745 return getWideningCost(I, VF); 6746 } 6747 6748 LoopVectorizationCostModel::VectorizationCostTy 6749 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6750 ElementCount VF) { 6751 // If we know that this instruction will remain uniform, check the cost of 6752 // the scalar version. 6753 if (isUniformAfterVectorization(I, VF)) 6754 VF = ElementCount::getFixed(1); 6755 6756 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6757 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6758 6759 // Forced scalars do not have any scalarization overhead. 6760 auto ForcedScalar = ForcedScalars.find(VF); 6761 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6762 auto InstSet = ForcedScalar->second; 6763 if (InstSet.count(I)) 6764 return VectorizationCostTy( 6765 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6766 VF.getKnownMinValue()), 6767 false); 6768 } 6769 6770 Type *VectorTy; 6771 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6772 6773 bool TypeNotScalarized = false; 6774 if (VF.isVector() && VectorTy->isVectorTy()) { 6775 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6776 if (VF.isScalable()) 6777 // <vscale x 1 x iN> is assumed to be profitable over iN because 6778 // scalable registers are a distinct register class from scalar ones. 6779 // If we ever find a target which wants to lower scalable vectors 6780 // back to scalars, we'll need to update this code to explicitly 6781 // ask TTI about the register class uses for each part. 6782 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6783 else 6784 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6785 } else 6786 C = InstructionCost::getInvalid(); 6787 } 6788 return VectorizationCostTy(C, TypeNotScalarized); 6789 } 6790 6791 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 6792 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 6793 6794 // There is no mechanism yet to create a scalable scalarization loop, 6795 // so this is currently Invalid. 
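// For illustration (not from the original comments): at a fixed VF such as 4,
// the overhead computed below covers up to four inserts to rebuild the vector
// result plus the extracts needed to feed each scalar copy's operands; for
// scalable VFs the lane count is unknown at compile time, hence the invalid
// cost returned first.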
6796 if (VF.isScalable()) 6797 return InstructionCost::getInvalid(); 6798 6799 if (VF.isScalar()) 6800 return 0; 6801 6802 InstructionCost Cost = 0; 6803 Type *RetTy = ToVectorTy(I->getType(), VF); 6804 if (!RetTy->isVoidTy() && 6805 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6806 Cost += TTI.getScalarizationOverhead( 6807 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), 6808 /*Insert*/ true, 6809 /*Extract*/ false, CostKind); 6810 6811 // Some targets keep addresses scalar. 6812 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6813 return Cost; 6814 6815 // Some targets support efficient element stores. 6816 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6817 return Cost; 6818 6819 // Collect operands to consider. 6820 CallInst *CI = dyn_cast<CallInst>(I); 6821 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6822 6823 // Skip operands that do not require extraction/scalarization and do not incur 6824 // any overhead. 6825 SmallVector<Type *> Tys; 6826 for (auto *V : filterExtractingOperands(Ops, VF)) 6827 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6828 return Cost + TTI.getOperandsScalarizationOverhead( 6829 filterExtractingOperands(Ops, VF), Tys, CostKind); 6830 } 6831 6832 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6833 if (VF.isScalar()) 6834 return; 6835 NumPredStores = 0; 6836 for (BasicBlock *BB : TheLoop->blocks()) { 6837 // For each instruction in the old loop. 6838 for (Instruction &I : *BB) { 6839 Value *Ptr = getLoadStorePointerOperand(&I); 6840 if (!Ptr) 6841 continue; 6842 6843 // TODO: We should generate better code and update the cost model for 6844 // predicated uniform stores. Today they are treated as any other 6845 // predicated store (see added test cases in 6846 // invariant-store-vectorization.ll). 6847 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6848 NumPredStores++; 6849 6850 if (Legal->isUniformMemOp(I)) { 6851 auto isLegalToScalarize = [&]() { 6852 if (!VF.isScalable()) 6853 // Scalarization of fixed length vectors "just works". 6854 return true; 6855 6856 // We have dedicated lowering for unpredicated uniform loads and 6857 // stores. Note that even with tail folding we know that at least 6858 // one lane is active (i.e. generalized predication is not possible 6859 // here), and the logic below depends on this fact. 6860 if (!foldTailByMasking()) 6861 return true; 6862 6863 // For scalable vectors, a uniform memop load is always 6864 // uniform-by-parts and we know how to scalarize that. 6865 if (isa<LoadInst>(I)) 6866 return true; 6867 6868 // A uniform store isn't neccessarily uniform-by-part 6869 // and we can't assume scalarization. 6870 auto &SI = cast<StoreInst>(I); 6871 return TheLoop->isLoopInvariant(SI.getValueOperand()); 6872 }; 6873 6874 const InstructionCost GatherScatterCost = 6875 isLegalGatherOrScatter(&I, VF) ? 6876 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); 6877 6878 // Load: Scalar load + broadcast 6879 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6880 // FIXME: This cost is a significant under-estimate for tail folded 6881 // memory ops. 6882 const InstructionCost ScalarizationCost = isLegalToScalarize() ? 6883 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid(); 6884 6885 // Choose better solution for the current VF, Note that Invalid 6886 // costs compare as maximumal large. 
If both are invalid, we get 6887 // scalable invalid which signals a failure and a vectorization abort. 6888 if (GatherScatterCost < ScalarizationCost) 6889 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 6890 else 6891 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 6892 continue; 6893 } 6894 6895 // We assume that widening is the best solution when possible. 6896 if (memoryInstructionCanBeWidened(&I, VF)) { 6897 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6898 int ConsecutiveStride = Legal->isConsecutivePtr( 6899 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6900 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6901 "Expected consecutive stride."); 6902 InstWidening Decision = 6903 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6904 setWideningDecision(&I, VF, Decision, Cost); 6905 continue; 6906 } 6907 6908 // Choose between Interleaving, Gather/Scatter or Scalarization. 6909 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6910 unsigned NumAccesses = 1; 6911 if (isAccessInterleaved(&I)) { 6912 auto Group = getInterleavedAccessGroup(&I); 6913 assert(Group && "Fail to get an interleaved access group."); 6914 6915 // Make one decision for the whole group. 6916 if (getWideningDecision(&I, VF) != CM_Unknown) 6917 continue; 6918 6919 NumAccesses = Group->getNumMembers(); 6920 if (interleavedAccessCanBeWidened(&I, VF)) 6921 InterleaveCost = getInterleaveGroupCost(&I, VF); 6922 } 6923 6924 InstructionCost GatherScatterCost = 6925 isLegalGatherOrScatter(&I, VF) 6926 ? getGatherScatterCost(&I, VF) * NumAccesses 6927 : InstructionCost::getInvalid(); 6928 6929 InstructionCost ScalarizationCost = 6930 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6931 6932 // Choose better solution for the current VF, 6933 // write down this decision and use it during vectorization. 6934 InstructionCost Cost; 6935 InstWidening Decision; 6936 if (InterleaveCost <= GatherScatterCost && 6937 InterleaveCost < ScalarizationCost) { 6938 Decision = CM_Interleave; 6939 Cost = InterleaveCost; 6940 } else if (GatherScatterCost < ScalarizationCost) { 6941 Decision = CM_GatherScatter; 6942 Cost = GatherScatterCost; 6943 } else { 6944 Decision = CM_Scalarize; 6945 Cost = ScalarizationCost; 6946 } 6947 // If the instructions belongs to an interleave group, the whole group 6948 // receives the same decision. The whole group receives the cost, but 6949 // the cost will actually be assigned to one instruction. 6950 if (auto Group = getInterleavedAccessGroup(&I)) 6951 setWideningDecision(Group, VF, Decision, Cost); 6952 else 6953 setWideningDecision(&I, VF, Decision, Cost); 6954 } 6955 } 6956 6957 // Make sure that any load of address and any other address computation 6958 // remains scalar unless there is gather/scatter support. This avoids 6959 // inevitable extracts into address registers, and also has the benefit of 6960 // activating LSR more, since that pass can't optimize vectorized 6961 // addresses. 6962 if (TTI.prefersVectorizedAddressing()) 6963 return; 6964 6965 // Start with all scalar pointer uses. 6966 SmallPtrSet<Instruction *, 8> AddrDefs; 6967 for (BasicBlock *BB : TheLoop->blocks()) 6968 for (Instruction &I : *BB) { 6969 Instruction *PtrDef = 6970 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6971 if (PtrDef && TheLoop->contains(PtrDef) && 6972 getWideningDecision(&I, VF) != CM_GatherScatter) 6973 AddrDefs.insert(PtrDef); 6974 } 6975 6976 // Add all instructions used to generate the addresses. 
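  // As a rough illustration, given
  //   %idx = load i64, ptr %idx.ptr
  //   %gep = getelementptr i32, ptr %base, i64 %idx
  //   %val = load i32, ptr %gep
  // both %gep and the load %idx end up in AddrDefs, so the whole address
  // chain is kept scalar when the target prefers scalar addressing.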
6977 SmallVector<Instruction *, 4> Worklist; 6978 append_range(Worklist, AddrDefs); 6979 while (!Worklist.empty()) { 6980 Instruction *I = Worklist.pop_back_val(); 6981 for (auto &Op : I->operands()) 6982 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6983 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6984 AddrDefs.insert(InstOp).second) 6985 Worklist.push_back(InstOp); 6986 } 6987 6988 for (auto *I : AddrDefs) { 6989 if (isa<LoadInst>(I)) { 6990 // Setting the desired widening decision should ideally be handled in 6991 // by cost functions, but since this involves the task of finding out 6992 // if the loaded register is involved in an address computation, it is 6993 // instead changed here when we know this is the case. 6994 InstWidening Decision = getWideningDecision(I, VF); 6995 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6996 // Scalarize a widened load of address. 6997 setWideningDecision( 6998 I, VF, CM_Scalarize, 6999 (VF.getKnownMinValue() * 7000 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7001 else if (auto Group = getInterleavedAccessGroup(I)) { 7002 // Scalarize an interleave group of address loads. 7003 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7004 if (Instruction *Member = Group->getMember(I)) 7005 setWideningDecision( 7006 Member, VF, CM_Scalarize, 7007 (VF.getKnownMinValue() * 7008 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7009 } 7010 } 7011 } else 7012 // Make sure I gets scalarized and a cost estimate without 7013 // scalarization overhead. 7014 ForcedScalars[VF].insert(I); 7015 } 7016 } 7017 7018 InstructionCost 7019 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7020 Type *&VectorTy) { 7021 Type *RetTy = I->getType(); 7022 if (canTruncateToMinimalBitwidth(I, VF)) 7023 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7024 auto SE = PSE.getSE(); 7025 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7026 7027 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7028 ElementCount VF) -> bool { 7029 if (VF.isScalar()) 7030 return true; 7031 7032 auto Scalarized = InstsToScalarize.find(VF); 7033 assert(Scalarized != InstsToScalarize.end() && 7034 "VF not yet analyzed for scalarization profitability"); 7035 return !Scalarized->second.count(I) && 7036 llvm::all_of(I->users(), [&](User *U) { 7037 auto *UI = cast<Instruction>(U); 7038 return !Scalarized->second.count(UI); 7039 }); 7040 }; 7041 (void) hasSingleCopyAfterVectorization; 7042 7043 if (isScalarAfterVectorization(I, VF)) { 7044 // With the exception of GEPs and PHIs, after scalarization there should 7045 // only be one copy of the instruction generated in the loop. This is 7046 // because the VF is either 1, or any instructions that need scalarizing 7047 // have already been dealt with by the the time we get here. As a result, 7048 // it means we don't have to multiply the instruction cost by VF. 7049 assert(I->getOpcode() == Instruction::GetElementPtr || 7050 I->getOpcode() == Instruction::PHI || 7051 (I->getOpcode() == Instruction::BitCast && 7052 I->getType()->isPointerTy()) || 7053 hasSingleCopyAfterVectorization(I, VF)); 7054 VectorTy = RetTy; 7055 } else 7056 VectorTy = ToVectorTy(RetTy, VF); 7057 7058 // TODO: We need to estimate the cost of intrinsic calls. 
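  // As a rough illustration: for an 'add i32' that is widened at VF=4,
  // VectorTy is <4 x i32> and the result below is essentially
  // TTI.getArithmeticInstrCost(Instruction::Add, <4 x i32>), modulo the
  // operand-info and reduction-pattern special cases.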
7059 switch (I->getOpcode()) { 7060 case Instruction::GetElementPtr: 7061 // We mark this instruction as zero-cost because the cost of GEPs in 7062 // vectorized code depends on whether the corresponding memory instruction 7063 // is scalarized or not. Therefore, we handle GEPs with the memory 7064 // instruction cost. 7065 return 0; 7066 case Instruction::Br: { 7067 // In cases of scalarized and predicated instructions, there will be VF 7068 // predicated blocks in the vectorized loop. Each branch around these 7069 // blocks requires also an extract of its vector compare i1 element. 7070 bool ScalarPredicatedBB = false; 7071 BranchInst *BI = cast<BranchInst>(I); 7072 if (VF.isVector() && BI->isConditional() && 7073 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 7074 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) 7075 ScalarPredicatedBB = true; 7076 7077 if (ScalarPredicatedBB) { 7078 // Not possible to scalarize scalable vector with predicated instructions. 7079 if (VF.isScalable()) 7080 return InstructionCost::getInvalid(); 7081 // Return cost for branches around scalarized and predicated blocks. 7082 auto *Vec_i1Ty = 7083 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7084 return ( 7085 TTI.getScalarizationOverhead( 7086 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), 7087 /*Insert*/ false, /*Extract*/ true, CostKind) + 7088 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7089 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7090 // The back-edge branch will remain, as will all scalar branches. 7091 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7092 else 7093 // This branch will be eliminated by if-conversion. 7094 return 0; 7095 // Note: We currently assume zero cost for an unconditional branch inside 7096 // a predicated block since it will become a fall-through, although we 7097 // may decide in the future to call TTI for all branches. 7098 } 7099 case Instruction::PHI: { 7100 auto *Phi = cast<PHINode>(I); 7101 7102 // First-order recurrences are replaced by vector shuffles inside the loop. 7103 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 7104 SmallVector<int> Mask(VF.getKnownMinValue()); 7105 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 7106 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 7107 cast<VectorType>(VectorTy), Mask, CostKind, 7108 VF.getKnownMinValue() - 1); 7109 } 7110 7111 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7112 // converted into select instructions. We require N - 1 selects per phi 7113 // node, where N is the number of incoming values. 7114 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7115 return (Phi->getNumIncomingValues() - 1) * 7116 TTI.getCmpSelInstrCost( 7117 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7118 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7119 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7120 7121 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7122 } 7123 case Instruction::UDiv: 7124 case Instruction::SDiv: 7125 case Instruction::URem: 7126 case Instruction::SRem: 7127 if (VF.isVector() && isPredicatedInst(I)) { 7128 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 7129 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 7130 ScalarCost : SafeDivisorCost; 7131 } 7132 // We've proven all lanes safe to speculate, fall through. 
7133 [[fallthrough]]; 7134 case Instruction::Add: 7135 case Instruction::FAdd: 7136 case Instruction::Sub: 7137 case Instruction::FSub: 7138 case Instruction::Mul: 7139 case Instruction::FMul: 7140 case Instruction::FDiv: 7141 case Instruction::FRem: 7142 case Instruction::Shl: 7143 case Instruction::LShr: 7144 case Instruction::AShr: 7145 case Instruction::And: 7146 case Instruction::Or: 7147 case Instruction::Xor: { 7148 // Since we will replace the stride by 1 the multiplication should go away. 7149 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7150 return 0; 7151 7152 // Detect reduction patterns 7153 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7154 return *RedCost; 7155 7156 // Certain instructions can be cheaper to vectorize if they have a constant 7157 // second vector operand. One example of this are shifts on x86. 7158 Value *Op2 = I->getOperand(1); 7159 auto Op2Info = TTI.getOperandInfo(Op2); 7160 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7161 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 7162 7163 SmallVector<const Value *, 4> Operands(I->operand_values()); 7164 return TTI.getArithmeticInstrCost( 7165 I->getOpcode(), VectorTy, CostKind, 7166 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7167 Op2Info, Operands, I); 7168 } 7169 case Instruction::FNeg: { 7170 return TTI.getArithmeticInstrCost( 7171 I->getOpcode(), VectorTy, CostKind, 7172 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7173 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7174 I->getOperand(0), I); 7175 } 7176 case Instruction::Select: { 7177 SelectInst *SI = cast<SelectInst>(I); 7178 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7179 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7180 7181 const Value *Op0, *Op1; 7182 using namespace llvm::PatternMatch; 7183 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7184 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7185 // select x, y, false --> x & y 7186 // select x, true, y --> x | y 7187 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 7188 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 7189 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7190 Op1->getType()->getScalarSizeInBits() == 1); 7191 7192 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7193 return TTI.getArithmeticInstrCost( 7194 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7195 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 7196 } 7197 7198 Type *CondTy = SI->getCondition()->getType(); 7199 if (!ScalarCond) 7200 CondTy = VectorType::get(CondTy, VF); 7201 7202 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7203 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7204 Pred = Cmp->getPredicate(); 7205 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7206 CostKind, I); 7207 } 7208 case Instruction::ICmp: 7209 case Instruction::FCmp: { 7210 Type *ValTy = I->getOperand(0)->getType(); 7211 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7212 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7213 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7214 VectorTy = ToVectorTy(ValTy, VF); 7215 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7216 cast<CmpInst>(I)->getPredicate(), CostKind, 7217 I); 7218 } 7219 case Instruction::Store: 7220 case Instruction::Load: { 7221 ElementCount Width = VF; 7222 if (Width.isVector()) { 7223 InstWidening Decision = getWideningDecision(I, Width); 7224 assert(Decision != CM_Unknown && 7225 "CM decision should be taken at this point"); 7226 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 7227 return InstructionCost::getInvalid(); 7228 if (Decision == CM_Scalarize) 7229 Width = ElementCount::getFixed(1); 7230 } 7231 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7232 return getMemoryInstructionCost(I, VF); 7233 } 7234 case Instruction::BitCast: 7235 if (I->getType()->isPointerTy()) 7236 return 0; 7237 [[fallthrough]]; 7238 case Instruction::ZExt: 7239 case Instruction::SExt: 7240 case Instruction::FPToUI: 7241 case Instruction::FPToSI: 7242 case Instruction::FPExt: 7243 case Instruction::PtrToInt: 7244 case Instruction::IntToPtr: 7245 case Instruction::SIToFP: 7246 case Instruction::UIToFP: 7247 case Instruction::Trunc: 7248 case Instruction::FPTrunc: { 7249 // Computes the CastContextHint from a Load/Store instruction. 7250 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7251 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7252 "Expected a load or a store!"); 7253 7254 if (VF.isScalar() || !TheLoop->contains(I)) 7255 return TTI::CastContextHint::Normal; 7256 7257 switch (getWideningDecision(I, VF)) { 7258 case LoopVectorizationCostModel::CM_GatherScatter: 7259 return TTI::CastContextHint::GatherScatter; 7260 case LoopVectorizationCostModel::CM_Interleave: 7261 return TTI::CastContextHint::Interleave; 7262 case LoopVectorizationCostModel::CM_Scalarize: 7263 case LoopVectorizationCostModel::CM_Widen: 7264 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7265 : TTI::CastContextHint::Normal; 7266 case LoopVectorizationCostModel::CM_Widen_Reverse: 7267 return TTI::CastContextHint::Reversed; 7268 case LoopVectorizationCostModel::CM_Unknown: 7269 llvm_unreachable("Instr did not go through cost modelling?"); 7270 } 7271 7272 llvm_unreachable("Unhandled case!"); 7273 }; 7274 7275 unsigned Opcode = I->getOpcode(); 7276 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7277 // For Trunc, the context is the only user, which must be a StoreInst. 7278 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7279 if (I->hasOneUse()) 7280 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7281 CCH = ComputeCCH(Store); 7282 } 7283 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7284 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7285 Opcode == Instruction::FPExt) { 7286 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7287 CCH = ComputeCCH(Load); 7288 } 7289 7290 // We optimize the truncation of induction variables having constant 7291 // integer steps. The cost of these truncations is the same as the scalar 7292 // operation. 7293 if (isOptimizableIVTruncate(I, VF)) { 7294 auto *Trunc = cast<TruncInst>(I); 7295 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7296 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7297 } 7298 7299 // Detect reduction patterns 7300 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7301 return *RedCost; 7302 7303 Type *SrcScalarTy = I->getOperand(0)->getType(); 7304 Type *SrcVecTy = 7305 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7306 if (canTruncateToMinimalBitwidth(I, VF)) { 7307 // This cast is going to be shrunk. This may remove the cast or it might 7308 // turn it into slightly different cast. For example, if MinBW == 16, 7309 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7310 // 7311 // Calculate the modified src and dest types. 7312 Type *MinVecTy = VectorTy; 7313 if (Opcode == Instruction::Trunc) { 7314 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7315 VectorTy = 7316 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7317 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7318 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7319 VectorTy = 7320 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7321 } 7322 } 7323 7324 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7325 } 7326 case Instruction::Call: { 7327 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7328 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7329 return *RedCost; 7330 bool NeedToScalarize; 7331 CallInst *CI = cast<CallInst>(I); 7332 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7333 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7334 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7335 return std::min(CallCost, IntrinsicCost); 7336 } 7337 return CallCost; 7338 } 7339 case Instruction::ExtractValue: 7340 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7341 case Instruction::Alloca: 7342 // We cannot easily widen alloca to a scalable alloca, as 7343 // the result would need to be a vector of pointers. 7344 if (VF.isScalable()) 7345 return InstructionCost::getInvalid(); 7346 [[fallthrough]]; 7347 default: 7348 // This opcode is unknown. Assume that it is the same as 'mul'. 7349 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7350 } // end of switch. 
7351 } 7352 7353 char LoopVectorize::ID = 0; 7354 7355 static const char lv_name[] = "Loop Vectorization"; 7356 7357 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7358 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7359 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7360 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7361 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7362 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7363 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7364 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7365 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7366 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7367 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7368 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7369 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7370 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7371 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7372 7373 namespace llvm { 7374 7375 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7376 7377 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7378 bool VectorizeOnlyWhenForced) { 7379 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7380 } 7381 7382 } // end namespace llvm 7383 7384 void LoopVectorizationCostModel::collectValuesToIgnore() { 7385 // Ignore ephemeral values. 7386 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7387 7388 // Find all stores to invariant variables. Since they are going to sink 7389 // outside the loop we do not need calculate cost for them. 7390 for (BasicBlock *BB : TheLoop->blocks()) 7391 for (Instruction &I : *BB) { 7392 StoreInst *SI; 7393 if ((SI = dyn_cast<StoreInst>(&I)) && 7394 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7395 ValuesToIgnore.insert(&I); 7396 } 7397 7398 // Ignore type-promoting instructions we identified during reduction 7399 // detection. 7400 for (const auto &Reduction : Legal->getReductionVars()) { 7401 const RecurrenceDescriptor &RedDes = Reduction.second; 7402 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7403 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7404 } 7405 // Ignore type-casting instructions we identified during induction 7406 // detection. 7407 for (const auto &Induction : Legal->getInductionVars()) { 7408 const InductionDescriptor &IndDes = Induction.second; 7409 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7410 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7411 } 7412 } 7413 7414 void LoopVectorizationCostModel::collectInLoopReductions() { 7415 for (const auto &Reduction : Legal->getReductionVars()) { 7416 PHINode *Phi = Reduction.first; 7417 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7418 7419 // We don't collect reductions that are type promoted (yet). 7420 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7421 continue; 7422 7423 // If the target would prefer this reduction to happen "in-loop", then we 7424 // want to record it as such. 7425 unsigned Opcode = RdxDesc.getOpcode(); 7426 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7427 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7428 TargetTransformInfo::ReductionFlags())) 7429 continue; 7430 7431 // Check that we can correctly put the reductions into the loop, by 7432 // finding the chain of operations that leads from the phi to the loop 7433 // exit value. 
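    // For a simple integer sum, e.g.
    //   %red = phi i32 [ 0, %ph ], [ %red.next, %loop ]
    //   %red.next = add i32 %red, %x
    // the chain is just { %red.next }; if no chain is found the reduction
    // stays out-of-loop.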
7434 SmallVector<Instruction *, 4> ReductionOperations = 7435 RdxDesc.getReductionOpChain(Phi, TheLoop); 7436 bool InLoop = !ReductionOperations.empty(); 7437 if (InLoop) { 7438 InLoopReductionChains[Phi] = ReductionOperations; 7439 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7440 Instruction *LastChain = Phi; 7441 for (auto *I : ReductionOperations) { 7442 InLoopReductionImmediateChains[I] = LastChain; 7443 LastChain = I; 7444 } 7445 } 7446 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7447 << " reduction for phi: " << *Phi << "\n"); 7448 } 7449 } 7450 7451 // TODO: we could return a pair of values that specify the max VF and 7452 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7453 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7454 // doesn't have a cost model that can choose which plan to execute if 7455 // more than one is generated. 7456 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7457 LoopVectorizationCostModel &CM) { 7458 unsigned WidestType; 7459 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7460 return WidestVectorRegBits / WidestType; 7461 } 7462 7463 VectorizationFactor 7464 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7465 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7466 ElementCount VF = UserVF; 7467 // Outer loop handling: They may require CFG and instruction level 7468 // transformations before even evaluating whether vectorization is profitable. 7469 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7470 // the vectorization pipeline. 7471 if (!OrigLoop->isInnermost()) { 7472 // If the user doesn't provide a vectorization factor, determine a 7473 // reasonable one. 7474 if (UserVF.isZero()) { 7475 VF = ElementCount::getFixed(determineVPlanVF( 7476 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7477 .getFixedValue(), 7478 CM)); 7479 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7480 7481 // Make sure we have a VF > 1 for stress testing. 7482 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7483 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7484 << "overriding computed VF.\n"); 7485 VF = ElementCount::getFixed(4); 7486 } 7487 } 7488 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7489 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7490 "VF needs to be a power of two"); 7491 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7492 << "VF " << VF << " to build VPlans.\n"); 7493 buildVPlans(VF, VF); 7494 7495 // For VPlan build stress testing, we bail out after VPlan construction. 7496 if (VPlanBuildStressTest) 7497 return VectorizationFactor::Disabled(); 7498 7499 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7500 } 7501 7502 LLVM_DEBUG( 7503 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7504 "VPlan-native path.\n"); 7505 return VectorizationFactor::Disabled(); 7506 } 7507 7508 std::optional<VectorizationFactor> 7509 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7510 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7511 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7512 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7513 return std::nullopt; 7514 7515 // Invalidate interleave groups if all blocks of loop will be predicated. 
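  // A typical case is a tail-folded loop over a factor-2 group, e.g.
  //   for (i) sum += A[2*i] + A[2*i+1];
  // where the group's wide load would have to become a masked load; without
  // masked-interleaved support the group cannot be kept.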
7516 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7517 !useMaskedInterleavedAccesses(*TTI)) { 7518 LLVM_DEBUG( 7519 dbgs() 7520 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7521 "which requires masked-interleaved support.\n"); 7522 if (CM.InterleaveInfo.invalidateGroups()) 7523 // Invalidating interleave groups also requires invalidating all decisions 7524 // based on them, which includes widening decisions and uniform and scalar 7525 // values. 7526 CM.invalidateCostModelingDecisions(); 7527 } 7528 7529 ElementCount MaxUserVF = 7530 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7531 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7532 if (!UserVF.isZero() && UserVFIsLegal) { 7533 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7534 "VF needs to be a power of two"); 7535 // Collect the instructions (and their associated costs) that will be more 7536 // profitable to scalarize. 7537 if (CM.selectUserVectorizationFactor(UserVF)) { 7538 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7539 CM.collectInLoopReductions(); 7540 buildVPlansWithVPRecipes(UserVF, UserVF); 7541 LLVM_DEBUG(printPlans(dbgs())); 7542 return {{UserVF, 0, 0}}; 7543 } else 7544 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7545 "InvalidCost", ORE, OrigLoop); 7546 } 7547 7548 // Populate the set of Vectorization Factor Candidates. 7549 ElementCountSet VFCandidates; 7550 for (auto VF = ElementCount::getFixed(1); 7551 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7552 VFCandidates.insert(VF); 7553 for (auto VF = ElementCount::getScalable(1); 7554 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7555 VFCandidates.insert(VF); 7556 7557 for (const auto &VF : VFCandidates) { 7558 // Collect Uniform and Scalar instructions after vectorization with VF. 7559 CM.collectUniformsAndScalars(VF); 7560 7561 // Collect the instructions (and their associated costs) that will be more 7562 // profitable to scalarize. 7563 if (VF.isVector()) 7564 CM.collectInstsToScalarize(VF); 7565 } 7566 7567 CM.collectInLoopReductions(); 7568 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7569 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7570 7571 LLVM_DEBUG(printPlans(dbgs())); 7572 if (!MaxFactors.hasVector()) 7573 return VectorizationFactor::Disabled(); 7574 7575 // Select the optimal vectorization factor. 7576 VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); 7577 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7578 return VF; 7579 } 7580 7581 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7582 assert(count_if(VPlans, 7583 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7584 1 && 7585 "Best VF has not a single VPlan."); 7586 7587 for (const VPlanPtr &Plan : VPlans) { 7588 if (Plan->hasVF(VF)) 7589 return *Plan.get(); 7590 } 7591 llvm_unreachable("No plan found!"); 7592 } 7593 7594 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7595 SmallVector<Metadata *, 4> MDs; 7596 // Reserve first location for self reference to the LoopID metadata node. 7597 MDs.push_back(nullptr); 7598 bool IsUnrollMetadata = false; 7599 MDNode *LoopID = L->getLoopID(); 7600 if (LoopID) { 7601 // First find existing loop unrolling disable metadata. 
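    // The loop ID is a distinct, self-referential node, e.g.
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.unroll.disable"}
    // Operand 0 is the self-reference, so the scan starts at index 1.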
7602 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7603 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7604 if (MD) { 7605 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7606 IsUnrollMetadata = 7607 S && S->getString().startswith("llvm.loop.unroll.disable"); 7608 } 7609 MDs.push_back(LoopID->getOperand(i)); 7610 } 7611 } 7612 7613 if (!IsUnrollMetadata) { 7614 // Add runtime unroll disable metadata. 7615 LLVMContext &Context = L->getHeader()->getContext(); 7616 SmallVector<Metadata *, 1> DisableOperands; 7617 DisableOperands.push_back( 7618 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7619 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7620 MDs.push_back(DisableNode); 7621 MDNode *NewLoopID = MDNode::get(Context, MDs); 7622 // Set operand 0 to refer to the loop id itself. 7623 NewLoopID->replaceOperandWith(0, NewLoopID); 7624 L->setLoopID(NewLoopID); 7625 } 7626 } 7627 7628 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7629 VPlan &BestVPlan, 7630 InnerLoopVectorizer &ILV, 7631 DominatorTree *DT, 7632 bool IsEpilogueVectorization) { 7633 assert(BestVPlan.hasVF(BestVF) && 7634 "Trying to execute plan with unsupported VF"); 7635 assert(BestVPlan.hasUF(BestUF) && 7636 "Trying to execute plan with unsupported UF"); 7637 7638 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7639 << '\n'); 7640 7641 // Workaround! Compute the trip count of the original loop and cache it 7642 // before we start modifying the CFG. This code has a systemic problem 7643 // wherein it tries to run analysis over partially constructed IR; this is 7644 // wrong, and not simply for SCEV. The trip count of the original loop 7645 // simply happens to be prone to hitting this in practice. In theory, we 7646 // can hit the same issue for any SCEV, or ValueTracking query done during 7647 // mutation. See PR49900. 7648 ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7649 7650 if (!IsEpilogueVectorization) 7651 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7652 7653 // Perform the actual loop transformation. 7654 7655 // 1. Set up the skeleton for vectorization, including vector pre-header and 7656 // middle block. The vector loop is created during VPlan execution. 7657 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7658 Value *CanonicalIVStartValue; 7659 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7660 ILV.createVectorizedLoopSkeleton(); 7661 7662 // Only use noalias metadata when using memory checks guaranteeing no overlap 7663 // across all iterations. 7664 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7665 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7666 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7667 7668 // We currently don't use LoopVersioning for the actual loop cloning but we 7669 // still use it to add the noalias metadata. 7670 // TODO: Find a better way to re-use LoopVersioning functionality to add 7671 // metadata. 
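    // The net effect is that memory accesses in the vector loop carry
    // !alias.scope/!noalias metadata derived from the runtime-checked pointer
    // groups, so later passes may assume those accesses do not alias.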
7672 State.LVer = std::make_unique<LoopVersioning>( 7673 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7674 PSE.getSE()); 7675 State.LVer->prepareNoAliasMetadata(); 7676 } 7677 7678 ILV.collectPoisonGeneratingRecipes(State); 7679 7680 ILV.printDebugTracesAtStart(); 7681 7682 //===------------------------------------------------===// 7683 // 7684 // Notice: any optimization or new instruction that go 7685 // into the code below should also be implemented in 7686 // the cost-model. 7687 // 7688 //===------------------------------------------------===// 7689 7690 // 2. Copy and widen instructions from the old loop into the new loop. 7691 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7692 ILV.getOrCreateVectorTripCount(nullptr), 7693 CanonicalIVStartValue, State, 7694 IsEpilogueVectorization); 7695 7696 BestVPlan.execute(&State); 7697 7698 // Keep all loop hints from the original loop on the vector loop (we'll 7699 // replace the vectorizer-specific hints below). 7700 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7701 7702 std::optional<MDNode *> VectorizedLoopID = 7703 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7704 LLVMLoopVectorizeFollowupVectorized}); 7705 7706 VPBasicBlock *HeaderVPBB = 7707 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7708 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7709 if (VectorizedLoopID) 7710 L->setLoopID(*VectorizedLoopID); 7711 else { 7712 // Keep all loop hints from the original loop on the vector loop (we'll 7713 // replace the vectorizer-specific hints below). 7714 if (MDNode *LID = OrigLoop->getLoopID()) 7715 L->setLoopID(LID); 7716 7717 LoopVectorizeHints Hints(L, true, *ORE); 7718 Hints.setAlreadyVectorized(); 7719 } 7720 AddRuntimeUnrollDisableMetaData(L); 7721 7722 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7723 // predication, updating analyses. 7724 ILV.fixVectorizedLoop(State, BestVPlan); 7725 7726 ILV.printDebugTracesAtEnd(); 7727 } 7728 7729 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7730 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7731 for (const auto &Plan : VPlans) 7732 if (PrintVPlansInDotFormat) 7733 Plan->printDOT(O); 7734 else 7735 Plan->print(O); 7736 } 7737 #endif 7738 7739 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7740 7741 //===--------------------------------------------------------------------===// 7742 // EpilogueVectorizerMainLoop 7743 //===--------------------------------------------------------------------===// 7744 7745 /// This function is partially responsible for generating the control flow 7746 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7747 std::pair<BasicBlock *, Value *> 7748 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7749 createVectorLoopSkeleton(""); 7750 7751 // Generate the code to check the minimum iteration count of the vector 7752 // epilogue (see below). 7753 EPI.EpilogueIterationCountCheck = 7754 emitIterationCountCheck(LoopScalarPreHeader, true); 7755 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7756 7757 // Generate the code to check any assumptions that we've made for SCEV 7758 // expressions. 7759 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7760 7761 // Generate the code that checks at runtime if arrays overlap. We put the 7762 // checks into a separate block to make the more common case of few elements 7763 // faster. 
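  // Conceptually, for two checked pointer groups A and B accessed over N
  // iterations with access size S, the emitted test is of the form
  //   nooverlap = (A + N*S <= B) | (B + N*S <= A)
  // and the loop falls back to the scalar version when it does not hold.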
7764 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7765 7766 // Generate the iteration count check for the main loop, *after* the check 7767 // for the epilogue loop, so that the path-length is shorter for the case 7768 // that goes directly through the vector epilogue. The longer-path length for 7769 // the main loop is compensated for, by the gain from vectorizing the larger 7770 // trip count. Note: the branch will get updated later on when we vectorize 7771 // the epilogue. 7772 EPI.MainLoopIterationCountCheck = 7773 emitIterationCountCheck(LoopScalarPreHeader, false); 7774 7775 // Generate the induction variable. 7776 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7777 7778 // Skip induction resume value creation here because they will be created in 7779 // the second pass for the scalar loop. The induction resume values for the 7780 // inductions in the epilogue loop are created before executing the plan for 7781 // the epilogue loop. 7782 7783 return {completeLoopSkeleton(), nullptr}; 7784 } 7785 7786 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7787 LLVM_DEBUG({ 7788 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7789 << "Main Loop VF:" << EPI.MainLoopVF 7790 << ", Main Loop UF:" << EPI.MainLoopUF 7791 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7792 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7793 }); 7794 } 7795 7796 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7797 DEBUG_WITH_TYPE(VerboseDebug, { 7798 dbgs() << "intermediate fn:\n" 7799 << *OrigLoop->getHeader()->getParent() << "\n"; 7800 }); 7801 } 7802 7803 BasicBlock * 7804 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7805 bool ForEpilogue) { 7806 assert(Bypass && "Expected valid bypass basic block."); 7807 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7808 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7809 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7810 // Reuse existing vector loop preheader for TC checks. 7811 // Note that new preheader block is generated for vector loop. 7812 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7813 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7814 7815 // Generate code to check if the loop's trip count is less than VF * UF of the 7816 // main vector loop. 7817 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7818 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7819 7820 Value *CheckMinIters = Builder.CreateICmp( 7821 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7822 "min.iters.check"); 7823 7824 if (!ForEpilogue) 7825 TCCheckBlock->setName("vector.main.loop.iter.check"); 7826 7827 // Create new preheader for vector loop. 7828 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7829 DT, LI, nullptr, "vector.ph"); 7830 7831 if (ForEpilogue) { 7832 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7833 DT->getNode(Bypass)->getIDom()) && 7834 "TC check is expected to dominate Bypass"); 7835 7836 // Update dominator for Bypass & LoopExit. 7837 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7838 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7839 // For loops with multiple exits, there's no edge from the middle block 7840 // to exit blocks (as the epilogue must run) and thus no need to update 7841 // the immediate dominator of the exit blocks. 
7842 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7843 7844 LoopBypassBlocks.push_back(TCCheckBlock); 7845 7846 // Save the trip count so we don't have to regenerate it in the 7847 // vec.epilog.iter.check. This is safe to do because the trip count 7848 // generated here dominates the vector epilog iter check. 7849 EPI.TripCount = Count; 7850 } 7851 7852 ReplaceInstWithInst( 7853 TCCheckBlock->getTerminator(), 7854 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7855 7856 return TCCheckBlock; 7857 } 7858 7859 //===--------------------------------------------------------------------===// 7860 // EpilogueVectorizerEpilogueLoop 7861 //===--------------------------------------------------------------------===// 7862 7863 /// This function is partially responsible for generating the control flow 7864 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7865 std::pair<BasicBlock *, Value *> 7866 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7867 createVectorLoopSkeleton("vec.epilog."); 7868 7869 // Now, compare the remaining count and if there aren't enough iterations to 7870 // execute the vectorized epilogue skip to the scalar part. 7871 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7872 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7873 LoopVectorPreHeader = 7874 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7875 LI, nullptr, "vec.epilog.ph"); 7876 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7877 VecEpilogueIterationCountCheck); 7878 7879 // Adjust the control flow taking the state info from the main loop 7880 // vectorization into account. 7881 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7882 "expected this to be saved from the previous pass."); 7883 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7884 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7885 7886 DT->changeImmediateDominator(LoopVectorPreHeader, 7887 EPI.MainLoopIterationCountCheck); 7888 7889 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7890 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7891 7892 if (EPI.SCEVSafetyCheck) 7893 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7894 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7895 if (EPI.MemSafetyCheck) 7896 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7897 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7898 7899 DT->changeImmediateDominator( 7900 VecEpilogueIterationCountCheck, 7901 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7902 7903 DT->changeImmediateDominator(LoopScalarPreHeader, 7904 EPI.EpilogueIterationCountCheck); 7905 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7906 // If there is an epilogue which must run, there's no edge from the 7907 // middle block to exit blocks and thus no need to update the immediate 7908 // dominator of the exit blocks. 7909 DT->changeImmediateDominator(LoopExitBlock, 7910 EPI.EpilogueIterationCountCheck); 7911 7912 // Keep track of bypass blocks, as they feed start values to the induction and 7913 // reduction phis in the scalar loop preheader. 
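  // E.g. the resume phi of the primary induction in the scalar preheader takes
  // the original start value on each bypass edge and the vector trip count
  // when arriving from the middle block.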
7914 if (EPI.SCEVSafetyCheck) 7915 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7916 if (EPI.MemSafetyCheck) 7917 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7918 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7919 7920 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 7921 // reductions which merge control-flow from the latch block and the middle 7922 // block. Update the incoming values here and move the Phi into the preheader. 7923 SmallVector<PHINode *, 4> PhisInBlock; 7924 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7925 PhisInBlock.push_back(&Phi); 7926 7927 for (PHINode *Phi : PhisInBlock) { 7928 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7929 Phi->replaceIncomingBlockWith( 7930 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7931 VecEpilogueIterationCountCheck); 7932 7933 // If the phi doesn't have an incoming value from the 7934 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 7935 // value and also those from other check blocks. This is needed for 7936 // reduction phis only. 7937 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { 7938 return EPI.EpilogueIterationCountCheck == IncB; 7939 })) 7940 continue; 7941 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7942 if (EPI.SCEVSafetyCheck) 7943 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7944 if (EPI.MemSafetyCheck) 7945 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7946 } 7947 7948 // Generate a resume induction for the vector epilogue and put it in the 7949 // vector epilogue preheader 7950 Type *IdxTy = Legal->getWidestInductionType(); 7951 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7952 LoopVectorPreHeader->getFirstNonPHI()); 7953 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7954 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7955 EPI.MainLoopIterationCountCheck); 7956 7957 // Generate induction resume values. These variables save the new starting 7958 // indexes for the scalar loop. They are used to test if there are any tail 7959 // iterations left once the vector loop has completed. 7960 // Note that when the vectorized epilogue is skipped due to iteration count 7961 // check, then the resume value for the induction variable comes from 7962 // the trip count of the main vector loop, hence passing the AdditionalBypass 7963 // argument. 7964 createInductionResumeValues({VecEpilogueIterationCountCheck, 7965 EPI.VectorTripCount} /* AdditionalBypass */); 7966 7967 return {completeLoopSkeleton(), EPResumeVal}; 7968 } 7969 7970 BasicBlock * 7971 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7972 BasicBlock *Bypass, BasicBlock *Insert) { 7973 7974 assert(EPI.TripCount && 7975 "Expected trip count to have been safed in the first pass."); 7976 assert( 7977 (!isa<Instruction>(EPI.TripCount) || 7978 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7979 "saved trip count does not dominate insertion point."); 7980 Value *TC = EPI.TripCount; 7981 IRBuilder<> Builder(Insert->getTerminator()); 7982 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7983 7984 // Generate code to check if the loop's trip count is less than VF * UF of the 7985 // vector epilogue loop. 7986 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
7987 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7988 7989 Value *CheckMinIters = 7990 Builder.CreateICmp(P, Count, 7991 createStepForVF(Builder, Count->getType(), 7992 EPI.EpilogueVF, EPI.EpilogueUF), 7993 "min.epilog.iters.check"); 7994 7995 ReplaceInstWithInst( 7996 Insert->getTerminator(), 7997 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7998 7999 LoopBypassBlocks.push_back(Insert); 8000 return Insert; 8001 } 8002 8003 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8004 LLVM_DEBUG({ 8005 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8006 << "Epilogue Loop VF:" << EPI.EpilogueVF 8007 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8008 }); 8009 } 8010 8011 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8012 DEBUG_WITH_TYPE(VerboseDebug, { 8013 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8014 }); 8015 } 8016 8017 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8018 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8019 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8020 bool PredicateAtRangeStart = Predicate(Range.Start); 8021 8022 for (ElementCount TmpVF = Range.Start * 2; 8023 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8024 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8025 Range.End = TmpVF; 8026 break; 8027 } 8028 8029 return PredicateAtRangeStart; 8030 } 8031 8032 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8033 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8034 /// of VF's starting at a given VF and extending it as much as possible. Each 8035 /// vectorization decision can potentially shorten this sub-range during 8036 /// buildVPlan(). 8037 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8038 ElementCount MaxVF) { 8039 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8040 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8041 VFRange SubRange = {VF, MaxVFPlusOne}; 8042 VPlans.push_back(buildVPlan(SubRange)); 8043 VF = SubRange.End; 8044 } 8045 } 8046 8047 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8048 VPlanPtr &Plan) { 8049 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8050 8051 // Look for cached value. 8052 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8053 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8054 if (ECEntryIt != EdgeMaskCache.end()) 8055 return ECEntryIt->second; 8056 8057 VPValue *SrcMask = createBlockInMask(Src, Plan); 8058 8059 // The terminator has to be a branch inst! 8060 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8061 assert(BI && "Unexpected terminator found"); 8062 8063 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8064 return EdgeMaskCache[Edge] = SrcMask; 8065 8066 // If source is an exiting block, we know the exit edge is dynamically dead 8067 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8068 // adding uses of an otherwise potentially dead instruction. 
8069 if (OrigLoop->isLoopExiting(Src)) 8070 return EdgeMaskCache[Edge] = SrcMask; 8071 8072 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8073 assert(EdgeMask && "No Edge Mask found for condition"); 8074 8075 if (BI->getSuccessor(0) != Dst) 8076 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8077 8078 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8079 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8080 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8081 // The select version does not introduce new UB if SrcMask is false and 8082 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8083 VPValue *False = Plan->getOrAddVPValue( 8084 ConstantInt::getFalse(BI->getCondition()->getType())); 8085 EdgeMask = 8086 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8087 } 8088 8089 return EdgeMaskCache[Edge] = EdgeMask; 8090 } 8091 8092 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8093 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8094 8095 // Look for cached value. 8096 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8097 if (BCEntryIt != BlockMaskCache.end()) 8098 return BCEntryIt->second; 8099 8100 // All-one mask is modelled as no-mask following the convention for masked 8101 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8102 VPValue *BlockMask = nullptr; 8103 8104 if (OrigLoop->getHeader() == BB) { 8105 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8106 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8107 8108 assert(CM.foldTailByMasking() && "must fold the tail"); 8109 8110 // If we're using the active lane mask for control flow, then we get the 8111 // mask from the active lane mask PHI that is cached in the VPlan. 8112 PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); 8113 if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) 8114 return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); 8115 8116 // Introduce the early-exit compare IV <= BTC to form header block mask. 8117 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8118 // constructing the desired canonical IV in the header block as its first 8119 // non-phi instructions. 8120 8121 VPBasicBlock *HeaderVPBB = 8122 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8123 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8124 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8125 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8126 8127 VPBuilder::InsertPointGuard Guard(Builder); 8128 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8129 if (EmitGetActiveLaneMask != PredicationStyle::None) { 8130 VPValue *TC = Plan->getOrCreateTripCount(); 8131 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, 8132 nullptr, "active.lane.mask"); 8133 } else { 8134 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8135 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8136 } 8137 return BlockMaskCache[BB] = BlockMask; 8138 } 8139 8140 // This is the block mask. We OR all incoming edges. 8141 for (auto *Predecessor : predecessors(BB)) { 8142 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8143 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8144 return BlockMaskCache[BB] = EdgeMask; 8145 8146 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8147       BlockMask = EdgeMask;
8148       continue;
8149     }
8150 
8151     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8152   }
8153 
8154   return BlockMaskCache[BB] = BlockMask;
8155 }
8156 
8157 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8158                                                 ArrayRef<VPValue *> Operands,
8159                                                 VFRange &Range,
8160                                                 VPlanPtr &Plan) {
8161   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8162          "Must be called with either a load or store");
8163 
8164   auto willWiden = [&](ElementCount VF) -> bool {
8165     LoopVectorizationCostModel::InstWidening Decision =
8166         CM.getWideningDecision(I, VF);
8167     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8168            "CM decision should be taken at this point.");
8169     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8170       return true;
8171     if (CM.isScalarAfterVectorization(I, VF) ||
8172         CM.isProfitableToScalarize(I, VF))
8173       return false;
8174     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8175   };
8176 
8177   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8178     return nullptr;
8179 
8180   VPValue *Mask = nullptr;
8181   if (Legal->isMaskRequired(I))
8182     Mask = createBlockInMask(I->getParent(), Plan);
8183 
8184   // Determine if the pointer operand of the access is either consecutive or
8185   // reverse consecutive.
8186   LoopVectorizationCostModel::InstWidening Decision =
8187       CM.getWideningDecision(I, Range.Start);
8188   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8189   bool Consecutive =
8190       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8191 
8192   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8193     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8194                                               Consecutive, Reverse);
8195 
8196   StoreInst *Store = cast<StoreInst>(I);
8197   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8198                                             Mask, Consecutive, Reverse);
8199 }
8200 
8201 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8202 /// insert a recipe to expand the step for the induction recipe.
8203 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8204     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8205     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8206     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8207   // Returns true if an instruction \p I should be scalarized instead of
8208   // vectorized for the chosen vectorization factor.
8209 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8210 return CM.isScalarAfterVectorization(I, VF) || 8211 CM.isProfitableToScalarize(I, VF); 8212 }; 8213 8214 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8215 [&](ElementCount VF) { 8216 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8217 }, 8218 Range); 8219 assert(IndDesc.getStartValue() == 8220 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8221 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8222 "step must be loop invariant"); 8223 8224 VPValue *Step = 8225 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8226 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8227 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8228 !NeedsScalarIVOnly); 8229 } 8230 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8231 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8232 !NeedsScalarIVOnly); 8233 } 8234 8235 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8236 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8237 8238 // Check if this is an integer or fp induction. If so, build the recipe that 8239 // produces its scalar and vector values. 8240 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8241 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8242 *PSE.getSE(), *OrigLoop, Range); 8243 8244 // Check if this is pointer induction. If so, build the recipe for it. 8245 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8246 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8247 *PSE.getSE()); 8248 assert(isa<SCEVConstant>(II->getStep())); 8249 return new VPWidenPointerInductionRecipe( 8250 Phi, Operands[0], Step, *II, 8251 LoopVectorizationPlanner::getDecisionAndClampRange( 8252 [&](ElementCount VF) { 8253 return CM.isScalarAfterVectorization(Phi, VF); 8254 }, 8255 Range)); 8256 } 8257 return nullptr; 8258 } 8259 8260 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8261 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8262 // Optimize the special case where the source is a constant integer 8263 // induction variable. Notice that we can only optimize the 'trunc' case 8264 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8265 // (c) other casts depend on pointer size. 8266 8267 // Determine whether \p K is a truncation based on an induction variable that 8268 // can be optimized. 8269 auto isOptimizableIVTruncate = 8270 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8271 return [=](ElementCount VF) -> bool { 8272 return CM.isOptimizableIVTruncate(K, VF); 8273 }; 8274 }; 8275 8276 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8277 isOptimizableIVTruncate(I), Range)) { 8278 8279 auto *Phi = cast<PHINode>(I->getOperand(0)); 8280 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8281 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8282 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8283 *PSE.getSE(), *OrigLoop, Range); 8284 } 8285 return nullptr; 8286 } 8287 8288 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8289 ArrayRef<VPValue *> Operands, 8290 VPlanPtr &Plan) { 8291 // If all incoming values are equal, the incoming VPValue can be used directly 8292 // instead of creating a new VPBlendRecipe. 
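// For example, a phi such as 'phi i32 [ %x, %bb1 ], [ %x, %bb2 ]' simply
// forwards %x and needs no blend.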
8293 if (llvm::all_equal(Operands)) 8294 return Operands[0]; 8295 8296 unsigned NumIncoming = Phi->getNumIncomingValues(); 8297 // For in-loop reductions, we do not need to create an additional select. 8298 VPValue *InLoopVal = nullptr; 8299 for (unsigned In = 0; In < NumIncoming; In++) { 8300 PHINode *PhiOp = 8301 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8302 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8303 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8304 InLoopVal = Operands[In]; 8305 } 8306 } 8307 8308 assert((!InLoopVal || NumIncoming == 2) && 8309 "Found an in-loop reduction for PHI with unexpected number of " 8310 "incoming values"); 8311 if (InLoopVal) 8312 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8313 8314 // We know that all PHIs in non-header blocks are converted into selects, so 8315 // we don't have to worry about the insertion order and we can just use the 8316 // builder. At this point we generate the predication tree. There may be 8317 // duplications since this is a simple recursive scan, but future 8318 // optimizations will clean it up. 8319 SmallVector<VPValue *, 2> OperandsWithMask; 8320 8321 for (unsigned In = 0; In < NumIncoming; In++) { 8322 VPValue *EdgeMask = 8323 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8324 assert((EdgeMask || NumIncoming == 1) && 8325 "Multiple predecessors with one having a full mask"); 8326 OperandsWithMask.push_back(Operands[In]); 8327 if (EdgeMask) 8328 OperandsWithMask.push_back(EdgeMask); 8329 } 8330 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8331 } 8332 8333 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8334 ArrayRef<VPValue *> Operands, 8335 VFRange &Range) const { 8336 8337 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8338 [this, CI](ElementCount VF) { 8339 return CM.isScalarWithPredication(CI, VF); 8340 }, 8341 Range); 8342 8343 if (IsPredicated) 8344 return nullptr; 8345 8346 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8347 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8348 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8349 ID == Intrinsic::pseudoprobe || 8350 ID == Intrinsic::experimental_noalias_scope_decl)) 8351 return nullptr; 8352 8353 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8354 8355 // Is it beneficial to perform intrinsic call compared to lib call? 8356 bool ShouldUseVectorIntrinsic = 8357 ID && LoopVectorizationPlanner::getDecisionAndClampRange( 8358 [&](ElementCount VF) -> bool { 8359 bool NeedToScalarize = false; 8360 // Is it beneficial to perform intrinsic call compared to lib 8361 // call? 8362 InstructionCost CallCost = 8363 CM.getVectorCallCost(CI, VF, NeedToScalarize); 8364 InstructionCost IntrinsicCost = 8365 CM.getVectorIntrinsicCost(CI, VF); 8366 return IntrinsicCost <= CallCost; 8367 }, 8368 Range); 8369 if (ShouldUseVectorIntrinsic) 8370 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); 8371 8372 // Is better to call a vectorized version of the function than to to scalarize 8373 // the call? 8374 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( 8375 [&](ElementCount VF) -> bool { 8376 // The following case may be scalarized depending on the VF. 8377 // The flag shows whether we can use a usual Call for vectorized 8378 // version of the instruction. 
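// Roughly: NeedToScalarize stays set when no vectorized variant of the
// callee exists for this VF (e.g. no vector library mapping), or when
// calling one would not be cheaper than scalarizing the call.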
8379 bool NeedToScalarize = false; 8380 CM.getVectorCallCost(CI, VF, NeedToScalarize); 8381 return !NeedToScalarize; 8382 }, 8383 Range); 8384 if (ShouldUseVectorCall) 8385 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), 8386 Intrinsic::not_intrinsic); 8387 8388 return nullptr; 8389 } 8390 8391 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8392 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8393 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8394 // Instruction should be widened, unless it is scalar after vectorization, 8395 // scalarization is profitable or it is predicated. 8396 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8397 return CM.isScalarAfterVectorization(I, VF) || 8398 CM.isProfitableToScalarize(I, VF) || 8399 CM.isScalarWithPredication(I, VF); 8400 }; 8401 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8402 Range); 8403 } 8404 8405 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, 8406 ArrayRef<VPValue *> Operands, 8407 VPBasicBlock *VPBB, VPlanPtr &Plan) { 8408 switch (I->getOpcode()) { 8409 default: 8410 return nullptr; 8411 case Instruction::SDiv: 8412 case Instruction::UDiv: 8413 case Instruction::SRem: 8414 case Instruction::URem: { 8415 // If not provably safe, use a select to form a safe divisor before widening the 8416 // div/rem operation itself. Otherwise fall through to general handling below. 8417 if (CM.isPredicatedInst(I)) { 8418 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); 8419 VPValue *Mask = createBlockInMask(I->getParent(), Plan); 8420 VPValue *One = 8421 Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false)); 8422 auto *SafeRHS = 8423 new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, 8424 I->getDebugLoc()); 8425 VPBB->appendRecipe(SafeRHS); 8426 Ops[1] = SafeRHS; 8427 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); 8428 } 8429 LLVM_FALLTHROUGH; 8430 } 8431 case Instruction::Add: 8432 case Instruction::And: 8433 case Instruction::AShr: 8434 case Instruction::BitCast: 8435 case Instruction::FAdd: 8436 case Instruction::FCmp: 8437 case Instruction::FDiv: 8438 case Instruction::FMul: 8439 case Instruction::FNeg: 8440 case Instruction::FPExt: 8441 case Instruction::FPToSI: 8442 case Instruction::FPToUI: 8443 case Instruction::FPTrunc: 8444 case Instruction::FRem: 8445 case Instruction::FSub: 8446 case Instruction::ICmp: 8447 case Instruction::IntToPtr: 8448 case Instruction::LShr: 8449 case Instruction::Mul: 8450 case Instruction::Or: 8451 case Instruction::PtrToInt: 8452 case Instruction::Select: 8453 case Instruction::SExt: 8454 case Instruction::Shl: 8455 case Instruction::SIToFP: 8456 case Instruction::Sub: 8457 case Instruction::Trunc: 8458 case Instruction::UIToFP: 8459 case Instruction::Xor: 8460 case Instruction::ZExt: 8461 case Instruction::Freeze: 8462 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8463 }; 8464 } 8465 8466 void VPRecipeBuilder::fixHeaderPhis() { 8467 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8468 for (VPHeaderPHIRecipe *R : PhisToFix) { 8469 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8470 VPRecipeBase *IncR = 8471 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8472 R->addOperand(IncR->getVPSingleValue()); 8473 } 8474 } 8475 8476 VPBasicBlock *VPRecipeBuilder::handleReplication( 8477 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8478 VPlanPtr &Plan) { 8479 bool 
IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8480 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8481 Range); 8482 8483 bool IsPredicated = CM.isPredicatedInst(I); 8484 8485 // Even if the instruction is not marked as uniform, there are certain 8486 // intrinsic calls that can be effectively treated as such, so we check for 8487 // them here. Conservatively, we only do this for scalable vectors, since 8488 // for fixed-width VFs we can always fall back on full scalarization. 8489 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8490 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8491 case Intrinsic::assume: 8492 case Intrinsic::lifetime_start: 8493 case Intrinsic::lifetime_end: 8494 // For scalable vectors if one of the operands is variant then we still 8495 // want to mark as uniform, which will generate one instruction for just 8496 // the first lane of the vector. We can't scalarize the call in the same 8497 // way as for fixed-width vectors because we don't know how many lanes 8498 // there are. 8499 // 8500 // The reasons for doing it this way for scalable vectors are: 8501 // 1. For the assume intrinsic generating the instruction for the first 8502 // lane is still be better than not generating any at all. For 8503 // example, the input may be a splat across all lanes. 8504 // 2. For the lifetime start/end intrinsics the pointer operand only 8505 // does anything useful when the input comes from a stack object, 8506 // which suggests it should always be uniform. For non-stack objects 8507 // the effect is to poison the object, which still allows us to 8508 // remove the call. 8509 IsUniform = true; 8510 break; 8511 default: 8512 break; 8513 } 8514 } 8515 8516 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8517 IsUniform, IsPredicated); 8518 8519 // Find if I uses a predicated instruction. If so, it will use its scalar 8520 // value. Avoid hoisting the insert-element which packs the scalar value into 8521 // a vector value, as that happens iff all users use the vector value. 8522 for (VPValue *Op : Recipe->operands()) { 8523 auto *PredR = 8524 dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe()); 8525 if (!PredR) 8526 continue; 8527 auto *RepR = cast<VPReplicateRecipe>( 8528 PredR->getOperand(0)->getDefiningRecipe()); 8529 assert(RepR->isPredicated() && 8530 "expected Replicate recipe to be predicated"); 8531 RepR->setAlsoPack(false); 8532 } 8533 8534 // Finalize the recipe for Instr, first if it is not predicated. 8535 if (!IsPredicated) { 8536 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8537 setRecipe(I, Recipe); 8538 Plan->addVPValue(I, Recipe); 8539 VPBB->appendRecipe(Recipe); 8540 return VPBB; 8541 } 8542 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8543 8544 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8545 assert(SingleSucc && "VPBB must have a single successor when handling " 8546 "predicated replication."); 8547 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8548 // Record predicated instructions for above packing optimizations. 
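// The replicate region created below has a triangular shape, roughly:
//   pred.<opcode>.entry:    branch-on-mask
//   pred.<opcode>.if:       the replicated (predicated) instruction
//   pred.<opcode>.continue: phi merging the predicated result, if any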
8549 VPBlockBase *Region = createReplicateRegion(Recipe, Plan); 8550 VPBlockUtils::insertBlockAfter(Region, VPBB); 8551 auto *RegSucc = new VPBasicBlock(); 8552 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8553 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8554 return RegSucc; 8555 } 8556 8557 VPRegionBlock * 8558 VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe, 8559 VPlanPtr &Plan) { 8560 Instruction *Instr = PredRecipe->getUnderlyingInstr(); 8561 // Instructions marked for predication are replicated and placed under an 8562 // if-then construct to prevent side-effects. 8563 // Generate recipes to compute the block mask for this region. 8564 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8565 8566 // Build the triangular if-then region. 8567 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8568 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8569 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8570 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8571 auto *PHIRecipe = Instr->getType()->isVoidTy() 8572 ? nullptr 8573 : new VPPredInstPHIRecipe(PredRecipe); 8574 if (PHIRecipe) { 8575 setRecipe(Instr, PHIRecipe); 8576 Plan->addVPValue(Instr, PHIRecipe); 8577 } else { 8578 setRecipe(Instr, PredRecipe); 8579 Plan->addVPValue(Instr, PredRecipe); 8580 } 8581 8582 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8583 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8584 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8585 8586 // Note: first set Entry as region entry and then connect successors starting 8587 // from it in order, to propagate the "parent" of each VPBasicBlock. 8588 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8589 VPBlockUtils::connectBlocks(Pred, Exiting); 8590 8591 return Region; 8592 } 8593 8594 VPRecipeOrVPValueTy 8595 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8596 ArrayRef<VPValue *> Operands, 8597 VFRange &Range, VPBasicBlock *VPBB, 8598 VPlanPtr &Plan) { 8599 // First, check for specific widening recipes that deal with inductions, Phi 8600 // nodes, calls and memory operations. 8601 VPRecipeBase *Recipe; 8602 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8603 if (Phi->getParent() != OrigLoop->getHeader()) 8604 return tryToBlend(Phi, Operands, Plan); 8605 8606 // Always record recipes for header phis. Later first-order recurrence phis 8607 // can have earlier phis as incoming values. 8608 recordRecipeOf(Phi); 8609 8610 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8611 return toVPRecipeResult(Recipe); 8612 8613 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8614 assert((Legal->isReductionVariable(Phi) || 8615 Legal->isFixedOrderRecurrence(Phi)) && 8616 "can only widen reductions and fixed-order recurrences here"); 8617 VPValue *StartV = Operands[0]; 8618 if (Legal->isReductionVariable(Phi)) { 8619 const RecurrenceDescriptor &RdxDesc = 8620 Legal->getReductionVars().find(Phi)->second; 8621 assert(RdxDesc.getRecurrenceStartValue() == 8622 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8623 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8624 CM.isInLoopReduction(Phi), 8625 CM.useOrderedReductions(RdxDesc)); 8626 } else { 8627 // TODO: Currently fixed-order recurrences are modeled as chains of 8628 // first-order recurrences. 
If there are no users of the intermediate
8629 // recurrences in the chain, the fixed-order recurrence should be modeled
8630 // directly, enabling more efficient codegen.
8631 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8632 }
8633
8634 // Record the incoming value from the backedge, so we can add the incoming
8635 // value from the backedge after all recipes have been created.
8636 auto *Inc = cast<Instruction>(
8637 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8638 auto RecipeIter = Ingredient2Recipe.find(Inc);
8639 if (RecipeIter == Ingredient2Recipe.end())
8640 recordRecipeOf(Inc);
8641
8642 PhisToFix.push_back(PhiRecipe);
8643 return toVPRecipeResult(PhiRecipe);
8644 }
8645
8646 if (isa<TruncInst>(Instr) &&
8647 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8648 Range, *Plan)))
8649 return toVPRecipeResult(Recipe);
8650
8651 // All widen recipes below deal only with VF > 1.
8652 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8653 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8654 return nullptr;
8655
8656 if (auto *CI = dyn_cast<CallInst>(Instr))
8657 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8658
8659 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8660 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8661
8662 if (!shouldWiden(Instr, Range))
8663 return nullptr;
8664
8665 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8666 return toVPRecipeResult(new VPWidenGEPRecipe(
8667 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8668
8669 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8670 bool InvariantCond =
8671 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8672 return toVPRecipeResult(new VPWidenSelectRecipe(
8673 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8674 }
8675
8676 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8677 }
8678
8679 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8680 ElementCount MaxVF) {
8681 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8682
8683 // Add assume instructions we need to drop to DeadInstructions, to prevent
8684 // them from being added to the VPlan.
8685 // TODO: We only need to drop assumes in blocks that get flattened. If the
8686 // control flow is preserved, we should keep them.
8687 SmallPtrSet<Instruction *, 4> DeadInstructions;
8688 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8689 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8690
8691 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8692 // Dead instructions do not need sinking. Remove them from SinkAfter.
8693 for (Instruction *I : DeadInstructions)
8694 SinkAfter.erase(I);
8695
8696 // Cannot sink instructions after dead instructions (there won't be any
8697 // recipes for them). Instead, find the first non-dead previous instruction.
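// For example, if the recorded sink target is a conditional assume that was
// just added to DeadInstructions, the loop below walks backwards from it to
// the closest preceding live instruction and uses that as the new target.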
8698 for (auto &P : Legal->getSinkAfter()) { 8699 Instruction *SinkTarget = P.second; 8700 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8701 (void)FirstInst; 8702 while (DeadInstructions.contains(SinkTarget)) { 8703 assert( 8704 SinkTarget != FirstInst && 8705 "Must find a live instruction (at least the one feeding the " 8706 "fixed-order recurrence PHI) before reaching beginning of the block"); 8707 SinkTarget = SinkTarget->getPrevNode(); 8708 assert(SinkTarget != P.first && 8709 "sink source equals target, no sinking required"); 8710 } 8711 P.second = SinkTarget; 8712 } 8713 8714 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8715 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8716 VFRange SubRange = {VF, MaxVFPlusOne}; 8717 VPlans.push_back( 8718 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8719 VF = SubRange.End; 8720 } 8721 } 8722 8723 // Add the necessary canonical IV and branch recipes required to control the 8724 // loop. 8725 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8726 bool HasNUW, 8727 bool UseLaneMaskForLoopControlFlow) { 8728 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8729 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8730 8731 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8732 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8733 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8734 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8735 Header->insert(CanonicalIVPHI, Header->begin()); 8736 8737 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar 8738 // IV by VF * UF. 8739 auto *CanonicalIVIncrement = 8740 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8741 : VPInstruction::CanonicalIVIncrement, 8742 {CanonicalIVPHI}, DL, "index.next"); 8743 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8744 8745 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8746 EB->appendRecipe(CanonicalIVIncrement); 8747 8748 if (UseLaneMaskForLoopControlFlow) { 8749 // Create the active lane mask instruction in the vplan preheader. 8750 VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); 8751 8752 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since 8753 // we have to take unrolling into account. Each part needs to start at 8754 // Part * VF 8755 auto *CanonicalIVIncrementParts = 8756 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW 8757 : VPInstruction::CanonicalIVIncrementForPart, 8758 {StartV}, DL, "index.part.next"); 8759 Preheader->appendRecipe(CanonicalIVIncrementParts); 8760 8761 // Create the ActiveLaneMask instruction using the correct start values. 8762 VPValue *TC = Plan.getOrCreateTripCount(); 8763 auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8764 {CanonicalIVIncrementParts, TC}, DL, 8765 "active.lane.mask.entry"); 8766 Preheader->appendRecipe(EntryALM); 8767 8768 // Now create the ActiveLaneMaskPhi recipe in the main loop using the 8769 // preheader ActiveLaneMask instruction. 8770 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); 8771 Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); 8772 8773 // Create the active lane mask for the next iteration of the loop. 8774 CanonicalIVIncrementParts = 8775 new VPInstruction(HasNUW ? 
VPInstruction::CanonicalIVIncrementForPartNUW 8776 : VPInstruction::CanonicalIVIncrementForPart, 8777 {CanonicalIVIncrement}, DL); 8778 EB->appendRecipe(CanonicalIVIncrementParts); 8779 8780 auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8781 {CanonicalIVIncrementParts, TC}, DL, 8782 "active.lane.mask.next"); 8783 EB->appendRecipe(ALM); 8784 LaneMaskPhi->addOperand(ALM); 8785 8786 // We have to invert the mask here because a true condition means jumping 8787 // to the exit block. 8788 auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); 8789 EB->appendRecipe(NotMask); 8790 8791 VPInstruction *BranchBack = 8792 new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); 8793 EB->appendRecipe(BranchBack); 8794 } else { 8795 // Add the BranchOnCount VPInstruction to the latch. 8796 VPInstruction *BranchBack = new VPInstruction( 8797 VPInstruction::BranchOnCount, 8798 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8799 EB->appendRecipe(BranchBack); 8800 } 8801 } 8802 8803 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8804 // original exit block. 8805 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8806 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8807 VPlan &Plan) { 8808 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8809 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8810 // Only handle single-exit loops with unique exit blocks for now. 8811 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8812 return; 8813 8814 // Introduce VPUsers modeling the exit values. 8815 for (PHINode &ExitPhi : ExitBB->phis()) { 8816 Value *IncomingValue = 8817 ExitPhi.getIncomingValueForBlock(ExitingBB); 8818 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8819 Plan.addLiveOut(&ExitPhi, V); 8820 } 8821 } 8822 8823 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8824 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8825 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8826 8827 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8828 8829 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8830 8831 // --------------------------------------------------------------------------- 8832 // Pre-construction: record ingredients whose recipes we'll need to further 8833 // process after constructing the initial VPlan. 8834 // --------------------------------------------------------------------------- 8835 8836 // Mark instructions we'll need to sink later and their targets as 8837 // ingredients whose recipe we'll need to record. 8838 for (const auto &Entry : SinkAfter) { 8839 RecipeBuilder.recordRecipeOf(Entry.first); 8840 RecipeBuilder.recordRecipeOf(Entry.second); 8841 } 8842 for (const auto &Reduction : CM.getInLoopReductionChains()) { 8843 PHINode *Phi = Reduction.first; 8844 RecurKind Kind = 8845 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8846 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8847 8848 RecipeBuilder.recordRecipeOf(Phi); 8849 for (const auto &R : ReductionOperations) { 8850 RecipeBuilder.recordRecipeOf(R); 8851 // For min/max reductions, where we have a pair of icmp/select, we also 8852 // need to record the ICmp recipe, so it can be removed later. 
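// For example, for an smax reduction the pair is
//   %c = icmp sgt i32 %a, %b
//   %m = select i1 %c, i32 %a, i32 %b
// and the compare feeding the select is recorded here alongside it.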
8853 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8854 "Only min/max recurrences allowed for inloop reductions"); 8855 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8856 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8857 } 8858 } 8859 8860 // For each interleave group which is relevant for this (possibly trimmed) 8861 // Range, add it to the set of groups to be later applied to the VPlan and add 8862 // placeholders for its members' Recipes which we'll be replacing with a 8863 // single VPInterleaveRecipe. 8864 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8865 auto applyIG = [IG, this](ElementCount VF) -> bool { 8866 return (VF.isVector() && // Query is illegal for VF == 1 8867 CM.getWideningDecision(IG->getInsertPos(), VF) == 8868 LoopVectorizationCostModel::CM_Interleave); 8869 }; 8870 if (!getDecisionAndClampRange(applyIG, Range)) 8871 continue; 8872 InterleaveGroups.insert(IG); 8873 for (unsigned i = 0; i < IG->getFactor(); i++) 8874 if (Instruction *Member = IG->getMember(i)) 8875 RecipeBuilder.recordRecipeOf(Member); 8876 }; 8877 8878 // --------------------------------------------------------------------------- 8879 // Build initial VPlan: Scan the body of the loop in a topological order to 8880 // visit each basic block after having visited its predecessor basic blocks. 8881 // --------------------------------------------------------------------------- 8882 8883 // Create initial VPlan skeleton, starting with a block for the pre-header, 8884 // followed by a region for the vector loop, followed by the middle block. The 8885 // skeleton vector loop region contains a header and latch block. 8886 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8887 auto Plan = std::make_unique<VPlan>(Preheader); 8888 8889 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8890 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8891 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8892 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8893 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8894 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8895 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8896 8897 Instruction *DLInst = 8898 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8899 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8900 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 8901 !CM.foldTailByMasking(), 8902 CM.useActiveLaneMaskForControlFlow()); 8903 8904 // Scan the body of the loop in a topological order to visit each basic block 8905 // after having visited its predecessor basic blocks. 8906 LoopBlocksDFS DFS(OrigLoop); 8907 DFS.perform(LI); 8908 8909 VPBasicBlock *VPBB = HeaderVPBB; 8910 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8911 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8912 // Relevant instructions from basic block BB will be grouped into VPRecipe 8913 // ingredients and fill a new VPBasicBlock. 8914 unsigned VPBBsForBB = 0; 8915 if (VPBB != HeaderVPBB) 8916 VPBB->setName(BB->getName()); 8917 Builder.setInsertPoint(VPBB); 8918 8919 // Introduce each ingredient into VPlan. 8920 // TODO: Model and preserve debug intrinsics in VPlan. 8921 for (Instruction &I : BB->instructionsWithoutDebug()) { 8922 Instruction *Instr = &I; 8923 8924 // First filter out irrelevant instructions, to ensure no recipes are 8925 // built for them. 
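// Branches are irrelevant because VPlan models the loop control flow with
// its own recipes and masks; dead instructions (e.g. dropped conditional
// assumes) must not get recipes either.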
8926 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8927 continue; 8928 8929 SmallVector<VPValue *, 4> Operands; 8930 auto *Phi = dyn_cast<PHINode>(Instr); 8931 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8932 Operands.push_back(Plan->getOrAddVPValue( 8933 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8934 } else { 8935 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8936 Operands = {OpRange.begin(), OpRange.end()}; 8937 } 8938 8939 // Invariant stores inside loop will be deleted and a single store 8940 // with the final reduction value will be added to the exit block 8941 StoreInst *SI; 8942 if ((SI = dyn_cast<StoreInst>(&I)) && 8943 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8944 continue; 8945 8946 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8947 Instr, Operands, Range, VPBB, Plan)) { 8948 // If Instr can be simplified to an existing VPValue, use it. 8949 if (RecipeOrValue.is<VPValue *>()) { 8950 auto *VPV = RecipeOrValue.get<VPValue *>(); 8951 Plan->addVPValue(Instr, VPV); 8952 // If the re-used value is a recipe, register the recipe for the 8953 // instruction, in case the recipe for Instr needs to be recorded. 8954 if (VPRecipeBase *R = VPV->getDefiningRecipe()) 8955 RecipeBuilder.setRecipe(Instr, R); 8956 continue; 8957 } 8958 // Otherwise, add the new recipe. 8959 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8960 for (auto *Def : Recipe->definedValues()) { 8961 auto *UV = Def->getUnderlyingValue(); 8962 Plan->addVPValue(UV, Def); 8963 } 8964 8965 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8966 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8967 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8968 // of the header block. That can happen for truncates of induction 8969 // variables. Those recipes are moved to the phi section of the header 8970 // block after applying SinkAfter, which relies on the original 8971 // position of the trunc. 8972 assert(isa<TruncInst>(Instr)); 8973 InductionsToMove.push_back( 8974 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8975 } 8976 RecipeBuilder.setRecipe(Instr, Recipe); 8977 VPBB->appendRecipe(Recipe); 8978 continue; 8979 } 8980 8981 // Otherwise, if all widening options failed, Instruction is to be 8982 // replicated. This may create a successor for VPBB. 8983 VPBasicBlock *NextVPBB = 8984 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8985 if (NextVPBB != VPBB) { 8986 VPBB = NextVPBB; 8987 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8988 : ""); 8989 } 8990 } 8991 8992 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8993 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8994 } 8995 8996 // After here, VPBB should not be used. 8997 VPBB = nullptr; 8998 8999 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 9000 9001 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 9002 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 9003 "entry block must be set to a VPRegionBlock having a non-empty entry " 9004 "VPBasicBlock"); 9005 RecipeBuilder.fixHeaderPhis(); 9006 9007 // --------------------------------------------------------------------------- 9008 // Transform initial VPlan: Apply previously taken decisions, in order, to 9009 // bring the VPlan to its final state. 9010 // --------------------------------------------------------------------------- 9011 9012 // Apply Sink-After legal constraints. 
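// Each SinkAfter entry moves the recipe created for its first instruction to
// just after the recipe created for its second one, typically so that the
// previous value of a fixed-order recurrence is defined before its users.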
9013 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9014 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9015 if (Region && Region->isReplicator()) { 9016 assert(Region->getNumSuccessors() == 1 && 9017 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9018 assert(R->getParent()->size() == 1 && 9019 "A recipe in an original replicator region must be the only " 9020 "recipe in its block"); 9021 return Region; 9022 } 9023 return nullptr; 9024 }; 9025 for (const auto &Entry : SinkAfter) { 9026 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9027 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9028 9029 auto *TargetRegion = GetReplicateRegion(Target); 9030 auto *SinkRegion = GetReplicateRegion(Sink); 9031 if (!SinkRegion) { 9032 // If the sink source is not a replicate region, sink the recipe directly. 9033 if (TargetRegion) { 9034 // The target is in a replication region, make sure to move Sink to 9035 // the block after it, not into the replication region itself. 9036 VPBasicBlock *NextBlock = 9037 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9038 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9039 } else 9040 Sink->moveAfter(Target); 9041 continue; 9042 } 9043 9044 // The sink source is in a replicate region. Unhook the region from the CFG. 9045 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9046 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9047 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9048 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9049 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9050 9051 if (TargetRegion) { 9052 // The target recipe is also in a replicate region, move the sink region 9053 // after the target region. 9054 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9055 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9056 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9057 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9058 } else { 9059 // The sink source is in a replicate region, we need to move the whole 9060 // replicate region, which should only contain a single recipe in the 9061 // main block. 9062 auto *SplitBlock = 9063 Target->getParent()->splitAt(std::next(Target->getIterator())); 9064 9065 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9066 9067 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9068 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9069 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9070 } 9071 } 9072 9073 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9074 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9075 9076 // Now that sink-after is done, move induction recipes for optimized truncates 9077 // to the phi section of the header block. 9078 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9079 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9080 9081 // Adjust the recipes for any inloop reductions. 9082 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 9083 RecipeBuilder, Range.Start); 9084 9085 // Introduce a recipe to combine the incoming and previous values of a 9086 // fixed-order recurrence. 
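// Conceptually, for a recurrence phi '%p = phi [ %init, %ph ], [ %x, %latch ]'
// a 'first-order-recurrence-splice(%p, %x)' is created which concatenates the
// last lane(s) of the previous vector iteration with the current values, and
// all users of %p are redirected to the splice.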
9087 for (VPRecipeBase &R : 9088 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9089 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9090 if (!RecurPhi) 9091 continue; 9092 9093 VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe(); 9094 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed 9095 // to terminate. 9096 while (auto *PrevPhi = 9097 dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe)) 9098 PrevRecipe = &PrevPhi->getBackedgeRecipe(); 9099 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9100 auto *Region = GetReplicateRegion(PrevRecipe); 9101 if (Region) 9102 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); 9103 if (!InsertBlock) { 9104 InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); 9105 VPBlockUtils::insertBlockAfter(InsertBlock, Region); 9106 } 9107 if (Region || PrevRecipe->isPhi()) 9108 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9109 else 9110 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9111 9112 auto *RecurSplice = cast<VPInstruction>( 9113 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9114 {RecurPhi, RecurPhi->getBackedgeValue()})); 9115 9116 RecurPhi->replaceAllUsesWith(RecurSplice); 9117 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9118 // all users. 9119 RecurSplice->setOperand(0, RecurPhi); 9120 } 9121 9122 // Interleave memory: for each Interleave Group we marked earlier as relevant 9123 // for this VPlan, replace the Recipes widening its memory instructions with a 9124 // single VPInterleaveRecipe at its insertion point. 9125 for (const auto *IG : InterleaveGroups) { 9126 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9127 RecipeBuilder.getRecipe(IG->getInsertPos())); 9128 SmallVector<VPValue *, 4> StoredValues; 9129 for (unsigned i = 0; i < IG->getFactor(); ++i) 9130 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9131 auto *StoreR = 9132 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9133 StoredValues.push_back(StoreR->getStoredValue()); 9134 } 9135 9136 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9137 Recipe->getMask()); 9138 VPIG->insertBefore(Recipe); 9139 unsigned J = 0; 9140 for (unsigned i = 0; i < IG->getFactor(); ++i) 9141 if (Instruction *Member = IG->getMember(i)) { 9142 if (!Member->getType()->isVoidTy()) { 9143 VPValue *OriginalV = Plan->getVPValue(Member); 9144 Plan->removeVPValueFor(Member); 9145 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9146 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9147 J++; 9148 } 9149 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9150 } 9151 } 9152 9153 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9154 VF *= 2) 9155 Plan->addVF(VF); 9156 Plan->setName("Initial VPlan"); 9157 9158 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9159 // in ways that accessing values using original IR values is incorrect. 
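// After disableValue2VPValue(), getVPValue() lookups keyed on IR values
// assert, which catches stale uses of such mappings early.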
9160 Plan->disableValue2VPValue();
9161
9162 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9163 VPlanTransforms::removeDeadRecipes(*Plan);
9164
9165 bool ShouldSimplify = true;
9166 while (ShouldSimplify) {
9167 ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
9168 ShouldSimplify |=
9169 VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
9170 ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9171 }
9172
9173 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9174 VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9175
9176 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9177 return Plan;
9178 }
9179
9180 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9181 // Outer loop handling: Outer loops may require CFG and instruction level
9182 // transformations before even evaluating whether vectorization is profitable.
9183 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9184 // the vectorization pipeline.
9185 assert(!OrigLoop->isInnermost());
9186 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9187
9188 // Create new empty VPlan
9189 auto Plan = std::make_unique<VPlan>();
9190
9191 // Build hierarchical CFG
9192 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9193 HCFGBuilder.buildHierarchicalCFG();
9194
9195 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9196 VF *= 2)
9197 Plan->addVF(VF);
9198
9199 SmallPtrSet<Instruction *, 1> DeadInstructions;
9200 VPlanTransforms::VPInstructionsToVPRecipes(
9201 OrigLoop, Plan,
9202 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9203 DeadInstructions, *PSE.getSE(), *TLI);
9204
9205 // Remove the existing terminator of the exiting block of the top-most region.
9206 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9207 auto *Term =
9208 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9209 Term->eraseFromParent();
9210
9211 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9212 true, CM.useActiveLaneMaskForControlFlow());
9213 return Plan;
9214 }
9215
9216 // Adjust the recipes for reductions. For in-loop reductions the chain of
9217 // instructions leading from the loop exit instr to the phi needs to be converted
9218 // to reductions, with one operand being vector and the other being the scalar
9219 // reduction chain. For other reductions, a select is introduced between the phi
9220 // and live-out recipes when folding the tail.
9221 void LoopVectorizationPlanner::adjustRecipesForReductions(
9222 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9223 ElementCount MinVF) {
9224 for (const auto &Reduction : CM.getInLoopReductionChains()) {
9225 PHINode *Phi = Reduction.first;
9226 const RecurrenceDescriptor &RdxDesc =
9227 Legal->getReductionVars().find(Phi)->second;
9228 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9229
9230 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9231 continue;
9232
9233 // ReductionOperations are ordered top-down from the phi's use to the
9234 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9235 // which of the two operands will remain scalar and which will be reduced.
9236 // For minmax the chain will be the select instructions.
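// For example, for an in-loop integer add reduction 'sum += a[i]', the
// widened add below is replaced by a VPReductionRecipe computing
// 'sum.next = sum + reduce.add(<a[i..i+VF-1]>)', so the chain itself stays
// scalar.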
9237 Instruction *Chain = Phi; 9238 for (Instruction *R : ReductionOperations) { 9239 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9240 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9241 9242 VPValue *ChainOp = Plan->getVPValue(Chain); 9243 unsigned FirstOpId; 9244 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9245 "Only min/max recurrences allowed for inloop reductions"); 9246 // Recognize a call to the llvm.fmuladd intrinsic. 9247 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9248 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9249 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9250 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9251 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9252 "Expected to replace a VPWidenSelectSC"); 9253 FirstOpId = 1; 9254 } else { 9255 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9256 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9257 "Expected to replace a VPWidenSC"); 9258 FirstOpId = 0; 9259 } 9260 unsigned VecOpId = 9261 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9262 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9263 9264 VPValue *CondOp = nullptr; 9265 if (CM.blockNeedsPredicationForAnyReason(R->getParent())) { 9266 VPBuilder::InsertPointGuard Guard(Builder); 9267 Builder.setInsertPoint(WidenRecipe->getParent(), 9268 WidenRecipe->getIterator()); 9269 CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan); 9270 } 9271 9272 if (IsFMulAdd) { 9273 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9274 // need to create an fmul recipe to use as the vector operand for the 9275 // fadd reduction. 9276 VPInstruction *FMulRecipe = new VPInstruction( 9277 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9278 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9279 WidenRecipe->getParent()->insert(FMulRecipe, 9280 WidenRecipe->getIterator()); 9281 VecOp = FMulRecipe; 9282 } 9283 VPReductionRecipe *RedRecipe = 9284 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9285 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9286 Plan->removeVPValueFor(R); 9287 Plan->addVPValue(R, RedRecipe); 9288 // Append the recipe to the end of the VPBasicBlock because we need to 9289 // ensure that it comes after all of it's inputs, including CondOp. 9290 WidenRecipe->getParent()->appendRecipe(RedRecipe); 9291 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9292 WidenRecipe->eraseFromParent(); 9293 9294 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9295 VPRecipeBase *CompareRecipe = 9296 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9297 assert(isa<VPWidenRecipe>(CompareRecipe) && 9298 "Expected to replace a VPWidenSC"); 9299 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9300 "Expected no remaining users"); 9301 CompareRecipe->eraseFromParent(); 9302 } 9303 Chain = R; 9304 } 9305 } 9306 9307 // If tail is folded by masking, introduce selects between the phi 9308 // and the live-out instruction of each reduction, at the beginning of the 9309 // dedicated latch block. 
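// That is, the latch gets 'select <header mask>, <reduction update>, <phi>',
// so lanes masked off by tail folding keep their previous value.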
9310 if (CM.foldTailByMasking()) { 9311 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9312 for (VPRecipeBase &R : 9313 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9314 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9315 if (!PhiR || PhiR->isInLoop()) 9316 continue; 9317 VPValue *Cond = 9318 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9319 VPValue *Red = PhiR->getBackedgeValue(); 9320 assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && 9321 "reduction recipe must be defined before latch"); 9322 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9323 } 9324 } 9325 } 9326 9327 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9328 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9329 VPSlotTracker &SlotTracker) const { 9330 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9331 IG->getInsertPos()->printAsOperand(O, false); 9332 O << ", "; 9333 getAddr()->printAsOperand(O, SlotTracker); 9334 VPValue *Mask = getMask(); 9335 if (Mask) { 9336 O << ", "; 9337 Mask->printAsOperand(O, SlotTracker); 9338 } 9339 9340 unsigned OpIdx = 0; 9341 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9342 if (!IG->getMember(i)) 9343 continue; 9344 if (getNumStoreOperands() > 0) { 9345 O << "\n" << Indent << " store "; 9346 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9347 O << " to index " << i; 9348 } else { 9349 O << "\n" << Indent << " "; 9350 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9351 O << " = load from index " << i; 9352 } 9353 ++OpIdx; 9354 } 9355 } 9356 #endif 9357 9358 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9359 assert(!State.Instance && "Int or FP induction being replicated."); 9360 9361 Value *Start = getStartValue()->getLiveInIRValue(); 9362 const InductionDescriptor &ID = getInductionDescriptor(); 9363 TruncInst *Trunc = getTruncInst(); 9364 IRBuilderBase &Builder = State.Builder; 9365 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9366 assert(State.VF.isVector() && "must have vector VF"); 9367 9368 // The value from the original loop to which we are mapping the new induction 9369 // variable. 9370 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9371 9372 // Fast-math-flags propagate from the original induction instruction. 9373 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9374 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9375 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9376 9377 // Now do the actual transformations, and start with fetching the step value. 
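// Illustrative sketch for VF=4, UF=2 and step S: part 0 of the vector IV is
// <i, i+S, i+2*S, i+3*S>, part 1 adds splat(4*S) to that, and the value fed
// back to the phi adds splat(8*S), i.e. VF*UF*S per vector iteration.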
9378 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9379 9380 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9381 "Expected either an induction phi-node or a truncate of it!"); 9382 9383 // Construct the initial value of the vector IV in the vector loop preheader 9384 auto CurrIP = Builder.saveIP(); 9385 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9386 Builder.SetInsertPoint(VectorPH->getTerminator()); 9387 if (isa<TruncInst>(EntryVal)) { 9388 assert(Start->getType()->isIntegerTy() && 9389 "Truncation requires an integer type"); 9390 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9391 Step = Builder.CreateTrunc(Step, TruncType); 9392 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9393 } 9394 9395 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9396 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9397 Value *SteppedStart = getStepVector( 9398 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9399 9400 // We create vector phi nodes for both integer and floating-point induction 9401 // variables. Here, we determine the kind of arithmetic we will perform. 9402 Instruction::BinaryOps AddOp; 9403 Instruction::BinaryOps MulOp; 9404 if (Step->getType()->isIntegerTy()) { 9405 AddOp = Instruction::Add; 9406 MulOp = Instruction::Mul; 9407 } else { 9408 AddOp = ID.getInductionOpcode(); 9409 MulOp = Instruction::FMul; 9410 } 9411 9412 // Multiply the vectorization factor by the step using integer or 9413 // floating-point arithmetic as appropriate. 9414 Type *StepType = Step->getType(); 9415 Value *RuntimeVF; 9416 if (Step->getType()->isFloatingPointTy()) 9417 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9418 else 9419 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9420 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9421 9422 // Create a vector splat to use in the induction update. 9423 // 9424 // FIXME: If the step is non-constant, we create the vector splat with 9425 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9426 // handle a constant vector splat. 9427 Value *SplatVF = isa<Constant>(Mul) 9428 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9429 : Builder.CreateVectorSplat(State.VF, Mul); 9430 Builder.restoreIP(CurrIP); 9431 9432 // We may need to add the step a number of times, depending on the unroll 9433 // factor. The last of those goes into the PHI. 9434 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9435 &*State.CFG.PrevBB->getFirstInsertionPt()); 9436 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9437 Instruction *LastInduction = VecInd; 9438 for (unsigned Part = 0; Part < State.UF; ++Part) { 9439 State.set(this, LastInduction, Part); 9440 9441 if (isa<TruncInst>(EntryVal)) 9442 State.addMetadata(LastInduction, EntryVal); 9443 9444 LastInduction = cast<Instruction>( 9445 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9446 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9447 } 9448 9449 LastInduction->setName("vec.ind.next"); 9450 VecInd->addIncoming(SteppedStart, VectorPH); 9451 // Add induction update using an incorrect block temporarily. The phi node 9452 // will be fixed after VPlan execution. Note that at this point the latch 9453 // block cannot be used, as it does not exist yet. 9454 // TODO: Model increment value in VPlan, by turning the recipe into a 9455 // multi-def and a subclass of VPHeaderPHIRecipe. 
9456 VecInd->addIncoming(LastInduction, VectorPH); 9457 } 9458 9459 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9460 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9461 "Not a pointer induction according to InductionDescriptor!"); 9462 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9463 "Unexpected type."); 9464 9465 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9466 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9467 9468 if (onlyScalarsGenerated(State.VF)) { 9469 // This is the normalized GEP that starts counting at zero. 9470 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9471 CanonicalIV, IndDesc.getStep()->getType()); 9472 // Determine the number of scalars we need to generate for each unroll 9473 // iteration. If the instruction is uniform, we only need to generate the 9474 // first lane. Otherwise, we generate all VF values. 9475 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9476 assert((IsUniform || !State.VF.isScalable()) && 9477 "Cannot scalarize a scalable VF"); 9478 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9479 9480 for (unsigned Part = 0; Part < State.UF; ++Part) { 9481 Value *PartStart = 9482 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9483 9484 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9485 Value *Idx = State.Builder.CreateAdd( 9486 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9487 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9488 9489 Value *Step = State.get(getOperand(1), VPIteration(0, Part)); 9490 Value *SclrGep = emitTransformedIndex( 9491 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9492 SclrGep->setName("next.gep"); 9493 State.set(this, SclrGep, VPIteration(Part, Lane)); 9494 } 9495 } 9496 return; 9497 } 9498 9499 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9500 "Induction step not a SCEV constant!"); 9501 Type *PhiType = IndDesc.getStep()->getType(); 9502 9503 // Build a pointer phi 9504 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9505 Type *ScStValueType = ScalarStartValue->getType(); 9506 PHINode *NewPointerPhi = 9507 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9508 9509 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9510 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9511 9512 // A pointer induction, performed by using a gep 9513 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9514 9515 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 9516 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9517 Value *NumUnrolledElems = 9518 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9519 Value *InductionGEP = GetElementPtrInst::Create( 9520 IndDesc.getElementType(), NewPointerPhi, 9521 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9522 InductionLoc); 9523 // Add induction update using an incorrect block temporarily. The phi node 9524 // will be fixed after VPlan execution. Note that at this point the latch 9525 // block cannot be used, as it does not exist yet. 9526 // TODO: Model increment value in VPlan, by turning the recipe into a 9527 // multi-def and a subclass of VPHeaderPHIRecipe. 9528 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9529 9530 // Create UF many actual address geps that use the pointer 9531 // phi as base and a vectorized version of the step value 9532 // (<step*0, ..., step*N>) as offset. 
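// Illustrative sketch for VF=4, UF=2 and scalar step S (in elements): part 0
// uses offsets <0,1,2,3>*S and part 1 uses offsets <4,5,6,7>*S off the
// pointer phi, while the phi itself advances by 8*S elements per iteration.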
9533 for (unsigned Part = 0; Part < State.UF; ++Part) { 9534 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9535 Value *StartOffsetScalar = 9536 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9537 Value *StartOffset = 9538 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9539 // Create a vector of consecutive numbers from zero to VF. 9540 StartOffset = State.Builder.CreateAdd( 9541 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9542 9543 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) && 9544 "scalar step must be the same across all parts"); 9545 Value *GEP = State.Builder.CreateGEP( 9546 IndDesc.getElementType(), NewPointerPhi, 9547 State.Builder.CreateMul( 9548 StartOffset, 9549 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9550 "vector.gep")); 9551 State.set(this, GEP, Part); 9552 } 9553 } 9554 9555 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9556 assert(!State.Instance && "VPDerivedIVRecipe being replicated."); 9557 9558 // Fast-math-flags propagate from the original induction instruction. 9559 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9560 if (IndDesc.getInductionBinOp() && 9561 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9562 State.Builder.setFastMathFlags( 9563 IndDesc.getInductionBinOp()->getFastMathFlags()); 9564 9565 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9566 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9567 Value *DerivedIV = 9568 emitTransformedIndex(State.Builder, CanonicalIV, 9569 getStartValue()->getLiveInIRValue(), Step, IndDesc); 9570 DerivedIV->setName("offset.idx"); 9571 if (ResultTy != DerivedIV->getType()) { 9572 assert(Step->getType()->isIntegerTy() && 9573 "Truncation requires an integer step"); 9574 DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy); 9575 } 9576 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); 9577 9578 State.set(this, DerivedIV, VPIteration(0, 0)); 9579 } 9580 9581 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9582 // Fast-math-flags propagate from the original induction instruction. 9583 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9584 if (IndDesc.getInductionBinOp() && 9585 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9586 State.Builder.setFastMathFlags( 9587 IndDesc.getInductionBinOp()->getFastMathFlags()); 9588 9589 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 9590 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9591 9592 buildScalarSteps(BaseIV, Step, IndDesc, this, State); 9593 } 9594 9595 void VPInterleaveRecipe::execute(VPTransformState &State) { 9596 assert(!State.Instance && "Interleave group being replicated."); 9597 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9598 getStoredValues(), getMask()); 9599 } 9600 9601 void VPReductionRecipe::execute(VPTransformState &State) { 9602 assert(!State.Instance && "Reduction being replicated."); 9603 Value *PrevInChain = State.get(getChainOp(), 0); 9604 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9605 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9606 // Propagate the fast-math flags carried by the underlying instruction. 
9607 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9608 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9609 for (unsigned Part = 0; Part < State.UF; ++Part) { 9610 Value *NewVecOp = State.get(getVecOp(), Part); 9611 if (VPValue *Cond = getCondOp()) { 9612 Value *NewCond = State.get(Cond, Part); 9613 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9614 Value *Iden = RdxDesc->getRecurrenceIdentity( 9615 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9616 Value *IdenVec = 9617 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9618 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9619 NewVecOp = Select; 9620 } 9621 Value *NewRed; 9622 Value *NextInChain; 9623 if (IsOrdered) { 9624 if (State.VF.isVector()) 9625 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9626 PrevInChain); 9627 else 9628 NewRed = State.Builder.CreateBinOp( 9629 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9630 NewVecOp); 9631 PrevInChain = NewRed; 9632 } else { 9633 PrevInChain = State.get(getChainOp(), Part); 9634 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9635 } 9636 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9637 NextInChain = 9638 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9639 NewRed, PrevInChain); 9640 } else if (IsOrdered) 9641 NextInChain = NewRed; 9642 else 9643 NextInChain = State.Builder.CreateBinOp( 9644 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9645 PrevInChain); 9646 State.set(this, NextInChain, Part); 9647 } 9648 } 9649 9650 void VPReplicateRecipe::execute(VPTransformState &State) { 9651 Instruction *UI = getUnderlyingInstr(); 9652 if (State.Instance) { // Generate a single instance. 9653 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9654 State.ILV->scalarizeInstruction(UI, this, *State.Instance, 9655 IsPredicated, State); 9656 // Insert scalar instance packing it into a vector. 9657 if (AlsoPack && State.VF.isVector()) { 9658 // If we're constructing lane 0, initialize to start from poison. 9659 if (State.Instance->Lane.isFirstLane()) { 9660 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9661 Value *Poison = PoisonValue::get( 9662 VectorType::get(UI->getType(), State.VF)); 9663 State.set(this, Poison, State.Instance->Part); 9664 } 9665 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9666 } 9667 return; 9668 } 9669 9670 if (IsUniform) { 9671 // If the recipe is uniform across all parts (instead of just per VF), only 9672 // generate a single instance. 9673 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) && 9674 all_of(operands(), [](VPValue *Op) { 9675 return Op->isDefinedOutsideVectorRegions(); 9676 })) { 9677 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated, 9678 State); 9679 if (user_begin() != user_end()) { 9680 for (unsigned Part = 1; Part < State.UF; ++Part) 9681 State.set(this, State.get(this, VPIteration(0, 0)), 9682 VPIteration(Part, 0)); 9683 } 9684 return; 9685 } 9686 9687 // Uniform within VL means we need to generate lane 0 only for each 9688 // unrolled copy. 9689 for (unsigned Part = 0; Part < State.UF; ++Part) 9690 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), 9691 IsPredicated, State); 9692 return; 9693 } 9694 9695 // A store of a loop varying value to a loop invariant address only 9696 // needs only the last copy of the store. 
9697 if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) { 9698 auto Lane = VPLane::getLastLaneForVF(State.VF); 9699 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated, 9700 State); 9701 return; 9702 } 9703 9704 // Generate scalar instances for all VF lanes of all UF parts. 9705 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9706 const unsigned EndLane = State.VF.getKnownMinValue(); 9707 for (unsigned Part = 0; Part < State.UF; ++Part) 9708 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9709 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), 9710 IsPredicated, State); 9711 } 9712 9713 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9714 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9715 9716 // Attempt to issue a wide load. 9717 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9718 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9719 9720 assert((LI || SI) && "Invalid Load/Store instruction"); 9721 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9722 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9723 9724 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9725 9726 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9727 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9728 bool CreateGatherScatter = !Consecutive; 9729 9730 auto &Builder = State.Builder; 9731 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9732 bool isMaskRequired = getMask(); 9733 if (isMaskRequired) 9734 for (unsigned Part = 0; Part < State.UF; ++Part) 9735 BlockInMaskParts[Part] = State.get(getMask(), Part); 9736 9737 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9738 // Calculate the pointer for the specific unroll-part. 9739 GetElementPtrInst *PartPtr = nullptr; 9740 9741 bool InBounds = false; 9742 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9743 InBounds = gep->isInBounds(); 9744 if (Reverse) { 9745 // If the address is consecutive but reversed, then the 9746 // wide store needs to start at the last vector element. 9747 // RunTimeVF = VScale * VF.getKnownMinValue() 9748 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9749 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9750 // NumElt = -Part * RunTimeVF 9751 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9752 // LastLane = 1 - RunTimeVF 9753 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9754 PartPtr = 9755 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9756 PartPtr->setIsInBounds(InBounds); 9757 PartPtr = cast<GetElementPtrInst>( 9758 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9759 PartPtr->setIsInBounds(InBounds); 9760 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9761 BlockInMaskParts[Part] = 9762 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9763 } else { 9764 Value *Increment = 9765 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9766 PartPtr = cast<GetElementPtrInst>( 9767 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9768 PartPtr->setIsInBounds(InBounds); 9769 } 9770 9771 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9772 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9773 }; 9774 9775 // Handle Stores: 9776 if (SI) { 9777 State.setDebugLocFromInst(SI); 9778 9779 for (unsigned Part = 0; Part < State.UF; ++Part) { 9780 Instruction *NewSI = nullptr; 9781 Value *StoredVal = State.get(StoredValue, Part); 9782 if (CreateGatherScatter) { 9783 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9784 Value *VectorGep = State.get(getAddr(), Part); 9785 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9786 MaskPart); 9787 } else { 9788 if (Reverse) { 9789 // If we store to reverse consecutive memory locations, then we need 9790 // to reverse the order of elements in the stored value. 9791 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9792 // We don't want to update the value in the map as it might be used in 9793 // another expression. So don't call resetVectorValue(StoredVal). 9794 } 9795 auto *VecPtr = 9796 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9797 if (isMaskRequired) 9798 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9799 BlockInMaskParts[Part]); 9800 else 9801 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9802 } 9803 State.addMetadata(NewSI, SI); 9804 } 9805 return; 9806 } 9807 9808 // Handle loads. 9809 assert(LI && "Must have a load instruction"); 9810 State.setDebugLocFromInst(LI); 9811 for (unsigned Part = 0; Part < State.UF; ++Part) { 9812 Value *NewLI; 9813 if (CreateGatherScatter) { 9814 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9815 Value *VectorGep = State.get(getAddr(), Part); 9816 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9817 nullptr, "wide.masked.gather"); 9818 State.addMetadata(NewLI, LI); 9819 } else { 9820 auto *VecPtr = 9821 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9822 if (isMaskRequired) 9823 NewLI = Builder.CreateMaskedLoad( 9824 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9825 PoisonValue::get(DataTy), "wide.masked.load"); 9826 else 9827 NewLI = 9828 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9829 9830 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9831 State.addMetadata(NewLI, LI); 9832 if (Reverse) 9833 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9834 } 9835 9836 State.set(getVPSingleValue(), NewLI, Part); 9837 } 9838 } 9839 9840 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9841 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9842 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9843 // for predication. 
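// For example (an assumed scenario): a function marked optsize returns
// CM_ScalarEpilogueNotAllowedOptSize regardless of hints, while a loop whose
// predicate hint is FK_Enabled (and which is not being optimized for size,
// with no overriding command-line option) returns
// CM_ScalarEpilogueNotNeededUsePredicate, i.e. tail folding is requested
// instead of emitting a scalar remainder loop.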
9844 static ScalarEpilogueLowering getScalarEpilogueLowering( 9845 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9846 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9847 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9848 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9849 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9850 // don't look at hints or options, and don't request a scalar epilogue. 9851 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9852 // LoopAccessInfo (due to code dependency and not being able to reliably get 9853 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9854 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9855 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9856 // back to the old way and vectorize with versioning when forced. See D81345.) 9857 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9858 PGSOQueryType::IRPass) && 9859 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9860 return CM_ScalarEpilogueNotAllowedOptSize; 9861 9862 // 2) If set, obey the directives 9863 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9864 switch (PreferPredicateOverEpilogue) { 9865 case PreferPredicateTy::ScalarEpilogue: 9866 return CM_ScalarEpilogueAllowed; 9867 case PreferPredicateTy::PredicateElseScalarEpilogue: 9868 return CM_ScalarEpilogueNotNeededUsePredicate; 9869 case PreferPredicateTy::PredicateOrDontVectorize: 9870 return CM_ScalarEpilogueNotAllowedUsePredicate; 9871 }; 9872 } 9873 9874 // 3) If set, obey the hints 9875 switch (Hints.getPredicate()) { 9876 case LoopVectorizeHints::FK_Enabled: 9877 return CM_ScalarEpilogueNotNeededUsePredicate; 9878 case LoopVectorizeHints::FK_Disabled: 9879 return CM_ScalarEpilogueAllowed; 9880 }; 9881 9882 // 4) if the TTI hook indicates this is profitable, request predication. 9883 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI)) 9884 return CM_ScalarEpilogueNotNeededUsePredicate; 9885 9886 return CM_ScalarEpilogueAllowed; 9887 } 9888 9889 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9890 // If Values have been set for this Def return the one relevant for \p Part. 9891 if (hasVectorValue(Def, Part)) 9892 return Data.PerPartOutput[Def][Part]; 9893 9894 if (!hasScalarValue(Def, {Part, 0})) { 9895 Value *IRV = Def->getLiveInIRValue(); 9896 Value *B = ILV->getBroadcastInstrs(IRV); 9897 set(Def, B, Part); 9898 return B; 9899 } 9900 9901 Value *ScalarValue = get(Def, {Part, 0}); 9902 // If we aren't vectorizing, we can just copy the scalar map values over 9903 // to the vector map. 9904 if (VF.isScalar()) { 9905 set(Def, ScalarValue, Part); 9906 return ScalarValue; 9907 } 9908 9909 bool IsUniform = vputils::isUniformAfterVectorization(Def); 9910 9911 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9912 // Check if there is a scalar value for the selected lane. 9913 if (!hasScalarValue(Def, {Part, LastLane})) { 9914 // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform. 
9915 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) || 9916 isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) && 9917 "unexpected recipe found to be invariant"); 9918 IsUniform = true; 9919 LastLane = 0; 9920 } 9921 9922 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9923 // Set the insert point after the last scalarized instruction or after the 9924 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 9925 // will directly follow the scalar definitions. 9926 auto OldIP = Builder.saveIP(); 9927 auto NewIP = 9928 isa<PHINode>(LastInst) 9929 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 9930 : std::next(BasicBlock::iterator(LastInst)); 9931 Builder.SetInsertPoint(&*NewIP); 9932 9933 // However, if we are vectorizing, we need to construct the vector values. 9934 // If the value is known to be uniform after vectorization, we can just 9935 // broadcast the scalar value corresponding to lane zero for each unroll 9936 // iteration. Otherwise, we construct the vector values using 9937 // insertelement instructions. Since the resulting vectors are stored in 9938 // State, we will only generate the insertelements once. 9939 Value *VectorValue = nullptr; 9940 if (IsUniform) { 9941 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9942 set(Def, VectorValue, Part); 9943 } else { 9944 // Initialize packing with insertelements to start from undef. 9945 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9946 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9947 set(Def, Undef, Part); 9948 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9949 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9950 VectorValue = get(Def, Part); 9951 } 9952 Builder.restoreIP(OldIP); 9953 return VectorValue; 9954 } 9955 9956 // Process the loop in the VPlan-native vectorization path. This path builds 9957 // VPlan upfront in the vectorization pipeline, which allows to apply 9958 // VPlan-to-VPlan transformations from the very beginning without modifying the 9959 // input LLVM IR. 9960 static bool processLoopInVPlanNativePath( 9961 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9962 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9963 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9964 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9965 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9966 LoopVectorizationRequirements &Requirements) { 9967 9968 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9969 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9970 return false; 9971 } 9972 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9973 Function *F = L->getHeader()->getParent(); 9974 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9975 9976 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9977 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI); 9978 9979 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9980 &Hints, IAI); 9981 // Use the planner for outer loop vectorization. 9982 // TODO: CM is not used at this point inside the planner. Turn CM into an 9983 // optional argument if we don't need it in the future. 9984 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); 9985 9986 // Get user vectorization factor. 
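// (For instance, a '#pragma clang loop vectorize_width(4)' on the source loop
// is expected to surface here as Hints.getWidth() == 4; this is an assumption
// about the frontend-provided loop metadata, not something verified here.)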
9987 ElementCount UserVF = Hints.getWidth(); 9988 9989 CM.collectElementTypesForWidening(); 9990 9991 // Plan how to best vectorize, return the best VF and its cost. 9992 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9993 9994 // If we are stress testing VPlan builds, do not attempt to generate vector 9995 // code. Masked vector code generation support will follow soon. 9996 // Also, do not attempt to vectorize if no vector code will be produced. 9997 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 9998 return false; 9999 10000 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10001 10002 { 10003 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10004 F->getParent()->getDataLayout()); 10005 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10006 VF.Width, 1, LVL, &CM, BFI, PSI, Checks); 10007 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10008 << L->getHeader()->getParent()->getName() << "\"\n"); 10009 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 10010 } 10011 10012 // Mark the loop as already vectorized to avoid vectorizing again. 10013 Hints.setAlreadyVectorized(); 10014 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10015 return true; 10016 } 10017 10018 // Emit a remark if there are stores to floats that required a floating point 10019 // extension. If the vectorized loop was generated with floating point there 10020 // will be a performance penalty from the conversion overhead and the change in 10021 // the vector width. 10022 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10023 SmallVector<Instruction *, 4> Worklist; 10024 for (BasicBlock *BB : L->getBlocks()) { 10025 for (Instruction &Inst : *BB) { 10026 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10027 if (S->getValueOperand()->getType()->isFloatTy()) 10028 Worklist.push_back(S); 10029 } 10030 } 10031 } 10032 10033 // Traverse the floating point stores upwards searching, for floating point 10034 // conversions. 10035 SmallPtrSet<const Instruction *, 4> Visited; 10036 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10037 while (!Worklist.empty()) { 10038 auto *I = Worklist.pop_back_val(); 10039 if (!L->contains(I)) 10040 continue; 10041 if (!Visited.insert(I).second) 10042 continue; 10043 10044 // Emit a remark if the floating point store required a floating 10045 // point conversion. 10046 // TODO: More work could be done to identify the root cause such as a 10047 // constant or a function return type and point the user to it. 10048 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10049 ORE->emit([&]() { 10050 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10051 I->getDebugLoc(), L->getHeader()) 10052 << "floating point conversion changes vector width. " 10053 << "Mixed floating point precision requires an up/down " 10054 << "cast that will negatively impact performance."; 10055 }); 10056 10057 for (Use &Op : I->operands()) 10058 if (auto *OpI = dyn_cast<Instruction>(Op)) 10059 Worklist.push_back(OpI); 10060 } 10061 } 10062 10063 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 10064 VectorizationFactor &VF, 10065 std::optional<unsigned> VScale, Loop *L, 10066 ScalarEvolution &SE) { 10067 InstructionCost CheckCost = Checks.getCost(); 10068 if (!CheckCost.isValid()) 10069 return false; 10070 10071 // When interleaving only scalar and vector cost will be equal, which in turn 10072 // would lead to a divide by 0. Fall back to hard threshold. 
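// (Illustrative example with an assumed threshold: if
// VectorizeMemoryCheckThreshold were 128, an interleave-only decision whose
// runtime checks cost 150 would be rejected below, while one costing 100 would
// be kept.)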
10073 if (VF.Width.isScalar()) { 10074 if (CheckCost > VectorizeMemoryCheckThreshold) { 10075 LLVM_DEBUG( 10076 dbgs() 10077 << "LV: Interleaving only is not profitable due to runtime checks\n"); 10078 return false; 10079 } 10080 return true; 10081 } 10082 10083 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 10084 double ScalarC = *VF.ScalarCost.getValue(); 10085 if (ScalarC == 0) 10086 return true; 10087 10088 // First, compute the minimum iteration count required so that the vector 10089 // loop outperforms the scalar loop. 10090 // The total cost of the scalar loop is 10091 // ScalarC * TC 10092 // where 10093 // * TC is the actual trip count of the loop. 10094 // * ScalarC is the cost of a single scalar iteration. 10095 // 10096 // The total cost of the vector loop is 10097 // RtC + VecC * (TC / VF) + EpiC 10098 // where 10099 // * RtC is the cost of the generated runtime checks 10100 // * VecC is the cost of a single vector iteration. 10101 // * TC is the actual trip count of the loop 10102 // * VF is the vectorization factor 10103 // * EpiCost is the cost of the generated epilogue, including the cost 10104 // of the remaining scalar operations. 10105 // 10106 // Vectorization is profitable once the total vector cost is less than the 10107 // total scalar cost: 10108 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 10109 // 10110 // Now we can compute the minimum required trip count TC as 10111 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC 10112 // 10113 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 10114 // the computations are performed on doubles, not integers and the result 10115 // is rounded up, hence we get an upper estimate of the TC. 10116 unsigned IntVF = VF.Width.getKnownMinValue(); 10117 if (VF.Width.isScalable()) { 10118 unsigned AssumedMinimumVscale = 1; 10119 if (VScale) 10120 AssumedMinimumVscale = *VScale; 10121 IntVF *= AssumedMinimumVscale; 10122 } 10123 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; 10124 double RtC = *CheckCost.getValue(); 10125 double MinTC1 = RtC / (ScalarC - VecCOverVF); 10126 10127 // Second, compute a minimum iteration count so that the cost of the 10128 // runtime checks is only a fraction of the total scalar loop cost. This 10129 // adds a loop-dependent bound on the overhead incurred if the runtime 10130 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC 10131 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 10132 // cost, compute 10133 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 10134 double MinTC2 = RtC * 10 / ScalarC; 10135 10136 // Now pick the larger minimum. If it is not a multiple of VF, choose the 10137 // next closest multiple of VF. This should partly compensate for ignoring 10138 // the epilogue cost. 10139 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); 10140 VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); 10141 10142 LLVM_DEBUG( 10143 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 10144 << VF.MinProfitableTripCount << "\n"); 10145 10146 // Skip vectorization if the expected trip count is less than the minimum 10147 // required trip count. 
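// A worked example with assumed costs: ScalarC = 4, VecC = 10, IntVF = 4 (so
// VecC / VF = 2.5) and RtC = 30 give MinTC1 = 30 / (4 - 2.5) = 20 and
// MinTC2 = 30 * 10 / 4 = 75; the larger value, rounded up to a multiple of the
// VF, yields MinProfitableTripCount = 76. A loop whose expected trip count is
// below that bound is then rejected by the check below.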
10148 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { 10149 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10150 VF.MinProfitableTripCount)) { 10151 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10152 "trip count < minimum profitable VF (" 10153 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10154 << ")\n"); 10155 10156 return false; 10157 } 10158 } 10159 return true; 10160 } 10161 10162 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10163 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10164 !EnableLoopInterleaving), 10165 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10166 !EnableLoopVectorization) {} 10167 10168 bool LoopVectorizePass::processLoop(Loop *L) { 10169 assert((EnableVPlanNativePath || L->isInnermost()) && 10170 "VPlan-native path is not enabled. Only process inner loops."); 10171 10172 #ifndef NDEBUG 10173 const std::string DebugLocStr = getDebugLocString(L); 10174 #endif /* NDEBUG */ 10175 10176 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10177 << L->getHeader()->getParent()->getName() << "' from " 10178 << DebugLocStr << "\n"); 10179 10180 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10181 10182 LLVM_DEBUG( 10183 dbgs() << "LV: Loop hints:" 10184 << " force=" 10185 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10186 ? "disabled" 10187 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10188 ? "enabled" 10189 : "?")) 10190 << " width=" << Hints.getWidth() 10191 << " interleave=" << Hints.getInterleave() << "\n"); 10192 10193 // Function containing loop 10194 Function *F = L->getHeader()->getParent(); 10195 10196 // Looking at the diagnostic output is the only way to determine if a loop 10197 // was vectorized (other than looking at the IR or machine code), so it 10198 // is important to generate an optimization remark for each loop. Most of 10199 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10200 // generated as OptimizationRemark and OptimizationRemarkMissed are 10201 // less verbose reporting vectorized loops and unvectorized loops that may 10202 // benefit from vectorization, respectively. 10203 10204 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10205 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10206 return false; 10207 } 10208 10209 PredicatedScalarEvolution PSE(*SE, *L); 10210 10211 // Check if it is legal to vectorize the loop. 10212 LoopVectorizationRequirements Requirements; 10213 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 10214 &Requirements, &Hints, DB, AC, BFI, PSI); 10215 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10216 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10217 Hints.emitRemarkWithHints(); 10218 return false; 10219 } 10220 10221 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10222 // here. They may require CFG and instruction level transformations before 10223 // even evaluating whether vectorization is profitable. Since we cannot modify 10224 // the incoming IR, we need to build VPlan upfront in the vectorization 10225 // pipeline. 
10226 if (!L->isInnermost())
10227 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10228 ORE, BFI, PSI, Hints, Requirements);
10229
10230 assert(L->isInnermost() && "Inner loop expected.");
10231
10232 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10233 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10234
10235 // If an override option has been passed in for interleaved accesses, use it.
10236 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10237 UseInterleaved = EnableInterleavedMemAccesses;
10238
10239 // Analyze interleaved memory accesses.
10240 if (UseInterleaved)
10241 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10242
10243 // Check the function attributes and profiles to find out if this function
10244 // should be optimized for size.
10245 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10246 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
10247
10248 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10249 // count by optimizing for size, to minimize overheads.
10250 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10251 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10252 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10253 << "This loop is worth vectorizing only if no scalar "
10254 << "iteration overheads are incurred.");
10255 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10256 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10257 else {
10258 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10259 LLVM_DEBUG(dbgs() << "\n");
10260 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10261 } else {
10262 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10263 "small to consider vectorizing.\n");
10264 reportVectorizationFailure(
10265 "The trip count is below the minimal threshold value.",
10266 "loop trip count is too low, avoiding vectorization",
10267 "LowTripCount", ORE, L);
10268 Hints.emitRemarkWithHints();
10269 return false;
10270 }
10271 }
10272 }
10273
10274 // Check the function attributes to see if implicit floats or vectors are
10275 // allowed.
10276 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10277 reportVectorizationFailure(
10278 "Can't vectorize when the NoImplicitFloat attribute is used",
10279 "loop not vectorized due to NoImplicitFloat attribute",
10280 "NoImplicitFloat", ORE, L);
10281 Hints.emitRemarkWithHints();
10282 return false;
10283 }
10284
10285 // Check if the target supports potentially unsafe FP vectorization.
10286 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10287 // for the target we're vectorizing for, to make sure none of the
10288 // additional fp-math flags can help.
10289 if (Hints.isPotentiallyUnsafe() &&
10290 TTI->isFPVectorizationPotentiallyUnsafe()) {
10291 reportVectorizationFailure(
10292 "Potentially unsafe FP op prevents vectorization",
10293 "loop not vectorized due to unsafe FP support.",
10294 "UnsafeFP", ORE, L);
10295 Hints.emitRemarkWithHints();
10296 return false;
10297 }
10298
10299 bool AllowOrderedReductions;
10300 // If the flag is set, use that instead and override the TTI behaviour.
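// (An ordered reduction keeps the original, strictly sequential floating-point
// evaluation order, e.g. ((((s + a0) + a1) + a2) + a3) rather than a tree-wise
// reduction; whether this is efficient in vector form is a target property,
// which the ForceOrderedReductions flag below can override.)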
10301 if (ForceOrderedReductions.getNumOccurrences() > 0) 10302 AllowOrderedReductions = ForceOrderedReductions; 10303 else 10304 AllowOrderedReductions = TTI->enableOrderedReductions(); 10305 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10306 ORE->emit([&]() { 10307 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10308 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10309 ExactFPMathInst->getDebugLoc(), 10310 ExactFPMathInst->getParent()) 10311 << "loop not vectorized: cannot prove it is safe to reorder " 10312 "floating-point operations"; 10313 }); 10314 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10315 "reorder floating-point operations\n"); 10316 Hints.emitRemarkWithHints(); 10317 return false; 10318 } 10319 10320 // Use the cost model. 10321 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10322 F, &Hints, IAI); 10323 CM.collectValuesToIgnore(); 10324 CM.collectElementTypesForWidening(); 10325 10326 // Use the planner for vectorization. 10327 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); 10328 10329 // Get user vectorization factor and interleave count. 10330 ElementCount UserVF = Hints.getWidth(); 10331 unsigned UserIC = Hints.getInterleave(); 10332 10333 // Plan how to best vectorize, return the best VF and its cost. 10334 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10335 10336 VectorizationFactor VF = VectorizationFactor::Disabled(); 10337 unsigned IC = 1; 10338 10339 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10340 F->getParent()->getDataLayout()); 10341 if (MaybeVF) { 10342 VF = *MaybeVF; 10343 // Select the interleave count. 10344 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 10345 10346 unsigned SelectedIC = std::max(IC, UserIC); 10347 // Optimistically generate runtime checks if they are needed. Drop them if 10348 // they turn out to not be profitable. 10349 if (VF.Width.isVector() || SelectedIC > 1) 10350 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10351 10352 // Check if it is profitable to vectorize with runtime checks. 10353 bool ForceVectorization = 10354 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10355 if (!ForceVectorization && 10356 !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, 10357 *PSE.getSE())) { 10358 ORE->emit([&]() { 10359 return OptimizationRemarkAnalysisAliasing( 10360 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10361 L->getHeader()) 10362 << "loop not vectorized: cannot prove it is safe to reorder " 10363 "memory operations"; 10364 }); 10365 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10366 Hints.emitRemarkWithHints(); 10367 return false; 10368 } 10369 } 10370 10371 // Identify the diagnostic messages that should be produced. 10372 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10373 bool VectorizeLoop = true, InterleaveLoop = true; 10374 if (VF.Width.isScalar()) { 10375 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10376 VecDiagMsg = std::make_pair( 10377 "VectorizationNotBeneficial", 10378 "the cost-model indicates that vectorization is not beneficial"); 10379 VectorizeLoop = false; 10380 } 10381 10382 if (!MaybeVF && UserIC > 1) { 10383 // Tell the user interleaving was avoided up-front, despite being explicitly 10384 // requested. 
10385 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10386 "interleaving should be avoided up front\n");
10387 IntDiagMsg = std::make_pair(
10388 "InterleavingAvoided",
10389 "Ignoring UserIC, because interleaving was avoided up front");
10390 InterleaveLoop = false;
10391 } else if (IC == 1 && UserIC <= 1) {
10392 // Tell the user interleaving is not beneficial.
10393 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10394 IntDiagMsg = std::make_pair(
10395 "InterleavingNotBeneficial",
10396 "the cost-model indicates that interleaving is not beneficial");
10397 InterleaveLoop = false;
10398 if (UserIC == 1) {
10399 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10400 IntDiagMsg.second +=
10401 " and is explicitly disabled or interleave count is set to 1";
10402 }
10403 } else if (IC > 1 && UserIC == 1) {
10404 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10405 LLVM_DEBUG(
10406 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10407 IntDiagMsg = std::make_pair(
10408 "InterleavingBeneficialButDisabled",
10409 "the cost-model indicates that interleaving is beneficial "
10410 "but is explicitly disabled or interleave count is set to 1");
10411 InterleaveLoop = false;
10412 }
10413
10414 // Override IC if user provided an interleave count.
10415 IC = UserIC > 0 ? UserIC : IC;
10416
10417 // Emit diagnostic messages, if any.
10418 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10419 if (!VectorizeLoop && !InterleaveLoop) {
10420 // Do not vectorize or interleave the loop.
10421 ORE->emit([&]() {
10422 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10423 L->getStartLoc(), L->getHeader())
10424 << VecDiagMsg.second;
10425 });
10426 ORE->emit([&]() {
10427 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10428 L->getStartLoc(), L->getHeader())
10429 << IntDiagMsg.second;
10430 });
10431 return false;
10432 } else if (!VectorizeLoop && InterleaveLoop) {
10433 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10434 ORE->emit([&]() {
10435 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10436 L->getStartLoc(), L->getHeader())
10437 << VecDiagMsg.second;
10438 });
10439 } else if (VectorizeLoop && !InterleaveLoop) {
10440 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10441 << ") in " << DebugLocStr << '\n');
10442 ORE->emit([&]() {
10443 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10444 L->getStartLoc(), L->getHeader())
10445 << IntDiagMsg.second;
10446 });
10447 } else if (VectorizeLoop && InterleaveLoop) {
10448 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10449 << ") in " << DebugLocStr << '\n');
10450 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10451 }
10452
10453 bool DisableRuntimeUnroll = false;
10454 MDNode *OrigLoopID = L->getLoopID();
10455 {
10456 using namespace ore;
10457 if (!VectorizeLoop) {
10458 assert(IC > 1 && "interleave count should not be 1 or 0");
10459 // If we decided that it is not profitable to vectorize the loop, then
10460 // interleave it.
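// (In this branch VF is scalar, so the InnerLoopUnroller below effectively
// executes the VPlan with VF = 1 and UF = IC; with an assumed IC of 4, for
// example, the scalar loop body is replicated four times per iteration of the
// new loop.)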
10461 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10462 &CM, BFI, PSI, Checks); 10463 10464 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10465 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10466 10467 ORE->emit([&]() { 10468 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10469 L->getHeader()) 10470 << "interleaved loop (interleaved count: " 10471 << NV("InterleaveCount", IC) << ")"; 10472 }); 10473 } else { 10474 // If we decided that it is *legal* to vectorize the loop, then do it. 10475 10476 // Consider vectorizing the epilogue too if it's profitable. 10477 VectorizationFactor EpilogueVF = 10478 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10479 if (EpilogueVF.Width.isVector()) { 10480 10481 // The first pass vectorizes the main loop and creates a scalar epilogue 10482 // to be vectorized by executing the plan (potentially with a different 10483 // factor) again shortly afterwards. 10484 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10485 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10486 EPI, &LVL, &CM, BFI, PSI, Checks); 10487 10488 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10489 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10490 DT, true); 10491 ++LoopsVectorized; 10492 10493 // Second pass vectorizes the epilogue and adjusts the control flow 10494 // edges from the first pass. 10495 EPI.MainLoopVF = EPI.EpilogueVF; 10496 EPI.MainLoopUF = EPI.EpilogueUF; 10497 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10498 ORE, EPI, &LVL, &CM, BFI, PSI, 10499 Checks); 10500 10501 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10502 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10503 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10504 Header->setName("vec.epilog.vector.body"); 10505 10506 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, 10507 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated 10508 // before vectorizing the epilogue loop. 10509 for (VPRecipeBase &R : Header->phis()) { 10510 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10511 continue; 10512 10513 Value *ResumeV = nullptr; 10514 // TODO: Move setting of resume values to prepareToExecute. 10515 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10516 ResumeV = MainILV.getReductionResumeValue( 10517 ReductionPhi->getRecurrenceDescriptor()); 10518 } else { 10519 // Create induction resume values for both widened pointer and 10520 // integer/fp inductions and update the start value of the induction 10521 // recipes to use the resume value. 
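// (Illustrative, assumed factors: if the main loop ran with VF = 8 and UF = 2
// while the epilogue uses VF = 4, the induction has advanced by some multiple
// of 16 when the main vector loop exits; that value is the resume value and
// becomes the start value of the matching induction recipe in the epilogue
// plan.)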
10522 PHINode *IndPhi = nullptr; 10523 const InductionDescriptor *ID; 10524 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) { 10525 IndPhi = cast<PHINode>(Ind->getUnderlyingValue()); 10526 ID = &Ind->getInductionDescriptor(); 10527 } else { 10528 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R); 10529 IndPhi = WidenInd->getPHINode(); 10530 ID = &WidenInd->getInductionDescriptor(); 10531 } 10532 10533 ResumeV = MainILV.createInductionResumeValue( 10534 IndPhi, *ID, {EPI.MainLoopIterationCountCheck}); 10535 } 10536 assert(ResumeV && "Must have a resume value"); 10537 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV); 10538 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10539 } 10540 10541 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10542 DT, true); 10543 ++LoopsEpilogueVectorized; 10544 10545 if (!MainILV.areSafetyChecksAdded()) 10546 DisableRuntimeUnroll = true; 10547 } else { 10548 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10549 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10550 PSI, Checks); 10551 10552 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10553 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10554 ++LoopsVectorized; 10555 10556 // Add metadata to disable runtime unrolling a scalar loop when there 10557 // are no runtime checks about strides and memory. A scalar loop that is 10558 // rarely used is not worth unrolling. 10559 if (!LB.areSafetyChecksAdded()) 10560 DisableRuntimeUnroll = true; 10561 } 10562 // Report the vectorization decision. 10563 ORE->emit([&]() { 10564 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10565 L->getHeader()) 10566 << "vectorized loop (vectorization width: " 10567 << NV("VectorizationFactor", VF.Width) 10568 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10569 }); 10570 } 10571 10572 if (ORE->allowExtraAnalysis(LV_NAME)) 10573 checkMixedPrecision(L, ORE); 10574 } 10575 10576 std::optional<MDNode *> RemainderLoopID = 10577 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10578 LLVMLoopVectorizeFollowupEpilogue}); 10579 if (RemainderLoopID) { 10580 L->setLoopID(*RemainderLoopID); 10581 } else { 10582 if (DisableRuntimeUnroll) 10583 AddRuntimeUnrollDisableMetaData(L); 10584 10585 // Mark the loop as already vectorized to avoid vectorizing again. 10586 Hints.setAlreadyVectorized(); 10587 } 10588 10589 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10590 return true; 10591 } 10592 10593 LoopVectorizeResult LoopVectorizePass::runImpl( 10594 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10595 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10596 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, 10597 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10598 SE = &SE_; 10599 LI = &LI_; 10600 TTI = &TTI_; 10601 DT = &DT_; 10602 BFI = &BFI_; 10603 TLI = TLI_; 10604 AC = &AC_; 10605 LAIs = &LAIs_; 10606 DB = &DB_; 10607 ORE = &ORE_; 10608 PSI = PSI_; 10609 10610 // Don't attempt if 10611 // 1. the target claims to have no vector registers, and 10612 // 2. interleaving won't help ILP. 10613 // 10614 // The second condition is necessary because, even if the target has no 10615 // vector registers, loop vectorization may still enable scalar 10616 // interleaving. 
10617 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10618 TTI->getMaxInterleaveFactor(1) < 2)
10619 return LoopVectorizeResult(false, false);
10620
10621 bool Changed = false, CFGChanged = false;
10622
10623 // The vectorizer requires loops to be in simplified form.
10624 // Since simplification may add new inner loops, it has to run before the
10625 // legality and profitability checks. This means running the loop vectorizer
10626 // will simplify all loops, regardless of whether anything ends up being
10627 // vectorized.
10628 for (const auto &L : *LI)
10629 Changed |= CFGChanged |=
10630 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10631
10632 // Build up a worklist of inner-loops to vectorize. This is necessary as
10633 // the act of vectorizing or partially unrolling a loop creates new loops
10634 // and can invalidate iterators across the loops.
10635 SmallVector<Loop *, 8> Worklist;
10636
10637 for (Loop *L : *LI)
10638 collectSupportedLoops(*L, LI, ORE, Worklist);
10639
10640 LoopsAnalyzed += Worklist.size();
10641
10642 // Now walk the identified inner loops.
10643 while (!Worklist.empty()) {
10644 Loop *L = Worklist.pop_back_val();
10645
10646 // For the inner loops we actually process, form LCSSA to simplify the
10647 // transform.
10648 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10649
10650 Changed |= CFGChanged |= processLoop(L);
10651
10652 if (Changed)
10653 LAIs->clear();
10654 }
10655
10656 // Process each loop nest in the function.
10657 return LoopVectorizeResult(Changed, CFGChanged);
10658 }
10659
10660 PreservedAnalyses LoopVectorizePass::run(Function &F,
10661 FunctionAnalysisManager &AM) {
10662 auto &LI = AM.getResult<LoopAnalysis>(F);
10663 // There are no loops in the function. Return before computing other expensive
10664 // analyses.
10665 if (LI.empty())
10666 return PreservedAnalyses::all();
10667 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10668 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10669 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10670 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10671 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10672 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10673 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10674 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10675
10676 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10677 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10678 ProfileSummaryInfo *PSI =
10679 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10680 LoopVectorizeResult Result =
10681 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10682 if (!Result.MadeAnyChange)
10683 return PreservedAnalyses::all();
10684 PreservedAnalyses PA;
10685
10686 // We currently do not preserve loopinfo/dominator analyses with outer loop
10687 // vectorization. Until this is addressed, mark these analyses as preserved
10688 // only for the non-VPlan-native path.
10689 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10690 if (!EnableVPlanNativePath) {
10691 PA.preserve<LoopAnalysis>();
10692 PA.preserve<DominatorTreeAnalysis>();
10693 }
10694
10695 if (Result.MadeCFGChange) {
10696 // Making CFG changes likely means a loop got vectorized. Indicate that
10697 // extra simplification passes should be run.
10698 // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
10699 // be run if runtime checks have been added.
10700 AM.getResult<ShouldRunExtraVectorPasses>(F); 10701 PA.preserve<ShouldRunExtraVectorPasses>(); 10702 } else { 10703 PA.preserveSet<CFGAnalyses>(); 10704 } 10705 return PA; 10706 } 10707 10708 void LoopVectorizePass::printPipeline( 10709 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10710 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10711 OS, MapClassName2PassName); 10712 10713 OS << "<"; 10714 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10715 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10716 OS << ">"; 10717 } 10718
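// For reference, assuming this pass is registered as "loop-vectorize" in the
// new pass manager and is constructed with default LoopVectorizeOptions, the
// pipeline text printed above would look like:
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>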