//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
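//
// For illustration only (an editorial sketch, not code emitted verbatim by
// this pass): widening by a vectorization factor VF=4 conceptually turns
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// into a loop that processes four elements per iteration using SIMD
// instructions, plus a remainder loop for the final n % 4 iterations:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     a[i:i+4] = b[i:i+4] + c[i:i+4];  // one wide vector add (pseudocode)
//   for (; i < n; ++i)                 // scalar epilogue
//     a[i] = b[i] + c[i];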
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization "
                          "if tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));
Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 311 static cl::opt<unsigned> NumberOfStoresToPredicate( 312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 313 cl::desc("Max number of stores to be predicated behind an if.")); 314 315 static cl::opt<bool> EnableIndVarRegisterHeur( 316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 317 cl::desc("Count the induction variable only once when interleaving")); 318 319 static cl::opt<bool> EnableCondStoresVectorization( 320 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 321 cl::desc("Enable if predication of stores during vectorization.")); 322 323 static cl::opt<unsigned> MaxNestedScalarReductionIC( 324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 325 cl::desc("The maximum interleave count to use when interleaving a scalar " 326 "reduction in a nested loop.")); 327 328 static cl::opt<bool> 329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 330 cl::Hidden, 331 cl::desc("Prefer in-loop vector reductions, " 332 "overriding the targets preference.")); 333 334 static cl::opt<bool> ForceOrderedReductions( 335 "force-ordered-reductions", cl::init(false), cl::Hidden, 336 cl::desc("Enable the vectorisation of loops with in-order (strict) " 337 "FP reductions")); 338 339 static cl::opt<bool> PreferPredicatedReductionSelect( 340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 341 cl::desc( 342 "Prefer predicating a reduction operation over an after loop select.")); 343 344 cl::opt<bool> EnableVPlanNativePath( 345 "enable-vplan-native-path", cl::init(false), cl::Hidden, 346 cl::desc("Enable VPlan-native vectorization path with " 347 "support for outer loop vectorization.")); 348 349 // FIXME: Remove this switch once we have divergence analysis. Currently we 350 // assume divergent non-backedge branches when this switch is true. 
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
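
// Illustrative note (editorial sketch): for a loop like
// 'for (i = 0; i < 100; ++i)' SCEV yields an exact trip count of 100 and the
// later heuristics are not consulted; for a data-dependent bound, the profile
// estimate (if present) or the SCEV maximum trip count is used instead.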

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;
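
  // Illustrative note (editorial sketch): with VF=4 and UF=2 a single i32
  // value from the original loop is represented by two <4 x i32> values in
  // the widened loop, VectorParts = { Part0, Part1 }, covering 8 consecutive
  // original iterations per widened iteration.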

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
                             Value *Start, TruncInst *Trunc, VPValue *Def,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then the class member's
  /// Builder is used.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
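
  // Illustrative note (editorial sketch): when an instruction is scalarized
  // with VF=4 and UF=2, ScalarParts holds 2 parts of 4 scalar clones each,
  // i.e. Scalars[Part][Lane] for Part in {0,1} and Lane in {0..3}, one clone
  // per original iteration covered by the widened iteration.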

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *
  getStepVector(Value *Val, Value *StartIdx, Value *Step,
                Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPTransformState &State);
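
  // Illustrative note (editorial sketch): for an integer IV with Step=2,
  // StartIdx=0 and VF=4, getStepVector applied to a broadcast of the IV adds
  // <0, 2, 4, 6> to the lanes, producing the per-lane induction values for
  // one widened iteration; buildScalarSteps produces the same values as
  // individual scalars when the IV must stay scalar.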

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID,
                              BasicBlock *VectorHeader) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
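
  // Illustrative note (editorial sketch of the generated control flow, not a
  // verbatim dump): the skeleton produced by createVectorizedLoopSkeleton
  // roughly looks like
  //
  //   iteration-count check --(too few iterations)--> scalar preheader
  //        |
  //   SCEV / memory runtime checks --(check fails)--> scalar preheader
  //        |
  //   vector preheader -> vector body (loop) -> middle block
  //                                                |-> scalar preheader ->
  //                                                |   scalar loop -> exit
  //                                                '-> exit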

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after
  ///    vectorization the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;
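
  // Illustrative note (editorial sketch): with an original trip count of 100,
  // VF=4 and UF=2, the widened loop covers VectorTripCount = 100 - 100 % 8 =
  // 96 iterations of the original loop, and the remaining 4 iterations run in
  // the scalar epilogue (or are folded into a predicated tail).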

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
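
// Illustrative note (editorial sketch): with MainLoopVF=8 and EpilogueVF=4, a
// loop with trip count 29 runs 3 iterations of the VF=8 main vector loop (24
// elements), 1 iteration of the VF=4 vector epilogue (4 elements), and leaves
// the final element to the scalar remainder loop.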

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                              int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
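
// Illustrative note (editorial sketch): for a fixed VF of 4 and Step=2,
// createStepForVF folds to the constant 8; for a scalable VF of <vscale x 4>
// it instead emits the constant 8 scaled by the runtime vscale value.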

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
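
// Illustrative note (editorial sketch): this ordering places all fixed-width
// factors before all scalable ones, e.g. 2 < 8 < (vscale x 2) < (vscale x 8),
// with each group ordered by its minimum element count.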

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
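
  // Illustrative note (editorial sketch): interleaving unrolls the vector
  // loop to expose independent instruction chains; with VF=4 and an
  // interleave count of 2, each widened iteration issues two <4 x ...>
  // operations per original instruction and advances the induction variable
  // by 8.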
1333 /// This function takes cost-based decisions for Load/Store instructions 1334 /// and collects them in a map. This decisions map is used for building 1335 /// the lists of loop-uniform and loop-scalar instructions. 1336 /// The calculated cost is saved with widening decision in order to 1337 /// avoid redundant calculations. 1338 void setCostBasedWideningDecision(ElementCount VF); 1339 1340 /// A struct that represents some properties of the register usage 1341 /// of a loop. 1342 struct RegisterUsage { 1343 /// Holds the number of loop invariant values that are used in the loop. 1344 /// The key is ClassID of target-provided register class. 1345 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1346 /// Holds the maximum number of concurrent live intervals in the loop. 1347 /// The key is ClassID of target-provided register class. 1348 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1349 }; 1350 1351 /// \return Returns information about the register usages of the loop for the 1352 /// given vectorization factors. 1353 SmallVector<RegisterUsage, 8> 1354 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1355 1356 /// Collect values we want to ignore in the cost model. 1357 void collectValuesToIgnore(); 1358 1359 /// Collect all element types in the loop for which widening is needed. 1360 void collectElementTypesForWidening(); 1361 1362 /// Split reductions into those that happen in the loop, and those that happen 1363 /// outside. In loop reductions are collected into InLoopReductionChains. 1364 void collectInLoopReductions(); 1365 1366 /// Returns true if we should use strict in-order reductions for the given 1367 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1368 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1369 /// of FP operations. 1370 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1371 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1372 } 1373 1374 /// \returns The smallest bitwidth each instruction can be represented with. 1375 /// The vector equivalents of these instructions should be truncated to this 1376 /// type. 1377 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1378 return MinBWs; 1379 } 1380 1381 /// \returns True if it is more profitable to scalarize instruction \p I for 1382 /// vectorization factor \p VF. 1383 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1384 assert(VF.isVector() && 1385 "Profitable to scalarize relevant only for VF > 1."); 1386 1387 // Cost model is not run in the VPlan-native path - return conservative 1388 // result until this changes. 1389 if (EnableVPlanNativePath) 1390 return false; 1391 1392 auto Scalars = InstsToScalarize.find(VF); 1393 assert(Scalars != InstsToScalarize.end() && 1394 "VF not yet analyzed for scalarization profitability"); 1395 return Scalars->second.find(I) != Scalars->second.end(); 1396 } 1397 1398 /// Returns true if \p I is known to be uniform after vectorization. 1399 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1400 if (VF.isScalar()) 1401 return true; 1402 1403 // Cost model is not run in the VPlan-native path - return conservative 1404 // result until this changes. 
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
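  /// A decision must already have been recorded for (\p I, \p VF) via one of
  /// the setWideningDecision overloads above. Rough usage sketch (illustrative
  /// only, 'CM' being some cost model instance):
  ///   CM.setWideningDecision(I, VF, CM_Widen, Cost);
  ///   InstructionCost C = CM.getWideningCost(I, VF);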
1493 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1494 assert(VF.isVector() && "Expected VF >=2"); 1495 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1496 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1497 "The cost is not calculated"); 1498 return WideningDecisions[InstOnVF].second; 1499 } 1500 1501 /// Return True if instruction \p I is an optimizable truncate whose operand 1502 /// is an induction variable. Such a truncate will be removed by adding a new 1503 /// induction variable with the destination type. 1504 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1505 // If the instruction is not a truncate, return false. 1506 auto *Trunc = dyn_cast<TruncInst>(I); 1507 if (!Trunc) 1508 return false; 1509 1510 // Get the source and destination types of the truncate. 1511 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1512 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1513 1514 // If the truncate is free for the given types, return false. Replacing a 1515 // free truncate with an induction variable would add an induction variable 1516 // update instruction to each iteration of the loop. We exclude from this 1517 // check the primary induction variable since it will need an update 1518 // instruction regardless. 1519 Value *Op = Trunc->getOperand(0); 1520 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1521 return false; 1522 1523 // If the truncated value is not an induction variable, return false. 1524 return Legal->isInductionPhi(Op); 1525 } 1526 1527 /// Collects the instructions to scalarize for each predicated instruction in 1528 /// the loop. 1529 void collectInstsToScalarize(ElementCount VF); 1530 1531 /// Collect Uniform and Scalar values for the given \p VF. 1532 /// The sets depend on CM decision for Load/Store instructions 1533 /// that may be vectorized as interleave, gather-scatter or scalarized. 1534 void collectUniformsAndScalars(ElementCount VF) { 1535 // Do the analysis once. 1536 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1537 return; 1538 setCostBasedWideningDecision(VF); 1539 collectLoopUniforms(VF); 1540 collectLoopScalars(VF); 1541 } 1542 1543 /// Returns true if the target machine supports masked store operation 1544 /// for the given \p DataType and kind of access to \p Ptr. 1545 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1546 return Legal->isConsecutivePtr(DataType, Ptr) && 1547 TTI.isLegalMaskedStore(DataType, Alignment); 1548 } 1549 1550 /// Returns true if the target machine supports masked load operation 1551 /// for the given \p DataType and kind of access to \p Ptr. 1552 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1553 return Legal->isConsecutivePtr(DataType, Ptr) && 1554 TTI.isLegalMaskedLoad(DataType, Alignment); 1555 } 1556 1557 /// Returns true if the target machine can represent \p V as a masked gather 1558 /// or scatter operation. 1559 bool isLegalGatherOrScatter(Value *V) { 1560 bool LI = isa<LoadInst>(V); 1561 bool SI = isa<StoreInst>(V); 1562 if (!LI && !SI) 1563 return false; 1564 auto *Ty = getLoadStoreType(V); 1565 Align Align = getLoadStoreAlignment(V); 1566 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1567 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1568 } 1569 1570 /// Returns true if the target machine supports all of the reduction 1571 /// variables found for the given VF. 
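  /// For example (illustrative), a strictly-ordered floating-point sum may be
  /// rejected here if TTI cannot lower such a reduction at \p VF; the check
  /// below defers entirely to TTI.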
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
    // When we know the load is uniform and the original scalar loop was not
    // predicated we don't need to mark it as a predicated instruction. Any
    // vectorized blocks created when tail-folding are something artificial we
    // have introduced and we know there is always at least one active lane.
    // That's why we call Legal->blockNeedsPredication here because it doesn't
    // query tail-folding.
    if (IsKnownUniform && isa<LoadInst>(I) &&
        !Legal->blockNeedsPredication(I->getParent()))
      return false;
    if (!blockNeedsPredicationForAnyReason(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it is not prohibited
  /// by optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available, or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509).
The issue is currently under investigation and this workaround 1720 /// will be removed as soon as possible. 1721 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1722 unsigned SmallestType, 1723 unsigned WidestType, 1724 const ElementCount &MaxSafeVF, 1725 bool FoldTailByMasking); 1726 1727 /// \return the maximum legal scalable VF, based on the safe max number 1728 /// of elements. 1729 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1730 1731 /// The vectorization cost is a combination of the cost itself and a boolean 1732 /// indicating whether any of the contributing operations will actually 1733 /// operate on vector values after type legalization in the backend. If this 1734 /// latter value is false, then all operations will be scalarized (i.e. no 1735 /// vectorization has actually taken place). 1736 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1737 1738 /// Returns the expected execution cost. The unit of the cost does 1739 /// not matter because we use the 'cost' units to compare different 1740 /// vector widths. The cost that is returned is *not* normalized by 1741 /// the factor width. If \p Invalid is not nullptr, this function 1742 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1743 /// each instruction that has an Invalid cost for the given VF. 1744 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1745 VectorizationCostTy 1746 expectedCost(ElementCount VF, 1747 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1748 1749 /// Returns the execution time cost of an instruction for a given vector 1750 /// width. Vector width of one means scalar. 1751 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1752 1753 /// The cost-computation logic from getInstructionCost which provides 1754 /// the vector type as an output parameter. 1755 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1756 Type *&VectorTy); 1757 1758 /// Return the cost of instructions in an inloop reduction pattern, if I is 1759 /// part of that pattern. 1760 Optional<InstructionCost> 1761 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1762 TTI::TargetCostKind CostKind); 1763 1764 /// Calculate vectorization cost of memory instruction \p I. 1765 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1766 1767 /// The cost computation for scalarized memory instruction. 1768 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1769 1770 /// The cost computation for interleaving group of memory instructions. 1771 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1772 1773 /// The cost computation for Gather/Scatter instruction. 1774 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1775 1776 /// The cost computation for widening instruction \p I with consecutive 1777 /// memory access. 1778 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1779 1780 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1781 /// Load: scalar load + broadcast. 1782 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1783 /// element) 1784 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1785 1786 /// Estimate the overhead of scalarizing an instruction. This is a 1787 /// convenience wrapper for the type-based getScalarizationOverhead API. 
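  /// Conceptually (an approximation, not an exact formula), this covers the
  /// cost of extracting the needed operand lanes from vectors and of inserting
  /// the VF scalar results back into a vector; the scalar operations
  /// themselves are costed separately.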
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst.
The instructions to 1856 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1857 /// non-negative return value implies the expression will be scalarized. 1858 /// Currently, only single-use chains are considered for scalarization. 1859 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1860 ElementCount VF); 1861 1862 /// Collect the instructions that are uniform after vectorization. An 1863 /// instruction is uniform if we represent it with a single scalar value in 1864 /// the vectorized loop corresponding to each vector iteration. Examples of 1865 /// uniform instructions include pointer operands of consecutive or 1866 /// interleaved memory accesses. Note that although uniformity implies an 1867 /// instruction will be scalar, the reverse is not true. In general, a 1868 /// scalarized instruction will be represented by VF scalar values in the 1869 /// vectorized loop, each corresponding to an iteration of the original 1870 /// scalar loop. 1871 void collectLoopUniforms(ElementCount VF); 1872 1873 /// Collect the instructions that are scalar after vectorization. An 1874 /// instruction is scalar if it is known to be uniform or will be scalarized 1875 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1876 /// to the list if they are used by a load/store instruction that is marked as 1877 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1878 /// VF values in the vectorized loop, each corresponding to an iteration of 1879 /// the original scalar loop. 1880 void collectLoopScalars(ElementCount VF); 1881 1882 /// Keeps cost model vectorization decision and cost for instructions. 1883 /// Right now it is used for memory instructions only. 1884 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1885 std::pair<InstWidening, InstructionCost>>; 1886 1887 DecisionList WideningDecisions; 1888 1889 /// Returns true if \p V is expected to be vectorized and it needs to be 1890 /// extracted. 1891 bool needsExtract(Value *V, ElementCount VF) const { 1892 Instruction *I = dyn_cast<Instruction>(V); 1893 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1894 TheLoop->isLoopInvariant(I)) 1895 return false; 1896 1897 // Assume we can vectorize V (and hence we need extraction) if the 1898 // scalars are not computed yet. This can happen, because it is called 1899 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1900 // the scalars are collected. That should be a safe assumption in most 1901 // cases, because we check if the operands have vectorizable types 1902 // beforehand in LoopVectorizationLegality. 1903 return Scalars.find(VF) == Scalars.end() || 1904 !isScalarAfterVectorization(I, VF); 1905 }; 1906 1907 /// Returns a range containing only operands needing to be extracted. 1908 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1909 ElementCount VF) const { 1910 return SmallVector<Value *, 4>(make_filter_range( 1911 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1912 } 1913 1914 /// Determines if we have the infrastructure to vectorize loop \p L and its 1915 /// epilogue, assuming the main loop is vectorized by \p VF. 1916 bool isCandidateForEpilogueVectorization(const Loop &L, 1917 const ElementCount VF) const; 1918 1919 /// Returns true if epilogue vectorization is considered profitable, and 1920 /// false otherwise. 1921 /// \p VF is the vectorization factor chosen for the original loop. 
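  /// (Illustratively, after a main loop vectorized at, say, VF=8, an epilogue
  /// loop vectorized at a smaller VF such as 2 can handle remainder iterations
  /// that would otherwise run fully scalar.)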
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm

/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost, and are un-linked from the existing IR. After deciding
/// to vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
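  /// A rough usage sketch (names illustrative, error handling omitted):
  ///   GeneratedRTChecks Checks(SE, DT, LI, DL);
  ///   Checks.Create(L, *LAI, PSE.getUnionPredicate());
  ///   // ...cost model decides to vectorize...
  ///   Checks.emitSCEVChecks(L, Bypass, VectorPH, ExitBlock);
  ///   Checks.emitMemRuntimeChecks(L, Bypass, VectorPH);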
2014 void Create(Loop *L, const LoopAccessInfo &LAI, 2015 const SCEVUnionPredicate &UnionPred) { 2016 2017 BasicBlock *LoopHeader = L->getHeader(); 2018 BasicBlock *Preheader = L->getLoopPreheader(); 2019 2020 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 2021 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 2022 // may be used by SCEVExpander. The blocks will be un-linked from their 2023 // predecessors and removed from LI & DT at the end of the function. 2024 if (!UnionPred.isAlwaysTrue()) { 2025 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 2026 nullptr, "vector.scevcheck"); 2027 2028 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 2029 &UnionPred, SCEVCheckBlock->getTerminator()); 2030 } 2031 2032 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2033 if (RtPtrChecking.Need) { 2034 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 2035 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2036 "vector.memcheck"); 2037 2038 MemRuntimeCheckCond = 2039 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 2040 RtPtrChecking.getChecks(), MemCheckExp); 2041 assert(MemRuntimeCheckCond && 2042 "no RT checks generated although RtPtrChecking " 2043 "claimed checks are required"); 2044 } 2045 2046 if (!MemCheckBlock && !SCEVCheckBlock) 2047 return; 2048 2049 // Unhook the temporary block with the checks, update various places 2050 // accordingly. 2051 if (SCEVCheckBlock) 2052 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2053 if (MemCheckBlock) 2054 MemCheckBlock->replaceAllUsesWith(Preheader); 2055 2056 if (SCEVCheckBlock) { 2057 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2058 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2059 Preheader->getTerminator()->eraseFromParent(); 2060 } 2061 if (MemCheckBlock) { 2062 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2063 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2064 Preheader->getTerminator()->eraseFromParent(); 2065 } 2066 2067 DT->changeImmediateDominator(LoopHeader, Preheader); 2068 if (MemCheckBlock) { 2069 DT->eraseNode(MemCheckBlock); 2070 LI->removeBlock(MemCheckBlock); 2071 } 2072 if (SCEVCheckBlock) { 2073 DT->eraseNode(SCEVCheckBlock); 2074 LI->removeBlock(SCEVCheckBlock); 2075 } 2076 } 2077 2078 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2079 /// unused. 2080 ~GeneratedRTChecks() { 2081 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2082 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2083 if (!SCEVCheckCond) 2084 SCEVCleaner.markResultUsed(); 2085 2086 if (!MemRuntimeCheckCond) 2087 MemCheckCleaner.markResultUsed(); 2088 2089 if (MemRuntimeCheckCond) { 2090 auto &SE = *MemCheckExp.getSE(); 2091 // Memory runtime check generation creates compares that use expanded 2092 // values. Remove them before running the SCEVExpanderCleaners. 
2093 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2094 if (MemCheckExp.isInsertedInstruction(&I)) 2095 continue; 2096 SE.forgetValue(&I); 2097 I.eraseFromParent(); 2098 } 2099 } 2100 MemCheckCleaner.cleanup(); 2101 SCEVCleaner.cleanup(); 2102 2103 if (SCEVCheckCond) 2104 SCEVCheckBlock->eraseFromParent(); 2105 if (MemRuntimeCheckCond) 2106 MemCheckBlock->eraseFromParent(); 2107 } 2108 2109 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2110 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2111 /// depending on the generated condition. 2112 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2113 BasicBlock *LoopVectorPreHeader, 2114 BasicBlock *LoopExitBlock) { 2115 if (!SCEVCheckCond) 2116 return nullptr; 2117 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2118 if (C->isZero()) 2119 return nullptr; 2120 2121 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2122 2123 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2124 // Create new preheader for vector loop. 2125 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2126 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2127 2128 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2129 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2130 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2131 SCEVCheckBlock); 2132 2133 DT->addNewBlock(SCEVCheckBlock, Pred); 2134 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2135 2136 ReplaceInstWithInst( 2137 SCEVCheckBlock->getTerminator(), 2138 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2139 // Mark the check as used, to prevent it from being removed during cleanup. 2140 SCEVCheckCond = nullptr; 2141 return SCEVCheckBlock; 2142 } 2143 2144 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2145 /// the branches to branch to the vector preheader or \p Bypass, depending on 2146 /// the generated condition. 2147 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2148 BasicBlock *LoopVectorPreHeader) { 2149 // Check if we generated code that checks in runtime if arrays overlap. 2150 if (!MemRuntimeCheckCond) 2151 return nullptr; 2152 2153 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2154 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2155 MemCheckBlock); 2156 2157 DT->addNewBlock(MemCheckBlock, Pred); 2158 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2159 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2160 2161 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2162 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2163 2164 ReplaceInstWithInst( 2165 MemCheckBlock->getTerminator(), 2166 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2167 MemCheckBlock->getTerminator()->setDebugLoc( 2168 Pred->getTerminator()->getDebugLoc()); 2169 2170 // Mark the check as used, to prevent it from being removed during cleanup. 2171 MemRuntimeCheckCond = nullptr; 2172 return MemCheckBlock; 2173 } 2174 }; 2175 2176 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2177 // vectorization. The loop needs to be annotated with #pragma omp simd 2178 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2179 // vector length information is not provided, vectorization is not considered 2180 // explicit. Interleave hints are not allowed either. These limitations will be 2181 // relaxed in the future. 
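// For illustration, an outer loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// or with '#pragma omp simd simdlen(4)' qualifies, whereas the same pragmas
// without an explicit width do not.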
2182 // Please, note that we are currently forced to abuse the pragma 'clang 2183 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2184 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2185 // provides *explicit vectorization hints* (LV can bypass legal checks and 2186 // assume that vectorization is legal). However, both hints are implemented 2187 // using the same metadata (llvm.loop.vectorize, processed by 2188 // LoopVectorizeHints). This will be fixed in the future when the native IR 2189 // representation for pragma 'omp simd' is introduced. 2190 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2191 OptimizationRemarkEmitter *ORE) { 2192 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2193 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2194 2195 // Only outer loops with an explicit vectorization hint are supported. 2196 // Unannotated outer loops are ignored. 2197 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2198 return false; 2199 2200 Function *Fn = OuterLp->getHeader()->getParent(); 2201 if (!Hints.allowVectorization(Fn, OuterLp, 2202 true /*VectorizeOnlyWhenForced*/)) { 2203 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2204 return false; 2205 } 2206 2207 if (Hints.getInterleave() > 1) { 2208 // TODO: Interleave support is future work. 2209 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2210 "outer loops.\n"); 2211 Hints.emitRemarkWithHints(); 2212 return false; 2213 } 2214 2215 return true; 2216 } 2217 2218 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2219 OptimizationRemarkEmitter *ORE, 2220 SmallVectorImpl<Loop *> &V) { 2221 // Collect inner loops and outer loops without irreducible control flow. For 2222 // now, only collect outer loops that have explicit vectorization hints. If we 2223 // are stress testing the VPlan H-CFG construction, we collect the outermost 2224 // loop of every loop nest. 2225 if (L.isInnermost() || VPlanBuildStressTest || 2226 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2227 LoopBlocksRPO RPOT(&L); 2228 RPOT.perform(LI); 2229 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2230 V.push_back(&L); 2231 // TODO: Collect inner loops inside marked outer loops in case 2232 // vectorization fails for the outer loop. Do not invoke 2233 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2234 // already known to be reducible. We can use an inherited attribute for 2235 // that. 2236 return; 2237 } 2238 } 2239 for (Loop *InnerL : L) 2240 collectSupportedLoops(*InnerL, LI, ORE, V); 2241 } 2242 2243 namespace { 2244 2245 /// The LoopVectorize Pass. 
2246 struct LoopVectorize : public FunctionPass { 2247 /// Pass identification, replacement for typeid 2248 static char ID; 2249 2250 LoopVectorizePass Impl; 2251 2252 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2253 bool VectorizeOnlyWhenForced = false) 2254 : FunctionPass(ID), 2255 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2256 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2257 } 2258 2259 bool runOnFunction(Function &F) override { 2260 if (skipFunction(F)) 2261 return false; 2262 2263 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2264 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2265 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2266 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2267 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2268 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2269 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2270 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2271 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2272 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2273 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2274 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2275 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2276 2277 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2278 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2279 2280 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2281 GetLAA, *ORE, PSI).MadeAnyChange; 2282 } 2283 2284 void getAnalysisUsage(AnalysisUsage &AU) const override { 2285 AU.addRequired<AssumptionCacheTracker>(); 2286 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2287 AU.addRequired<DominatorTreeWrapperPass>(); 2288 AU.addRequired<LoopInfoWrapperPass>(); 2289 AU.addRequired<ScalarEvolutionWrapperPass>(); 2290 AU.addRequired<TargetTransformInfoWrapperPass>(); 2291 AU.addRequired<AAResultsWrapperPass>(); 2292 AU.addRequired<LoopAccessLegacyAnalysis>(); 2293 AU.addRequired<DemandedBitsWrapperPass>(); 2294 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2295 AU.addRequired<InjectTLIMappingsLegacy>(); 2296 2297 // We currently do not preserve loopinfo/dominator analyses with outer loop 2298 // vectorization. Until this is addressed, mark these analyses as preserved 2299 // only for non-VPlan-native path. 2300 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2301 if (!EnableVPlanNativePath) { 2302 AU.addPreserved<LoopInfoWrapperPass>(); 2303 AU.addPreserved<DominatorTreeWrapperPass>(); 2304 } 2305 2306 AU.addPreserved<BasicAAWrapperPass>(); 2307 AU.addPreserved<GlobalsAAWrapperPass>(); 2308 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2309 } 2310 }; 2311 2312 } // end anonymous namespace 2313 2314 //===----------------------------------------------------------------------===// 2315 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2316 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2317 //===----------------------------------------------------------------------===// 2318 2319 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2320 // We need to place the broadcast of invariant variables outside the loop, 2321 // but only if it's proven safe to do so. Else, broadcast will be inside 2322 // vector loop body. 
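  // For illustration (fixed VF=4, i32 scalar %x; the exact IR may differ), the
  // splat created below is roughly of the form:
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> poison, <4 x i32> zeroinitializer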
2323 Instruction *Instr = dyn_cast<Instruction>(V); 2324 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2325 (!Instr || 2326 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2327 // Place the code for broadcasting invariant variables in the new preheader. 2328 IRBuilder<>::InsertPointGuard Guard(Builder); 2329 if (SafeToHoist) 2330 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2331 2332 // Broadcast the scalar into all locations in the vector. 2333 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2334 2335 return Shuf; 2336 } 2337 2338 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2339 const InductionDescriptor &II, Value *Step, Value *Start, 2340 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2341 IRBuilder<> &Builder = State.Builder; 2342 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2343 "Expected either an induction phi-node or a truncate of it!"); 2344 2345 // Construct the initial value of the vector IV in the vector loop preheader 2346 auto CurrIP = Builder.saveIP(); 2347 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2348 if (isa<TruncInst>(EntryVal)) { 2349 assert(Start->getType()->isIntegerTy() && 2350 "Truncation requires an integer type"); 2351 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2352 Step = Builder.CreateTrunc(Step, TruncType); 2353 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2354 } 2355 2356 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2357 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2358 Value *SteppedStart = 2359 getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); 2360 2361 // We create vector phi nodes for both integer and floating-point induction 2362 // variables. Here, we determine the kind of arithmetic we will perform. 2363 Instruction::BinaryOps AddOp; 2364 Instruction::BinaryOps MulOp; 2365 if (Step->getType()->isIntegerTy()) { 2366 AddOp = Instruction::Add; 2367 MulOp = Instruction::Mul; 2368 } else { 2369 AddOp = II.getInductionOpcode(); 2370 MulOp = Instruction::FMul; 2371 } 2372 2373 // Multiply the vectorization factor by the step using integer or 2374 // floating-point arithmetic as appropriate. 2375 Type *StepType = Step->getType(); 2376 Value *RuntimeVF; 2377 if (Step->getType()->isFloatingPointTy()) 2378 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2379 else 2380 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2381 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2382 2383 // Create a vector splat to use in the induction update. 2384 // 2385 // FIXME: If the step is non-constant, we create the vector splat with 2386 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2387 // handle a constant vector splat. 2388 Value *SplatVF = isa<Constant>(Mul) 2389 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2390 : Builder.CreateVectorSplat(State.VF, Mul); 2391 Builder.restoreIP(CurrIP); 2392 2393 // We may need to add the step a number of times, depending on the unroll 2394 // factor. The last of those goes into the PHI. 
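  // For example (illustrative), with UF=2 the loop below leaves part 0 as the
  // 'vec.ind' phi itself, part 1 as one 'step.add', and a final 'step.add'
  // (renamed 'vec.ind.next') that feeds back into the phi.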
2395 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2396 &*LoopVectorBody->getFirstInsertionPt()); 2397 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2398 Instruction *LastInduction = VecInd; 2399 for (unsigned Part = 0; Part < UF; ++Part) { 2400 State.set(Def, LastInduction, Part); 2401 2402 if (isa<TruncInst>(EntryVal)) 2403 addMetadata(LastInduction, EntryVal); 2404 2405 LastInduction = cast<Instruction>( 2406 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2407 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2408 } 2409 2410 // Move the last step to the end of the latch block. This ensures consistent 2411 // placement of all induction updates. 2412 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2413 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2414 auto *ICmp = cast<Instruction>(Br->getCondition()); 2415 LastInduction->moveBefore(ICmp); 2416 LastInduction->setName("vec.ind.next"); 2417 2418 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2419 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2420 } 2421 2422 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2423 return Cost->isScalarAfterVectorization(I, VF) || 2424 Cost->isProfitableToScalarize(I, VF); 2425 } 2426 2427 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2428 if (shouldScalarizeInstruction(IV)) 2429 return true; 2430 auto isScalarInst = [&](User *U) -> bool { 2431 auto *I = cast<Instruction>(U); 2432 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2433 }; 2434 return llvm::any_of(IV->users(), isScalarInst); 2435 } 2436 2437 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, 2438 const InductionDescriptor &ID, 2439 Value *Start, TruncInst *Trunc, 2440 VPValue *Def, 2441 VPTransformState &State) { 2442 IRBuilder<> &Builder = State.Builder; 2443 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2444 "Primary induction variable must have an integer type"); 2445 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2446 2447 // The value from the original loop to which we are mapping the new induction 2448 // variable. 2449 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2450 2451 auto &DL = EntryVal->getModule()->getDataLayout(); 2452 2453 // Generate code for the induction step. Note that induction steps are 2454 // required to be loop-invariant 2455 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2456 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2457 "Induction step should be loop invariant"); 2458 if (PSE.getSE()->isSCEVable(IV->getType())) { 2459 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2460 return Exp.expandCodeFor(Step, Step->getType(), 2461 State.CFG.VectorPreHeader->getTerminator()); 2462 } 2463 return cast<SCEVUnknown>(Step)->getValue(); 2464 }; 2465 2466 // The scalar value to broadcast. This is derived from the canonical 2467 // induction variable. If a truncation type is given, truncate the canonical 2468 // induction variable and step. Otherwise, derive these values from the 2469 // induction descriptor. 2470 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2471 Value *ScalarIV = Induction; 2472 if (IV != OldInduction) { 2473 ScalarIV = IV->getType()->isIntegerTy() 2474 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2475 : Builder.CreateCast(Instruction::SIToFP, Induction, 2476 IV->getType()); 2477 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, 2478 State.CFG.PrevBB); 2479 ScalarIV->setName("offset.idx"); 2480 } 2481 if (Trunc) { 2482 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2483 assert(Step->getType()->isIntegerTy() && 2484 "Truncation requires an integer step"); 2485 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2486 Step = Builder.CreateTrunc(Step, TruncType); 2487 } 2488 return ScalarIV; 2489 }; 2490 2491 // Create the vector values from the scalar IV, in the absence of creating a 2492 // vector IV. 2493 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2494 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2495 for (unsigned Part = 0; Part < UF; ++Part) { 2496 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 2497 Value *StartIdx; 2498 if (Step->getType()->isFloatingPointTy()) 2499 StartIdx = 2500 getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); 2501 else 2502 StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); 2503 2504 Value *EntryPart = 2505 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); 2506 State.set(Def, EntryPart, Part); 2507 if (Trunc) 2508 addMetadata(EntryPart, Trunc); 2509 } 2510 }; 2511 2512 // Fast-math-flags propagate from the original induction instruction. 2513 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2514 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2515 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2516 2517 // Now do the actual transformations, and start with creating the step value. 2518 Value *Step = CreateStepValue(ID.getStep()); 2519 if (State.VF.isZero() || State.VF.isScalar()) { 2520 Value *ScalarIV = CreateScalarIV(Step); 2521 CreateSplatIV(ScalarIV, Step); 2522 return; 2523 } 2524 2525 // Determine if we want a scalar version of the induction variable. This is 2526 // true if the induction variable itself is not widened, or if it has at 2527 // least one user in the loop that is not widened. 2528 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2529 if (!NeedsScalarIV) { 2530 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2531 return; 2532 } 2533 2534 // Try to create a new independent vector induction variable. If we can't 2535 // create the phi node, we will splat the scalar induction variable in each 2536 // loop iteration. 2537 if (!shouldScalarizeInstruction(EntryVal)) { 2538 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2539 Value *ScalarIV = CreateScalarIV(Step); 2540 // Create scalar steps that can be used by instructions we will later 2541 // scalarize. Note that the addition of the scalar steps will not increase 2542 // the number of instructions in the loop in the common case prior to 2543 // InstCombine. We will be trading one vector extract for each scalar step. 2544 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2545 return; 2546 } 2547 2548 // All IV users are scalar instructions, so only emit a scalar IV, not a 2549 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2550 // predicate used by the masked loads/stores. 
2551 Value *ScalarIV = CreateScalarIV(Step); 2552 if (!Cost->isScalarEpilogueAllowed()) 2553 CreateSplatIV(ScalarIV, Step); 2554 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2555 } 2556 2557 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, 2558 Value *Step, 2559 Instruction::BinaryOps BinOp) { 2560 // Create and check the types. 2561 auto *ValVTy = cast<VectorType>(Val->getType()); 2562 ElementCount VLen = ValVTy->getElementCount(); 2563 2564 Type *STy = Val->getType()->getScalarType(); 2565 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2566 "Induction Step must be an integer or FP"); 2567 assert(Step->getType() == STy && "Step has wrong type"); 2568 2569 SmallVector<Constant *, 8> Indices; 2570 2571 // Create a vector of consecutive numbers from zero to VF. 2572 VectorType *InitVecValVTy = ValVTy; 2573 Type *InitVecValSTy = STy; 2574 if (STy->isFloatingPointTy()) { 2575 InitVecValSTy = 2576 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2577 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2578 } 2579 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2580 2581 // Splat the StartIdx 2582 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2583 2584 if (STy->isIntegerTy()) { 2585 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2586 Step = Builder.CreateVectorSplat(VLen, Step); 2587 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2588 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2589 // which can be found from the original scalar operations. 2590 Step = Builder.CreateMul(InitVec, Step); 2591 return Builder.CreateAdd(Val, Step, "induction"); 2592 } 2593 2594 // Floating point induction. 2595 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2596 "Binary Opcode should be specified for FP induction"); 2597 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2598 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2599 2600 Step = Builder.CreateVectorSplat(VLen, Step); 2601 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2602 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2603 } 2604 2605 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2606 Instruction *EntryVal, 2607 const InductionDescriptor &ID, 2608 VPValue *Def, 2609 VPTransformState &State) { 2610 IRBuilder<> &Builder = State.Builder; 2611 // We shouldn't have to build scalar steps if we aren't vectorizing. 2612 assert(State.VF.isVector() && "VF should be greater than one"); 2613 // Get the value type and ensure it and the step have the same integer type. 2614 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2615 assert(ScalarIVTy == Step->getType() && 2616 "Val and Step should have the same type"); 2617 2618 // We build scalar steps for both integer and floating-point induction 2619 // variables. Here, we determine the kind of arithmetic we will perform. 2620 Instruction::BinaryOps AddOp; 2621 Instruction::BinaryOps MulOp; 2622 if (ScalarIVTy->isIntegerTy()) { 2623 AddOp = Instruction::Add; 2624 MulOp = Instruction::Mul; 2625 } else { 2626 AddOp = ID.getInductionOpcode(); 2627 MulOp = Instruction::FMul; 2628 } 2629 2630 // Determine the number of scalars we need to generate for each unroll 2631 // iteration. If EntryVal is uniform, we only need to generate the first 2632 // lane. Otherwise, we generate all VF values. 
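  // (Illustratively, at fixed VF=4 a uniform value gets only lane 0 per unroll
  // part, whereas a non-uniform scalarized value gets all 4 lanes per part.)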
2633 bool IsUniform = 2634 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); 2635 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 2636 // Compute the scalar steps and save the results in State. 2637 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2638 ScalarIVTy->getScalarSizeInBits()); 2639 Type *VecIVTy = nullptr; 2640 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2641 if (!IsUniform && State.VF.isScalable()) { 2642 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2643 UnitStepVec = 2644 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2645 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2646 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2647 } 2648 2649 for (unsigned Part = 0; Part < State.UF; ++Part) { 2650 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2651 2652 if (!IsUniform && State.VF.isScalable()) { 2653 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2654 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2655 if (ScalarIVTy->isFloatingPointTy()) 2656 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2657 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2658 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2659 State.set(Def, Add, Part); 2660 // It's useful to record the lane values too for the known minimum number 2661 // of elements so we do those below. This improves the code quality when 2662 // trying to extract the first element, for example. 2663 } 2664 2665 if (ScalarIVTy->isFloatingPointTy()) 2666 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2667 2668 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2669 Value *StartIdx = Builder.CreateBinOp( 2670 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2671 // The step returned by `createStepForVF` is a runtime-evaluated value 2672 // when VF is scalable. Otherwise, it should be folded into a Constant. 2673 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2674 "Expected StartIdx to be folded to a constant when VF is not " 2675 "scalable"); 2676 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2677 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2678 State.set(Def, Add, VPIteration(Part, Lane)); 2679 } 2680 } 2681 } 2682 2683 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2684 const VPIteration &Instance, 2685 VPTransformState &State) { 2686 Value *ScalarInst = State.get(Def, Instance); 2687 Value *VectorValue = State.get(Def, Instance.Part); 2688 VectorValue = Builder.CreateInsertElement( 2689 VectorValue, ScalarInst, 2690 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2691 State.set(Def, VectorValue, Instance.Part); 2692 } 2693 2694 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2695 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2696 return Builder.CreateVectorReverse(Vec, "reverse"); 2697 } 2698 2699 // Return whether we allow using masked interleave-groups (for dealing with 2700 // strided loads/stores that reside in predicated blocks, or for dealing 2701 // with gaps). 2702 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2703 // If an override option has been passed in for interleaved accesses, use it. 
2704 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2705 return EnableMaskedInterleavedMemAccesses; 2706 2707 return TTI.enableMaskedInterleavedAccessVectorization(); 2708 } 2709 2710 // Try to vectorize the interleave group that \p Instr belongs to. 2711 // 2712 // E.g. Translate following interleaved load group (factor = 3): 2713 // for (i = 0; i < N; i+=3) { 2714 // R = Pic[i]; // Member of index 0 2715 // G = Pic[i+1]; // Member of index 1 2716 // B = Pic[i+2]; // Member of index 2 2717 // ... // do something to R, G, B 2718 // } 2719 // To: 2720 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2721 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2722 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2723 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2724 // 2725 // Or translate following interleaved store group (factor = 3): 2726 // for (i = 0; i < N; i+=3) { 2727 // ... do something to R, G, B 2728 // Pic[i] = R; // Member of index 0 2729 // Pic[i+1] = G; // Member of index 1 2730 // Pic[i+2] = B; // Member of index 2 2731 // } 2732 // To: 2733 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2734 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2735 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2736 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2737 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2738 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2739 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2740 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2741 VPValue *BlockInMask) { 2742 Instruction *Instr = Group->getInsertPos(); 2743 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2744 2745 // Prepare for the vector type of the interleaved load/store. 2746 Type *ScalarTy = getLoadStoreType(Instr); 2747 unsigned InterleaveFactor = Group->getFactor(); 2748 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2749 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2750 2751 // Prepare for the new pointers. 2752 SmallVector<Value *, 2> AddrParts; 2753 unsigned Index = Group->getIndex(Instr); 2754 2755 // TODO: extend the masked interleaved-group support to reversed access. 2756 assert((!BlockInMask || !Group->isReverse()) && 2757 "Reversed masked interleave-group not supported."); 2758 2759 // If the group is reverse, adjust the index to refer to the last vector lane 2760 // instead of the first. We adjust the index from the first vector lane, 2761 // rather than directly getting the pointer for lane VF - 1, because the 2762 // pointer operand of the interleaved access is supposed to be uniform. For 2763 // uniform instructions, we're only required to generate a value for the 2764 // first vector lane in each unroll iteration. 2765 if (Group->isReverse()) 2766 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2767 2768 for (unsigned Part = 0; Part < UF; Part++) { 2769 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2770 setDebugLocFromInst(AddrPart); 2771 2772 // Notice current instruction could be any index. Need to adjust the address 2773 // to the member of index 0. 2774 // 2775 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2776 // b = A[i]; // Member of index 0 2777 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2778 // 2779 // E.g. 
A[i+1] = a; // Member of index 1 2780 // A[i] = b; // Member of index 0 2781 // A[i+2] = c; // Member of index 2 (Current instruction) 2782 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2783 2784 bool InBounds = false; 2785 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2786 InBounds = gep->isInBounds(); 2787 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2788 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2789 2790 // Cast to the vector pointer type. 2791 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2792 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2793 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2794 } 2795 2796 setDebugLocFromInst(Instr); 2797 Value *PoisonVec = PoisonValue::get(VecTy); 2798 2799 Value *MaskForGaps = nullptr; 2800 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2801 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2802 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2803 } 2804 2805 // Vectorize the interleaved load group. 2806 if (isa<LoadInst>(Instr)) { 2807 // For each unroll part, create a wide load for the group. 2808 SmallVector<Value *, 2> NewLoads; 2809 for (unsigned Part = 0; Part < UF; Part++) { 2810 Instruction *NewLoad; 2811 if (BlockInMask || MaskForGaps) { 2812 assert(useMaskedInterleavedAccesses(*TTI) && 2813 "masked interleaved groups are not allowed."); 2814 Value *GroupMask = MaskForGaps; 2815 if (BlockInMask) { 2816 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2817 Value *ShuffledMask = Builder.CreateShuffleVector( 2818 BlockInMaskPart, 2819 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2820 "interleaved.mask"); 2821 GroupMask = MaskForGaps 2822 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2823 MaskForGaps) 2824 : ShuffledMask; 2825 } 2826 NewLoad = 2827 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2828 GroupMask, PoisonVec, "wide.masked.vec"); 2829 } 2830 else 2831 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2832 Group->getAlign(), "wide.vec"); 2833 Group->addMetadata(NewLoad); 2834 NewLoads.push_back(NewLoad); 2835 } 2836 2837 // For each member in the group, shuffle out the appropriate data from the 2838 // wide loads. 2839 unsigned J = 0; 2840 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2841 Instruction *Member = Group->getMember(I); 2842 2843 // Skip the gaps in the group. 2844 if (!Member) 2845 continue; 2846 2847 auto StrideMask = 2848 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2849 for (unsigned Part = 0; Part < UF; Part++) { 2850 Value *StridedVec = Builder.CreateShuffleVector( 2851 NewLoads[Part], StrideMask, "strided.vec"); 2852 2853 // If this member has different type, cast the result type. 2854 if (Member->getType() != ScalarTy) { 2855 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2856 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2857 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2858 } 2859 2860 if (Group->isReverse()) 2861 StridedVec = reverseVector(StridedVec); 2862 2863 State.set(VPDefs[J], StridedVec, Part); 2864 } 2865 ++J; 2866 } 2867 return; 2868 } 2869 2870 // The sub vector type for current instruction. 2871 auto *SubVT = VectorType::get(ScalarTy, VF); 2872 2873 // Vectorize the interleaved store group. 
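  // The store path mirrors the load path above: the per-member vectors are
  // concatenated into one wide vector, a single interleaving shuffle puts the
  // elements into memory order, and one wide (possibly masked) store writes
  // the whole group. Members that are gaps contribute poison lanes, and the
  // gap mask created below keeps those lanes from being written.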
2874 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2875 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2876 "masked interleaved groups are not allowed."); 2877 assert((!MaskForGaps || !VF.isScalable()) && 2878 "masking gaps for scalable vectors is not yet supported."); 2879 for (unsigned Part = 0; Part < UF; Part++) { 2880 // Collect the stored vector from each member. 2881 SmallVector<Value *, 4> StoredVecs; 2882 for (unsigned i = 0; i < InterleaveFactor; i++) { 2883 assert((Group->getMember(i) || MaskForGaps) && 2884 "Fail to get a member from an interleaved store group"); 2885 Instruction *Member = Group->getMember(i); 2886 2887 // Skip the gaps in the group. 2888 if (!Member) { 2889 Value *Undef = PoisonValue::get(SubVT); 2890 StoredVecs.push_back(Undef); 2891 continue; 2892 } 2893 2894 Value *StoredVec = State.get(StoredValues[i], Part); 2895 2896 if (Group->isReverse()) 2897 StoredVec = reverseVector(StoredVec); 2898 2899 // If this member has different type, cast it to a unified type. 2900 2901 if (StoredVec->getType() != SubVT) 2902 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2903 2904 StoredVecs.push_back(StoredVec); 2905 } 2906 2907 // Concatenate all vectors into a wide vector. 2908 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2909 2910 // Interleave the elements in the wide vector. 2911 Value *IVec = Builder.CreateShuffleVector( 2912 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2913 "interleaved.vec"); 2914 2915 Instruction *NewStoreInstr; 2916 if (BlockInMask || MaskForGaps) { 2917 Value *GroupMask = MaskForGaps; 2918 if (BlockInMask) { 2919 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2920 Value *ShuffledMask = Builder.CreateShuffleVector( 2921 BlockInMaskPart, 2922 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2923 "interleaved.mask"); 2924 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2925 ShuffledMask, MaskForGaps) 2926 : ShuffledMask; 2927 } 2928 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2929 Group->getAlign(), GroupMask); 2930 } else 2931 NewStoreInstr = 2932 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2933 2934 Group->addMetadata(NewStoreInstr); 2935 } 2936 } 2937 2938 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2939 VPReplicateRecipe *RepRecipe, 2940 const VPIteration &Instance, 2941 bool IfPredicateInstr, 2942 VPTransformState &State) { 2943 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2944 2945 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2946 // the first lane and part. 2947 if (isa<NoAliasScopeDeclInst>(Instr)) 2948 if (!Instance.isFirstIteration()) 2949 return; 2950 2951 setDebugLocFromInst(Instr); 2952 2953 // Does this instruction return a value ? 2954 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2955 2956 Instruction *Cloned = Instr->clone(); 2957 if (!IsVoidRetTy) 2958 Cloned->setName(Instr->getName() + ".cloned"); 2959 2960 // If the scalarized instruction contributes to the address computation of a 2961 // widen masked load/store which was in a basic block that needed predication 2962 // and is not predicated after vectorization, we can't propagate 2963 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2964 // instruction could feed a poison value to the base address of the widen 2965 // load/store. 
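  // For instance (illustrative IR), a clone of
  //   %gep = getelementptr inbounds float, float* %base, i64 %i
  // that originally sat under a condition which is now expressed as a mask
  // must drop 'inbounds': executed unconditionally with an out-of-range %i it
  // could otherwise yield poison that reaches the address of a masked-off
  // access.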
2966 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2967 Cloned->dropPoisonGeneratingFlags(); 2968 2969 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2970 Builder.GetInsertPoint()); 2971 // Replace the operands of the cloned instructions with their scalar 2972 // equivalents in the new loop. 2973 for (auto &I : enumerate(RepRecipe->operands())) { 2974 auto InputInstance = Instance; 2975 VPValue *Operand = I.value(); 2976 if (State.Plan->isUniformAfterVectorization(Operand)) 2977 InputInstance.Lane = VPLane::getFirstLane(); 2978 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2979 } 2980 addNewMetadata(Cloned, Instr); 2981 2982 // Place the cloned scalar in the new loop. 2983 Builder.Insert(Cloned); 2984 2985 State.set(RepRecipe, Cloned, Instance); 2986 2987 // If we just cloned a new assumption, add it the assumption cache. 2988 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2989 AC->registerAssumption(II); 2990 2991 // End if-block. 2992 if (IfPredicateInstr) 2993 PredicatedInstructions.push_back(Cloned); 2994 } 2995 2996 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2997 Value *End, Value *Step, 2998 Instruction *DL) { 2999 BasicBlock *Header = L->getHeader(); 3000 BasicBlock *Latch = L->getLoopLatch(); 3001 // As we're just creating this loop, it's possible no latch exists 3002 // yet. If so, use the header as this will be a single block loop. 3003 if (!Latch) 3004 Latch = Header; 3005 3006 IRBuilder<> B(&*Header->getFirstInsertionPt()); 3007 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3008 setDebugLocFromInst(OldInst, &B); 3009 auto *Induction = B.CreatePHI(Start->getType(), 2, "index"); 3010 3011 B.SetInsertPoint(Latch->getTerminator()); 3012 setDebugLocFromInst(OldInst, &B); 3013 3014 // Create i+1 and fill the PHINode. 3015 // 3016 // If the tail is not folded, we know that End - Start >= Step (either 3017 // statically or through the minimum iteration checks). We also know that both 3018 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + 3019 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned 3020 // overflows and we can mark the induction increment as NUW. 3021 Value *Next = B.CreateAdd(Induction, Step, "index.next", 3022 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false); 3023 Induction->addIncoming(Start, L->getLoopPreheader()); 3024 Induction->addIncoming(Next, Latch); 3025 // Create the compare. 3026 Value *ICmp = B.CreateICmpEQ(Next, End); 3027 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3028 3029 // Now we have two terminators. Remove the old one from the block. 3030 Latch->getTerminator()->eraseFromParent(); 3031 3032 return Induction; 3033 } 3034 3035 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3036 if (TripCount) 3037 return TripCount; 3038 3039 assert(L && "Create Trip Count for null loop."); 3040 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3041 // Find the loop boundaries. 3042 ScalarEvolution *SE = PSE.getSE(); 3043 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3044 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3045 "Invalid loop count"); 3046 3047 Type *IdxTy = Legal->getWidestInductionType(); 3048 assert(IdxTy && "No type for induction"); 3049 3050 // The exit count might have the type of i64 while the phi is i32. This can 3051 // happen if we have an induction variable that is sign extended before the 3052 // compare. 
The only way that we get a backedge taken count is that the 3053 // induction variable was signed and as such will not overflow. In such a case 3054 // truncation is legal. 3055 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3056 IdxTy->getPrimitiveSizeInBits()) 3057 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3058 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3059 3060 // Get the total trip count from the count by adding 1. 3061 const SCEV *ExitCount = SE->getAddExpr( 3062 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3063 3064 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3065 3066 // Expand the trip count and place the new instructions in the preheader. 3067 // Notice that the pre-header does not change, only the loop body. 3068 SCEVExpander Exp(*SE, DL, "induction"); 3069 3070 // Count holds the overall loop count (N). 3071 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3072 L->getLoopPreheader()->getTerminator()); 3073 3074 if (TripCount->getType()->isPointerTy()) 3075 TripCount = 3076 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3077 L->getLoopPreheader()->getTerminator()); 3078 3079 return TripCount; 3080 } 3081 3082 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3083 if (VectorTripCount) 3084 return VectorTripCount; 3085 3086 Value *TC = getOrCreateTripCount(L); 3087 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3088 3089 Type *Ty = TC->getType(); 3090 // This is where we can make the step a runtime constant. 3091 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3092 3093 // If the tail is to be folded by masking, round the number of iterations N 3094 // up to a multiple of Step instead of rounding down. This is done by first 3095 // adding Step-1 and then rounding down. Note that it's ok if this addition 3096 // overflows: the vector induction variable will eventually wrap to zero given 3097 // that it starts at zero and its Step is a power of two; the loop will then 3098 // exit, with the last early-exit vector comparison also producing all-true. 3099 if (Cost->foldTailByMasking()) { 3100 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3101 "VF*UF must be a power of 2 when folding tail by masking"); 3102 assert(!VF.isScalable() && 3103 "Tail folding not yet supported for scalable vectors"); 3104 TC = Builder.CreateAdd( 3105 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3106 } 3107 3108 // Now we need to generate the expression for the part of the loop that the 3109 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3110 // iterations are not required for correctness, or N - Step, otherwise. Step 3111 // is equal to the vectorization factor (number of SIMD elements) times the 3112 // unroll factor (number of SIMD instructions). 3113 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3114 3115 // There are cases where we *must* run at least one iteration in the remainder 3116 // loop. See the cost model for when this can happen. If the step evenly 3117 // divides the trip count, we set the remainder to be equal to the step. If 3118 // the step does not evenly divide the trip count, no adjustment is necessary 3119 // since there will already be scalar iterations. Note that the minimum 3120 // iterations check ensures that N >= Step. 
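  // Illustrative numbers (not taken from the code): with Step = VF * UF = 8,
  //   N = 20 -> R = 20 % 8 = 4, vector trip count = 16, 4 scalar iterations remain;
  //   N = 16 -> R = 0; if a scalar epilogue is required, R is bumped to Step,
  //            giving a vector trip count of 8 so the last 8 iterations run scalar.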
3121 if (Cost->requiresScalarEpilogue(VF)) { 3122 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3123 R = Builder.CreateSelect(IsZero, Step, R); 3124 } 3125 3126 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3127 3128 return VectorTripCount; 3129 } 3130 3131 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3132 const DataLayout &DL) { 3133 // Verify that V is a vector type with same number of elements as DstVTy. 3134 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3135 unsigned VF = DstFVTy->getNumElements(); 3136 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3137 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3138 Type *SrcElemTy = SrcVecTy->getElementType(); 3139 Type *DstElemTy = DstFVTy->getElementType(); 3140 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3141 "Vector elements must have same size"); 3142 3143 // Do a direct cast if element types are castable. 3144 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3145 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3146 } 3147 // V cannot be directly casted to desired vector type. 3148 // May happen when V is a floating point vector but DstVTy is a vector of 3149 // pointers or vice-versa. Handle this using a two-step bitcast using an 3150 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3151 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3152 "Only one type should be a pointer type"); 3153 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3154 "Only one type should be a floating point type"); 3155 Type *IntTy = 3156 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3157 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3158 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3159 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3160 } 3161 3162 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3163 BasicBlock *Bypass) { 3164 Value *Count = getOrCreateTripCount(L); 3165 // Reuse existing vector loop preheader for TC checks. 3166 // Note that new preheader block is generated for vector loop. 3167 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3168 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3169 3170 // Generate code to check if the loop's trip count is less than VF * UF, or 3171 // equal to it in case a scalar epilogue is required; this implies that the 3172 // vector trip count is zero. This check also covers the case where adding one 3173 // to the backedge-taken count overflowed leading to an incorrect trip count 3174 // of zero. In this case we will also jump to the scalar loop. 3175 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3176 : ICmpInst::ICMP_ULT; 3177 3178 // If tail is to be folded, vector loop takes care of all iterations. 3179 Value *CheckMinIters = Builder.getFalse(); 3180 if (!Cost->foldTailByMasking()) { 3181 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3182 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3183 } 3184 // Create new preheader for vector loop. 
3185 LoopVectorPreHeader = 3186 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3187 "vector.ph"); 3188 3189 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3190 DT->getNode(Bypass)->getIDom()) && 3191 "TC check is expected to dominate Bypass"); 3192 3193 // Update dominator for Bypass & LoopExit (if needed). 3194 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3195 if (!Cost->requiresScalarEpilogue(VF)) 3196 // If there is an epilogue which must run, there's no edge from the 3197 // middle block to exit blocks and thus no need to update the immediate 3198 // dominator of the exit blocks. 3199 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3200 3201 ReplaceInstWithInst( 3202 TCCheckBlock->getTerminator(), 3203 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3204 LoopBypassBlocks.push_back(TCCheckBlock); 3205 } 3206 3207 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3208 3209 BasicBlock *const SCEVCheckBlock = 3210 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3211 if (!SCEVCheckBlock) 3212 return nullptr; 3213 3214 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3215 (OptForSizeBasedOnProfile && 3216 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3217 "Cannot SCEV check stride or overflow when optimizing for size"); 3218 3219 3220 // Update dominator only if this is first RT check. 3221 if (LoopBypassBlocks.empty()) { 3222 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3223 if (!Cost->requiresScalarEpilogue(VF)) 3224 // If there is an epilogue which must run, there's no edge from the 3225 // middle block to exit blocks and thus no need to update the immediate 3226 // dominator of the exit blocks. 3227 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3228 } 3229 3230 LoopBypassBlocks.push_back(SCEVCheckBlock); 3231 AddedSafetyChecks = true; 3232 return SCEVCheckBlock; 3233 } 3234 3235 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3236 BasicBlock *Bypass) { 3237 // VPlan-native path does not do any analysis for runtime checks currently. 3238 if (EnableVPlanNativePath) 3239 return nullptr; 3240 3241 BasicBlock *const MemCheckBlock = 3242 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3243 3244 // Check if we generated code that checks in runtime if arrays overlap. We put 3245 // the checks into a separate block to make the more common case of few 3246 // elements faster. 3247 if (!MemCheckBlock) 3248 return nullptr; 3249 3250 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3251 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3252 "Cannot emit memory checks when optimizing for size, unless forced " 3253 "to vectorize."); 3254 ORE->emit([&]() { 3255 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3256 L->getStartLoc(), L->getHeader()) 3257 << "Code-size may be reduced by not forcing " 3258 "vectorization, or by source-code modifications " 3259 "eliminating the need for runtime checks " 3260 "(e.g., adding 'restrict')."; 3261 }); 3262 } 3263 3264 LoopBypassBlocks.push_back(MemCheckBlock); 3265 3266 AddedSafetyChecks = true; 3267 3268 // We currently don't use LoopVersioning for the actual loop cloning but we 3269 // still use it to add the noalias metadata. 
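  // Roughly speaking, the noalias metadata added below lets later passes
  // assume that, on the vector path guarded by these runtime checks, accesses
  // belonging to different runtime-check groups do not alias each other.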
3270 LVer = std::make_unique<LoopVersioning>( 3271 *Legal->getLAI(), 3272 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3273 DT, PSE.getSE()); 3274 LVer->prepareNoAliasMetadata(); 3275 return MemCheckBlock; 3276 } 3277 3278 Value *InnerLoopVectorizer::emitTransformedIndex( 3279 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3280 const InductionDescriptor &ID, BasicBlock *VectorHeader) const { 3281 3282 SCEVExpander Exp(*SE, DL, "induction"); 3283 auto Step = ID.getStep(); 3284 auto StartValue = ID.getStartValue(); 3285 assert(Index->getType()->getScalarType() == Step->getType() && 3286 "Index scalar type does not match StepValue type"); 3287 3288 // Note: the IR at this point is broken. We cannot use SE to create any new 3289 // SCEV and then expand it, hoping that SCEV's simplification will give us 3290 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3291 // lead to various SCEV crashes. So all we can do is to use builder and rely 3292 // on InstCombine for future simplifications. Here we handle some trivial 3293 // cases only. 3294 auto CreateAdd = [&B](Value *X, Value *Y) { 3295 assert(X->getType() == Y->getType() && "Types don't match!"); 3296 if (auto *CX = dyn_cast<ConstantInt>(X)) 3297 if (CX->isZero()) 3298 return Y; 3299 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3300 if (CY->isZero()) 3301 return X; 3302 return B.CreateAdd(X, Y); 3303 }; 3304 3305 // We allow X to be a vector type, in which case Y will potentially be 3306 // splatted into a vector with the same element count. 3307 auto CreateMul = [&B](Value *X, Value *Y) { 3308 assert(X->getType()->getScalarType() == Y->getType() && 3309 "Types don't match!"); 3310 if (auto *CX = dyn_cast<ConstantInt>(X)) 3311 if (CX->isOne()) 3312 return Y; 3313 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3314 if (CY->isOne()) 3315 return X; 3316 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3317 if (XVTy && !isa<VectorType>(Y->getType())) 3318 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3319 return B.CreateMul(X, Y); 3320 }; 3321 3322 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3323 // loop, choose the end of the vector loop header (=VectorHeader), because 3324 // the DomTree is not kept up-to-date for additional blocks generated in the 3325 // vector loop. By using the header as insertion point, we guarantee that the 3326 // expanded instructions dominate all their uses. 
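  // Conceptually, the value computed below is (names as in InductionDescriptor):
  //   integer induction:  StartValue + Index * Step
  //   pointer induction:  getelementptr ElementType, StartValue, Index * Step
  //   FP induction:       StartValue fadd/fsub (Index fmul Step)
  // For example (illustrative numbers), an integer induction starting at 3
  // with step 2 maps Index = 5 to 3 + 5 * 2 = 13.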
3327 auto GetInsertPoint = [this, &B, VectorHeader]() { 3328 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3329 if (InsertBB != LoopVectorBody && 3330 LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) 3331 return VectorHeader->getTerminator(); 3332 return &*B.GetInsertPoint(); 3333 }; 3334 3335 switch (ID.getKind()) { 3336 case InductionDescriptor::IK_IntInduction: { 3337 assert(!isa<VectorType>(Index->getType()) && 3338 "Vector indices not supported for integer inductions yet"); 3339 assert(Index->getType() == StartValue->getType() && 3340 "Index type does not match StartValue type"); 3341 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3342 return B.CreateSub(StartValue, Index); 3343 auto *Offset = CreateMul( 3344 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3345 return CreateAdd(StartValue, Offset); 3346 } 3347 case InductionDescriptor::IK_PtrInduction: { 3348 assert(isa<SCEVConstant>(Step) && 3349 "Expected constant step for pointer induction"); 3350 return B.CreateGEP( 3351 ID.getElementType(), StartValue, 3352 CreateMul(Index, 3353 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3354 GetInsertPoint()))); 3355 } 3356 case InductionDescriptor::IK_FpInduction: { 3357 assert(!isa<VectorType>(Index->getType()) && 3358 "Vector indices not supported for FP inductions yet"); 3359 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3360 auto InductionBinOp = ID.getInductionBinOp(); 3361 assert(InductionBinOp && 3362 (InductionBinOp->getOpcode() == Instruction::FAdd || 3363 InductionBinOp->getOpcode() == Instruction::FSub) && 3364 "Original bin op should be defined for FP induction"); 3365 3366 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3367 Value *MulExp = B.CreateFMul(StepValue, Index); 3368 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3369 "induction"); 3370 } 3371 case InductionDescriptor::IK_NoInduction: 3372 return nullptr; 3373 } 3374 llvm_unreachable("invalid enum"); 3375 } 3376 3377 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3378 LoopScalarBody = OrigLoop->getHeader(); 3379 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3380 assert(LoopVectorPreHeader && "Invalid loop structure"); 3381 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3382 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3383 "multiple exit loop without required epilogue?"); 3384 3385 LoopMiddleBlock = 3386 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3387 LI, nullptr, Twine(Prefix) + "middle.block"); 3388 LoopScalarPreHeader = 3389 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3390 nullptr, Twine(Prefix) + "scalar.ph"); 3391 3392 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3393 3394 // Set up the middle block terminator. Two cases: 3395 // 1) If we know that we must execute the scalar epilogue, emit an 3396 // unconditional branch. 3397 // 2) Otherwise, we must have a single unique exit block (due to how we 3398 // implement the multiple exit case). In this case, set up a conditonal 3399 // branch from the middle block to the loop scalar preheader, and the 3400 // exit block. completeLoopSkeleton will update the condition to use an 3401 // iteration check, if required to decide whether to execute the remainder. 3402 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 
3403 BranchInst::Create(LoopScalarPreHeader) : 3404 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3405 Builder.getTrue()); 3406 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3407 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3408 3409 // We intentionally don't let SplitBlock to update LoopInfo since 3410 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3411 // LoopVectorBody is explicitly added to the correct place few lines later. 3412 LoopVectorBody = 3413 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3414 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3415 3416 // Update dominator for loop exit. 3417 if (!Cost->requiresScalarEpilogue(VF)) 3418 // If there is an epilogue which must run, there's no edge from the 3419 // middle block to exit blocks and thus no need to update the immediate 3420 // dominator of the exit blocks. 3421 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3422 3423 // Create and register the new vector loop. 3424 Loop *Lp = LI->AllocateLoop(); 3425 Loop *ParentLoop = OrigLoop->getParentLoop(); 3426 3427 // Insert the new loop into the loop nest and register the new basic blocks 3428 // before calling any utilities such as SCEV that require valid LoopInfo. 3429 if (ParentLoop) { 3430 ParentLoop->addChildLoop(Lp); 3431 } else { 3432 LI->addTopLevelLoop(Lp); 3433 } 3434 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3435 return Lp; 3436 } 3437 3438 void InnerLoopVectorizer::createInductionResumeValues( 3439 Loop *L, Value *VectorTripCount, 3440 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3441 assert(VectorTripCount && L && "Expected valid arguments"); 3442 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3443 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3444 "Inconsistent information about additional bypass."); 3445 // We are going to resume the execution of the scalar loop. 3446 // Go over all of the induction variables that we found and fix the 3447 // PHIs that are left in the scalar version of the loop. 3448 // The starting values of PHI nodes depend on the counter of the last 3449 // iteration in the vectorized loop. 3450 // If we come from a bypass edge then we need to start from the original 3451 // start value. 3452 for (auto &InductionEntry : Legal->getInductionVars()) { 3453 PHINode *OrigPhi = InductionEntry.first; 3454 InductionDescriptor II = InductionEntry.second; 3455 3456 // Create phi nodes to merge from the backedge-taken check block. 3457 PHINode *BCResumeVal = 3458 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3459 LoopScalarPreHeader->getTerminator()); 3460 // Copy original phi DL over to the new one. 3461 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3462 Value *&EndValue = IVEndValues[OrigPhi]; 3463 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3464 if (OrigPhi == OldInduction) { 3465 // We know what the end value is. 3466 EndValue = VectorTripCount; 3467 } else { 3468 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3469 3470 // Fast-math-flags propagate from the original induction instruction. 
3471 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3472 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3473 3474 Type *StepType = II.getStep()->getType(); 3475 Instruction::CastOps CastOp = 3476 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3477 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3478 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3479 EndValue = 3480 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); 3481 EndValue->setName("ind.end"); 3482 3483 // Compute the end value for the additional bypass (if applicable). 3484 if (AdditionalBypass.first) { 3485 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3486 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3487 StepType, true); 3488 CRD = 3489 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3490 EndValueFromAdditionalBypass = 3491 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); 3492 EndValueFromAdditionalBypass->setName("ind.end"); 3493 } 3494 } 3495 // The new PHI merges the original incoming value, in case of a bypass, 3496 // or the value at the end of the vectorized loop. 3497 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3498 3499 // Fix the scalar body counter (PHI node). 3500 // The old induction's phi node in the scalar body needs the truncated 3501 // value. 3502 for (BasicBlock *BB : LoopBypassBlocks) 3503 BCResumeVal->addIncoming(II.getStartValue(), BB); 3504 3505 if (AdditionalBypass.first) 3506 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3507 EndValueFromAdditionalBypass); 3508 3509 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3510 } 3511 } 3512 3513 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3514 MDNode *OrigLoopID) { 3515 assert(L && "Expected valid loop."); 3516 3517 // The trip counts should be cached by now. 3518 Value *Count = getOrCreateTripCount(L); 3519 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3520 3521 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3522 3523 // Add a check in the middle block to see if we have completed 3524 // all of the iterations in the first vector loop. Three cases: 3525 // 1) If we require a scalar epilogue, there is no conditional branch as 3526 // we unconditionally branch to the scalar preheader. Do nothing. 3527 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3528 // Thus if tail is to be folded, we know we don't need to run the 3529 // remainder and we can use the previous value for the condition (true). 3530 // 3) Otherwise, construct a runtime check. 3531 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3532 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3533 Count, VectorTripCount, "cmp.n", 3534 LoopMiddleBlock->getTerminator()); 3535 3536 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3537 // of the corresponding compare because they may have ended up with 3538 // different line numbers and we want to avoid awkward line stepping while 3539 // debugging. Eg. if the compare has got a line number inside the loop. 3540 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3541 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3542 } 3543 3544 // Get ready to start creating new instructions into the vectorized body. 
3545 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3546 "Inconsistent vector loop preheader"); 3547 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3548 3549 Optional<MDNode *> VectorizedLoopID = 3550 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3551 LLVMLoopVectorizeFollowupVectorized}); 3552 if (VectorizedLoopID.hasValue()) { 3553 L->setLoopID(VectorizedLoopID.getValue()); 3554 3555 // Do not setAlreadyVectorized if loop attributes have been defined 3556 // explicitly. 3557 return LoopVectorPreHeader; 3558 } 3559 3560 // Keep all loop hints from the original loop on the vector loop (we'll 3561 // replace the vectorizer-specific hints below). 3562 if (MDNode *LID = OrigLoop->getLoopID()) 3563 L->setLoopID(LID); 3564 3565 LoopVectorizeHints Hints(L, true, *ORE, TTI); 3566 Hints.setAlreadyVectorized(); 3567 3568 #ifdef EXPENSIVE_CHECKS 3569 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3570 LI->verify(*DT); 3571 #endif 3572 3573 return LoopVectorPreHeader; 3574 } 3575 3576 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3577 /* 3578 In this function we generate a new loop. The new loop will contain 3579 the vectorized instructions while the old loop will continue to run the 3580 scalar remainder. 3581 3582 [ ] <-- loop iteration number check. 3583 / | 3584 / v 3585 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3586 | / | 3587 | / v 3588 || [ ] <-- vector pre header. 3589 |/ | 3590 | v 3591 | [ ] \ 3592 | [ ]_| <-- vector loop. 3593 | | 3594 | v 3595 \ -[ ] <--- middle-block. 3596 \/ | 3597 /\ v 3598 | ->[ ] <--- new preheader. 3599 | | 3600 (opt) v <-- edge from middle to exit iff epilogue is not required. 3601 | [ ] \ 3602 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3603 \ | 3604 \ v 3605 >[ ] <-- exit block(s). 3606 ... 3607 */ 3608 3609 // Get the metadata of the original loop before it gets modified. 3610 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3611 3612 // Workaround! Compute the trip count of the original loop and cache it 3613 // before we start modifying the CFG. This code has a systemic problem 3614 // wherein it tries to run analysis over partially constructed IR; this is 3615 // wrong, and not simply for SCEV. The trip count of the original loop 3616 // simply happens to be prone to hitting this in practice. In theory, we 3617 // can hit the same issue for any SCEV, or ValueTracking query done during 3618 // mutation. See PR49900. 3619 getOrCreateTripCount(OrigLoop); 3620 3621 // Create an empty vector loop, and prepare basic blocks for the runtime 3622 // checks. 3623 Loop *Lp = createVectorLoopSkeleton(""); 3624 3625 // Now, compare the new count to zero. If it is zero skip the vector loop and 3626 // jump to the scalar loop. This check also covers the case where the 3627 // backedge-taken count is uint##_max: adding one to it will overflow leading 3628 // to an incorrect trip count of zero. In this (rare) case we will also jump 3629 // to the scalar loop. 3630 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3631 3632 // Generate the code to check any assumptions that we've made for SCEV 3633 // expressions. 3634 emitSCEVChecks(Lp, LoopScalarPreHeader); 3635 3636 // Generate the code that checks in runtime if arrays overlap. We put the 3637 // checks into a separate block to make the more common case of few elements 3638 // faster. 
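  // Conceptually (illustrative; the actual checks are produced by
  // LoopAccessAnalysis), for two accessed ranges A and B this means branching
  // to the scalar loop unless
  //   (A.end <= B.start) || (B.end <= A.start)
  // i.e. unless the ranges are provably disjoint at runtime.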
3639 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3640 3641 // Some loops have a single integer induction variable, while other loops 3642 // don't. One example is c++ iterators that often have multiple pointer 3643 // induction variables. In the code below we also support a case where we 3644 // don't have a single induction variable. 3645 // 3646 // We try to obtain an induction variable from the original loop as hard 3647 // as possible. However if we don't find one that: 3648 // - is an integer 3649 // - counts from zero, stepping by one 3650 // - is the size of the widest induction variable type 3651 // then we create a new one. 3652 OldInduction = Legal->getPrimaryInduction(); 3653 Type *IdxTy = Legal->getWidestInductionType(); 3654 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3655 // The loop step is equal to the vectorization factor (num of SIMD elements) 3656 // times the unroll factor (num of SIMD instructions). 3657 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3658 Value *Step = createStepForVF(Builder, IdxTy, VF, UF); 3659 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3660 Induction = 3661 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3662 getDebugLocFromInstOrOperands(OldInduction)); 3663 3664 // Emit phis for the new starting index of the scalar loop. 3665 createInductionResumeValues(Lp, CountRoundDown); 3666 3667 return completeLoopSkeleton(Lp, OrigLoopID); 3668 } 3669 3670 // Fix up external users of the induction variable. At this point, we are 3671 // in LCSSA form, with all external PHIs that use the IV having one input value, 3672 // coming from the remainder loop. We need those PHIs to also have a correct 3673 // value for the IV when arriving directly from the middle block. 3674 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3675 const InductionDescriptor &II, 3676 Value *CountRoundDown, Value *EndValue, 3677 BasicBlock *MiddleBlock) { 3678 // There are two kinds of external IV usages - those that use the value 3679 // computed in the last iteration (the PHI) and those that use the penultimate 3680 // value (the value that feeds into the phi from the loop latch). 3681 // We allow both, but they, obviously, have different values. 3682 3683 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3684 3685 DenseMap<Value *, Value *> MissingVals; 3686 3687 // An external user of the last iteration's value should see the value that 3688 // the remainder loop uses to initialize its own IV. 3689 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3690 for (User *U : PostInc->users()) { 3691 Instruction *UI = cast<Instruction>(U); 3692 if (!OrigLoop->contains(UI)) { 3693 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3694 MissingVals[UI] = EndValue; 3695 } 3696 } 3697 3698 // An external user of the penultimate value need to see EndValue - Step. 3699 // The simplest way to get this is to recompute it from the constituent SCEVs, 3700 // that is Start + (Step * (CRD - 1)). 3701 for (User *U : OrigPhi->users()) { 3702 auto *UI = cast<Instruction>(U); 3703 if (!OrigLoop->contains(UI)) { 3704 const DataLayout &DL = 3705 OrigLoop->getHeader()->getModule()->getDataLayout(); 3706 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3707 3708 IRBuilder<> B(MiddleBlock->getTerminator()); 3709 3710 // Fast-math-flags propagate from the original induction instruction. 
3711 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3712 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3713 3714 Value *CountMinusOne = B.CreateSub( 3715 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3716 Value *CMO = 3717 !II.getStep()->getType()->isIntegerTy() 3718 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3719 II.getStep()->getType()) 3720 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3721 CMO->setName("cast.cmo"); 3722 Value *Escape = 3723 emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); 3724 Escape->setName("ind.escape"); 3725 MissingVals[UI] = Escape; 3726 } 3727 } 3728 3729 for (auto &I : MissingVals) { 3730 PHINode *PHI = cast<PHINode>(I.first); 3731 // One corner case we have to handle is two IVs "chasing" each-other, 3732 // that is %IV2 = phi [...], [ %IV1, %latch ] 3733 // In this case, if IV1 has an external use, we need to avoid adding both 3734 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3735 // don't already have an incoming value for the middle block. 3736 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3737 PHI->addIncoming(I.second, MiddleBlock); 3738 } 3739 } 3740 3741 namespace { 3742 3743 struct CSEDenseMapInfo { 3744 static bool canHandle(const Instruction *I) { 3745 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3746 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3747 } 3748 3749 static inline Instruction *getEmptyKey() { 3750 return DenseMapInfo<Instruction *>::getEmptyKey(); 3751 } 3752 3753 static inline Instruction *getTombstoneKey() { 3754 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3755 } 3756 3757 static unsigned getHashValue(const Instruction *I) { 3758 assert(canHandle(I) && "Unknown instruction!"); 3759 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3760 I->value_op_end())); 3761 } 3762 3763 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3764 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3765 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3766 return LHS == RHS; 3767 return LHS->isIdenticalTo(RHS); 3768 } 3769 }; 3770 3771 } // end anonymous namespace 3772 3773 ///Perform cse of induction variable instructions. 3774 static void cse(BasicBlock *BB) { 3775 // Perform simple cse. 3776 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3777 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3778 if (!CSEDenseMapInfo::canHandle(&In)) 3779 continue; 3780 3781 // Check if we can replace this instruction with any of the 3782 // visited instructions. 3783 if (Instruction *V = CSEMap.lookup(&In)) { 3784 In.replaceAllUsesWith(V); 3785 In.eraseFromParent(); 3786 continue; 3787 } 3788 3789 CSEMap[&In] = &In; 3790 } 3791 } 3792 3793 InstructionCost 3794 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3795 bool &NeedToScalarize) const { 3796 Function *F = CI->getCalledFunction(); 3797 Type *ScalarRetTy = CI->getType(); 3798 SmallVector<Type *, 4> Tys, ScalarTys; 3799 for (auto &ArgOp : CI->args()) 3800 ScalarTys.push_back(ArgOp->getType()); 3801 3802 // Estimate cost of scalarized vector call. The source operands are assumed 3803 // to be vectors, so we need to extract individual elements from there, 3804 // execute VF scalar calls, and then gather the result into the vector return 3805 // value. 
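  // That is, the scalarization estimate computed below is roughly
  //   Cost = VF * ScalarCallCost + ScalarizationOverhead (extracts + inserts)
  // and it is compared further down against the cost of a vector library
  // call, if one exists, to decide NeedToScalarize.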
3806 InstructionCost ScalarCallCost = 3807 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3808 if (VF.isScalar()) 3809 return ScalarCallCost; 3810 3811 // Compute corresponding vector type for return value and arguments. 3812 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3813 for (Type *ScalarTy : ScalarTys) 3814 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3815 3816 // Compute costs of unpacking argument values for the scalar calls and 3817 // packing the return values to a vector. 3818 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3819 3820 InstructionCost Cost = 3821 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3822 3823 // If we can't emit a vector call for this function, then the currently found 3824 // cost is the cost we need to return. 3825 NeedToScalarize = true; 3826 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3827 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3828 3829 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3830 return Cost; 3831 3832 // If the corresponding vector cost is cheaper, return its cost. 3833 InstructionCost VectorCallCost = 3834 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3835 if (VectorCallCost < Cost) { 3836 NeedToScalarize = false; 3837 Cost = VectorCallCost; 3838 } 3839 return Cost; 3840 } 3841 3842 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3843 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3844 return Elt; 3845 return VectorType::get(Elt, VF); 3846 } 3847 3848 InstructionCost 3849 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3850 ElementCount VF) const { 3851 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3852 assert(ID && "Expected intrinsic call!"); 3853 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3854 FastMathFlags FMF; 3855 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3856 FMF = FPMO->getFastMathFlags(); 3857 3858 SmallVector<const Value *> Arguments(CI->args()); 3859 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3860 SmallVector<Type *> ParamTys; 3861 std::transform(FTy->param_begin(), FTy->param_end(), 3862 std::back_inserter(ParamTys), 3863 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3864 3865 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3866 dyn_cast<IntrinsicInst>(CI)); 3867 return TTI.getIntrinsicInstrCost(CostAttrs, 3868 TargetTransformInfo::TCK_RecipThroughput); 3869 } 3870 3871 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3872 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3873 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3874 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3875 } 3876 3877 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3878 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3879 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3880 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3881 } 3882 3883 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3884 // For every instruction `I` in MinBWs, truncate the operands, create a 3885 // truncated version of `I` and reextend its result. InstCombine runs 3886 // later and will remove any ext/trunc pairs. 
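  // For example (illustrative types), if MinBWs records that an i32 add only
  // needs 8 bits, then
  //   %a = add <4 x i32> %x, %y
  // is rewritten as
  //   %x8 = trunc <4 x i32> %x to <4 x i8>
  //   %y8 = trunc <4 x i32> %y to <4 x i8>
  //   %a8 = add <4 x i8> %x8, %y8
  //   %a  = zext <4 x i8> %a8 to <4 x i32>
  // and InstCombine later removes any ext/trunc pairs this leaves behind.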
3887 SmallPtrSet<Value *, 4> Erased; 3888 for (const auto &KV : Cost->getMinimalBitwidths()) { 3889 // If the value wasn't vectorized, we must maintain the original scalar 3890 // type. The absence of the value from State indicates that it 3891 // wasn't vectorized. 3892 // FIXME: Should not rely on getVPValue at this point. 3893 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3894 if (!State.hasAnyVectorValue(Def)) 3895 continue; 3896 for (unsigned Part = 0; Part < UF; ++Part) { 3897 Value *I = State.get(Def, Part); 3898 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3899 continue; 3900 Type *OriginalTy = I->getType(); 3901 Type *ScalarTruncatedTy = 3902 IntegerType::get(OriginalTy->getContext(), KV.second); 3903 auto *TruncatedTy = VectorType::get( 3904 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3905 if (TruncatedTy == OriginalTy) 3906 continue; 3907 3908 IRBuilder<> B(cast<Instruction>(I)); 3909 auto ShrinkOperand = [&](Value *V) -> Value * { 3910 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3911 if (ZI->getSrcTy() == TruncatedTy) 3912 return ZI->getOperand(0); 3913 return B.CreateZExtOrTrunc(V, TruncatedTy); 3914 }; 3915 3916 // The actual instruction modification depends on the instruction type, 3917 // unfortunately. 3918 Value *NewI = nullptr; 3919 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3920 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3921 ShrinkOperand(BO->getOperand(1))); 3922 3923 // Any wrapping introduced by shrinking this operation shouldn't be 3924 // considered undefined behavior. So, we can't unconditionally copy 3925 // arithmetic wrapping flags to NewI. 3926 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3927 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3928 NewI = 3929 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3930 ShrinkOperand(CI->getOperand(1))); 3931 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3932 NewI = B.CreateSelect(SI->getCondition(), 3933 ShrinkOperand(SI->getTrueValue()), 3934 ShrinkOperand(SI->getFalseValue())); 3935 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3936 switch (CI->getOpcode()) { 3937 default: 3938 llvm_unreachable("Unhandled cast!"); 3939 case Instruction::Trunc: 3940 NewI = ShrinkOperand(CI->getOperand(0)); 3941 break; 3942 case Instruction::SExt: 3943 NewI = B.CreateSExtOrTrunc( 3944 CI->getOperand(0), 3945 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3946 break; 3947 case Instruction::ZExt: 3948 NewI = B.CreateZExtOrTrunc( 3949 CI->getOperand(0), 3950 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3951 break; 3952 } 3953 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3954 auto Elements0 = 3955 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3956 auto *O0 = B.CreateZExtOrTrunc( 3957 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3958 auto Elements1 = 3959 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3960 auto *O1 = B.CreateZExtOrTrunc( 3961 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3962 3963 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3964 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3965 // Don't do anything with the operands, just extend the result. 
3966 continue; 3967 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3968 auto Elements = 3969 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3970 auto *O0 = B.CreateZExtOrTrunc( 3971 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3972 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3973 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3974 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3975 auto Elements = 3976 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3977 auto *O0 = B.CreateZExtOrTrunc( 3978 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3979 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3980 } else { 3981 // If we don't know what to do, be conservative and don't do anything. 3982 continue; 3983 } 3984 3985 // Lastly, extend the result. 3986 NewI->takeName(cast<Instruction>(I)); 3987 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3988 I->replaceAllUsesWith(Res); 3989 cast<Instruction>(I)->eraseFromParent(); 3990 Erased.insert(I); 3991 State.reset(Def, Res, Part); 3992 } 3993 } 3994 3995 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3996 for (const auto &KV : Cost->getMinimalBitwidths()) { 3997 // If the value wasn't vectorized, we must maintain the original scalar 3998 // type. The absence of the value from State indicates that it 3999 // wasn't vectorized. 4000 // FIXME: Should not rely on getVPValue at this point. 4001 VPValue *Def = State.Plan->getVPValue(KV.first, true); 4002 if (!State.hasAnyVectorValue(Def)) 4003 continue; 4004 for (unsigned Part = 0; Part < UF; ++Part) { 4005 Value *I = State.get(Def, Part); 4006 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4007 if (Inst && Inst->use_empty()) { 4008 Value *NewI = Inst->getOperand(0); 4009 Inst->eraseFromParent(); 4010 State.reset(Def, NewI, Part); 4011 } 4012 } 4013 } 4014 } 4015 4016 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4017 // Insert truncates and extends for any truncated instructions as hints to 4018 // InstCombine. 4019 if (VF.isVector()) 4020 truncateToMinimalBitwidths(State); 4021 4022 // Fix widened non-induction PHIs by setting up the PHI operands. 4023 if (OrigPHIsToFix.size()) { 4024 assert(EnableVPlanNativePath && 4025 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4026 fixNonInductionPHIs(State); 4027 } 4028 4029 // At this point every instruction in the original loop is widened to a 4030 // vector form. Now we need to fix the recurrences in the loop. These PHI 4031 // nodes are currently empty because we did not want to introduce cycles. 4032 // This is the second stage of vectorizing recurrences. 4033 fixCrossIterationPHIs(State); 4034 4035 // Forget the original basic block. 4036 PSE.getSE()->forgetLoop(OrigLoop); 4037 4038 // If we inserted an edge from the middle block to the unique exit block, 4039 // update uses outside the loop (phis) to account for the newly inserted 4040 // edge. 4041 if (!Cost->requiresScalarEpilogue(VF)) { 4042 // Fix-up external users of the induction variables. 4043 for (auto &Entry : Legal->getInductionVars()) 4044 fixupIVUsers(Entry.first, Entry.second, 4045 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4046 IVEndValues[Entry.first], LoopMiddleBlock); 4047 4048 fixLCSSAPHIs(State); 4049 } 4050 4051 for (Instruction *PI : PredicatedInstructions) 4052 sinkScalarOperands(&*PI); 4053 4054 // Remove redundant induction instructions. 
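  // (For example, unrolling can leave several structurally identical induction
  // updates or address computations in the vector body; cse() keeps the first
  // occurrence in the block and rewrites uses of the duplicates to it.
  // Illustrative description only.)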
4055   cse(LoopVectorBody);
4056
4057   // Set/update profile weights for the vector and remainder loops as original
4058   // loop iterations are now distributed among them. Note that original loop
4059   // represented by LoopScalarBody becomes remainder loop after vectorization.
4060   //
4061   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4062   // end up with a slightly less accurate result, but that should be OK since
4063   // profile is not inherently precise anyway. Note also that a possible bypass
4064   // of the vector code caused by legality checks is ignored, optimistically
4065   // assigning all the weight to the vector loop.
4066   //
4067   // For scalable vectorization we can't know at compile time how many
4068   // iterations of the loop are handled in one vector iteration, so instead
4069   // assume a pessimistic vscale of '1'.
4070   setProfileInfoAfterUnrolling(
4071       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4072       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4073 }
4074
4075 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4076   // In order to support recurrences we need to be able to vectorize Phi nodes.
4077   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4078   // stage #2: We now need to fix the recurrences by adding incoming edges to
4079   // the currently empty PHI nodes. At this point every instruction in the
4080   // original loop is widened to a vector form so we can use them to construct
4081   // the incoming edges.
4082   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4083   for (VPRecipeBase &R : Header->phis()) {
4084     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4085       fixReduction(ReductionPhi, State);
4086     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4087       fixFirstOrderRecurrence(FOR, State);
4088   }
4089 }
4090
4091 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4092                                                   VPTransformState &State) {
4093   // This is the second phase of vectorizing first-order recurrences. An
4094   // overview of the transformation is described below. Suppose we have the
4095   // following loop.
4096   //
4097   //   for (int i = 0; i < n; ++i)
4098   //     b[i] = a[i] - a[i - 1];
4099   //
4100   // There is a first-order recurrence on "a". For this loop, the shorthand
4101   // scalar IR looks like:
4102   //
4103   //   scalar.ph:
4104   //     s_init = a[-1]
4105   //     br scalar.body
4106   //
4107   //   scalar.body:
4108   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4109   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4110   //     s2 = a[i]
4111   //     b[i] = s2 - s1
4112   //     br cond, scalar.body, ...
4113   //
4114   // In this example, s1 is a recurrence because its value depends on the
4115   // previous iteration. In the first phase of vectorization, we created a
4116   // vector phi v1 for s1. We now complete the vectorization and produce the
4117   // shorthand vector IR shown below (for VF = 4, UF = 1).
4118 // 4119 // vector.ph: 4120 // v_init = vector(..., ..., ..., a[-1]) 4121 // br vector.body 4122 // 4123 // vector.body 4124 // i = phi [0, vector.ph], [i+4, vector.body] 4125 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4126 // v2 = a[i, i+1, i+2, i+3]; 4127 // v3 = vector(v1(3), v2(0, 1, 2)) 4128 // b[i, i+1, i+2, i+3] = v2 - v3 4129 // br cond, vector.body, middle.block 4130 // 4131 // middle.block: 4132 // x = v2(3) 4133 // br scalar.ph 4134 // 4135 // scalar.ph: 4136 // s_init = phi [x, middle.block], [a[-1], otherwise] 4137 // br scalar.body 4138 // 4139 // After execution completes the vector loop, we extract the next value of 4140 // the recurrence (x) to use as the initial value in the scalar loop. 4141 4142 // Extract the last vector element in the middle block. This will be the 4143 // initial value for the recurrence when jumping to the scalar loop. 4144 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4145 Value *Incoming = State.get(PreviousDef, UF - 1); 4146 auto *ExtractForScalar = Incoming; 4147 auto *IdxTy = Builder.getInt32Ty(); 4148 if (VF.isVector()) { 4149 auto *One = ConstantInt::get(IdxTy, 1); 4150 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4151 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4152 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4153 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4154 "vector.recur.extract"); 4155 } 4156 // Extract the second last element in the middle block if the 4157 // Phi is used outside the loop. We need to extract the phi itself 4158 // and not the last element (the phi update in the current iteration). This 4159 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4160 // when the scalar loop is not run at all. 4161 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4162 if (VF.isVector()) { 4163 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4164 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4165 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4166 Incoming, Idx, "vector.recur.extract.for.phi"); 4167 } else if (UF > 1) 4168 // When loop is unrolled without vectorizing, initialize 4169 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4170 // of `Incoming`. This is analogous to the vectorized case above: extracting 4171 // the second last element when VF > 1. 4172 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4173 4174 // Fix the initial value of the original recurrence in the scalar loop. 4175 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4176 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 4177 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4178 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 4179 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4180 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4181 Start->addIncoming(Incoming, BB); 4182 } 4183 4184 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4185 Phi->setName("scalar.recur"); 4186 4187 // Finally, fix users of the recurrence outside the loop. The users will need 4188 // either the last value of the scalar recurrence or the last value of the 4189 // vector recurrence we extracted in the middle block. Since the loop is in 4190 // LCSSA form, we just need to find all the phi nodes for the original scalar 4191 // recurrence in the exit block, and then add an edge for the middle block. 
4192   // Note that LCSSA does not imply single entry when the original scalar loop
4193   // had multiple exiting edges (as we always run the last iteration in the
4194   // scalar epilogue); in that case, there is no edge from middle to exit,
4195   // and thus no phis need to be updated.
4196   if (!Cost->requiresScalarEpilogue(VF))
4197     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4198       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4199         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4200 }
4201
4202 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4203                                        VPTransformState &State) {
4204   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4205   // Get its reduction variable descriptor.
4206   assert(Legal->isReductionVariable(OrigPhi) &&
4207          "Unable to find the reduction variable");
4208   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4209
4210   RecurKind RK = RdxDesc.getRecurrenceKind();
4211   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4212   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4213   setDebugLocFromInst(ReductionStartValue);
4214
4215   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4216   // This is the vector-clone of the value that leaves the loop.
4217   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4218
4219   // Wrap flags are in general invalid after vectorization, clear them.
4220   clearReductionWrapFlags(RdxDesc, State);
4221
4222   // Before each round, move the insertion point right between
4223   // the PHIs and the values we are going to write.
4224   // This allows us to write both PHINodes and the extractelement
4225   // instructions.
4226   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4227
4228   setDebugLocFromInst(LoopExitInst);
4229
4230   Type *PhiTy = OrigPhi->getType();
4231   // If tail is folded by masking, the vector value to leave the loop should be
4232   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4233   // instead of the former. For an inloop reduction the reduction will already
4234   // be predicated, and does not need to be handled here.
4235   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4236     for (unsigned Part = 0; Part < UF; ++Part) {
4237       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4238       Value *Sel = nullptr;
4239       for (User *U : VecLoopExitInst->users()) {
4240         if (isa<SelectInst>(U)) {
4241           assert(!Sel && "Reduction exit feeding two selects");
4242           Sel = U;
4243         } else
4244           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4245       }
4246       assert(Sel && "Reduction exit feeds no select");
4247       State.reset(LoopExitInstDef, Sel, Part);
4248
4249       // If the target can create a predicated operator for the reduction at no
4250       // extra cost in the loop (for example a predicated vadd), it can be
4251       // cheaper for the select to remain in the loop than be sunk out of it,
4252       // and so use the select value for the phi instead of the old
4253       // LoopExitValue.
4254 if (PreferPredicatedReductionSelect || 4255 TTI->preferPredicatedReductionSelect( 4256 RdxDesc.getOpcode(), PhiTy, 4257 TargetTransformInfo::ReductionFlags())) { 4258 auto *VecRdxPhi = 4259 cast<PHINode>(State.get(PhiR, Part)); 4260 VecRdxPhi->setIncomingValueForBlock( 4261 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4262 } 4263 } 4264 } 4265 4266 // If the vector reduction can be performed in a smaller type, we truncate 4267 // then extend the loop exit value to enable InstCombine to evaluate the 4268 // entire expression in the smaller type. 4269 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4270 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4271 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4272 Builder.SetInsertPoint( 4273 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4274 VectorParts RdxParts(UF); 4275 for (unsigned Part = 0; Part < UF; ++Part) { 4276 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4277 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4278 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4279 : Builder.CreateZExt(Trunc, VecTy); 4280 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4281 if (U != Trunc) { 4282 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4283 RdxParts[Part] = Extnd; 4284 } 4285 } 4286 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4287 for (unsigned Part = 0; Part < UF; ++Part) { 4288 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4289 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4290 } 4291 } 4292 4293 // Reduce all of the unrolled parts into a single vector. 4294 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4295 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4296 4297 // The middle block terminator has already been assigned a DebugLoc here (the 4298 // OrigLoop's single latch terminator). We want the whole middle block to 4299 // appear to execute on this line because: (a) it is all compiler generated, 4300 // (b) these instructions are always executed after evaluating the latch 4301 // conditional branch, and (c) other passes may add new predecessors which 4302 // terminate on this line. This is the easiest way to ensure we don't 4303 // accidentally cause an extra step back into the loop while debugging. 4304 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4305 if (PhiR->isOrdered()) 4306 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4307 else { 4308 // Floating-point operations should have some FMF to enable the reduction. 4309 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4310 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4311 for (unsigned Part = 1; Part < UF; ++Part) { 4312 Value *RdxPart = State.get(LoopExitInstDef, Part); 4313 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4314 ReducedPartRdx = Builder.CreateBinOp( 4315 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4316 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4317 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4318 ReducedPartRdx, RdxPart); 4319 else 4320 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4321 } 4322 } 4323 4324 // Create the reduction after the loop. Note that inloop reductions create the 4325 // target reduction in the loop using a Reduction recipe. 
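  // For a plain (not in-loop) add reduction, the call below typically emits a
  // horizontal reduction in the middle block, e.g. for VF = 4 (a sketch):
  //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
  // The result is then extended back to the phi type if the reduction was
  // performed in a narrower type.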
4326 if (VF.isVector() && !PhiR->isInLoop()) { 4327 ReducedPartRdx = 4328 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4329 // If the reduction can be performed in a smaller type, we need to extend 4330 // the reduction to the wider type before we branch to the original loop. 4331 if (PhiTy != RdxDesc.getRecurrenceType()) 4332 ReducedPartRdx = RdxDesc.isSigned() 4333 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4334 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4335 } 4336 4337 // Create a phi node that merges control-flow from the backedge-taken check 4338 // block and the middle block. 4339 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4340 LoopScalarPreHeader->getTerminator()); 4341 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4342 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4343 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4344 4345 // Now, we need to fix the users of the reduction variable 4346 // inside and outside of the scalar remainder loop. 4347 4348 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4349 // in the exit blocks. See comment on analogous loop in 4350 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4351 if (!Cost->requiresScalarEpilogue(VF)) 4352 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4353 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4354 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4355 4356 // Fix the scalar loop reduction variable with the incoming reduction sum 4357 // from the vector body and from the backedge value. 4358 int IncomingEdgeBlockIdx = 4359 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4360 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4361 // Pick the other block. 4362 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4363 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4364 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4365 } 4366 4367 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4368 VPTransformState &State) { 4369 RecurKind RK = RdxDesc.getRecurrenceKind(); 4370 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4371 return; 4372 4373 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4374 assert(LoopExitInstr && "null loop exit instruction"); 4375 SmallVector<Instruction *, 8> Worklist; 4376 SmallPtrSet<Instruction *, 8> Visited; 4377 Worklist.push_back(LoopExitInstr); 4378 Visited.insert(LoopExitInstr); 4379 4380 while (!Worklist.empty()) { 4381 Instruction *Cur = Worklist.pop_back_val(); 4382 if (isa<OverflowingBinaryOperator>(Cur)) 4383 for (unsigned Part = 0; Part < UF; ++Part) { 4384 // FIXME: Should not rely on getVPValue at this point. 4385 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4386 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4387 } 4388 4389 for (User *U : Cur->users()) { 4390 Instruction *UI = cast<Instruction>(U); 4391 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4392 Visited.insert(UI).second) 4393 Worklist.push_back(UI); 4394 } 4395 } 4396 } 4397 4398 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4399 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4400 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4401 // Some phis were already hand updated by the reduction and recurrence 4402 // code above, leave them alone. 
4403 continue; 4404 4405 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4406 // Non-instruction incoming values will have only one value. 4407 4408 VPLane Lane = VPLane::getFirstLane(); 4409 if (isa<Instruction>(IncomingValue) && 4410 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4411 VF)) 4412 Lane = VPLane::getLastLaneForVF(VF); 4413 4414 // Can be a loop invariant incoming value or the last scalar value to be 4415 // extracted from the vectorized loop. 4416 // FIXME: Should not rely on getVPValue at this point. 4417 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4418 Value *lastIncomingValue = 4419 OrigLoop->isLoopInvariant(IncomingValue) 4420 ? IncomingValue 4421 : State.get(State.Plan->getVPValue(IncomingValue, true), 4422 VPIteration(UF - 1, Lane)); 4423 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4424 } 4425 } 4426 4427 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4428 // The basic block and loop containing the predicated instruction. 4429 auto *PredBB = PredInst->getParent(); 4430 auto *VectorLoop = LI->getLoopFor(PredBB); 4431 4432 // Initialize a worklist with the operands of the predicated instruction. 4433 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4434 4435 // Holds instructions that we need to analyze again. An instruction may be 4436 // reanalyzed if we don't yet know if we can sink it or not. 4437 SmallVector<Instruction *, 8> InstsToReanalyze; 4438 4439 // Returns true if a given use occurs in the predicated block. Phi nodes use 4440 // their operands in their corresponding predecessor blocks. 4441 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4442 auto *I = cast<Instruction>(U.getUser()); 4443 BasicBlock *BB = I->getParent(); 4444 if (auto *Phi = dyn_cast<PHINode>(I)) 4445 BB = Phi->getIncomingBlock( 4446 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4447 return BB == PredBB; 4448 }; 4449 4450 // Iteratively sink the scalarized operands of the predicated instruction 4451 // into the block we created for it. When an instruction is sunk, it's 4452 // operands are then added to the worklist. The algorithm ends after one pass 4453 // through the worklist doesn't sink a single instruction. 4454 bool Changed; 4455 do { 4456 // Add the instructions that need to be reanalyzed to the worklist, and 4457 // reset the changed indicator. 4458 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4459 InstsToReanalyze.clear(); 4460 Changed = false; 4461 4462 while (!Worklist.empty()) { 4463 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4464 4465 // We can't sink an instruction if it is a phi node, is not in the loop, 4466 // or may have side effects. 4467 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4468 I->mayHaveSideEffects()) 4469 continue; 4470 4471 // If the instruction is already in PredBB, check if we can sink its 4472 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4473 // sinking the scalar instruction I, hence it appears in PredBB; but it 4474 // may have failed to sink I's operands (recursively), which we try 4475 // (again) here. 4476 if (I->getParent() == PredBB) { 4477 Worklist.insert(I->op_begin(), I->op_end()); 4478 continue; 4479 } 4480 4481 // It's legal to sink the instruction if all its uses occur in the 4482 // predicated block. Otherwise, there's nothing to do yet, and we may 4483 // need to reanalyze the instruction. 
4484 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4485 InstsToReanalyze.push_back(I); 4486 continue; 4487 } 4488 4489 // Move the instruction to the beginning of the predicated block, and add 4490 // it's operands to the worklist. 4491 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4492 Worklist.insert(I->op_begin(), I->op_end()); 4493 4494 // The sinking may have enabled other instructions to be sunk, so we will 4495 // need to iterate. 4496 Changed = true; 4497 } 4498 } while (Changed); 4499 } 4500 4501 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4502 for (PHINode *OrigPhi : OrigPHIsToFix) { 4503 VPWidenPHIRecipe *VPPhi = 4504 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4505 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4506 // Make sure the builder has a valid insert point. 4507 Builder.SetInsertPoint(NewPhi); 4508 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4509 VPValue *Inc = VPPhi->getIncomingValue(i); 4510 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4511 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4512 } 4513 } 4514 } 4515 4516 bool InnerLoopVectorizer::useOrderedReductions( 4517 const RecurrenceDescriptor &RdxDesc) { 4518 return Cost->useOrderedReductions(RdxDesc); 4519 } 4520 4521 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4522 VPWidenPHIRecipe *PhiR, 4523 VPTransformState &State) { 4524 PHINode *P = cast<PHINode>(PN); 4525 if (EnableVPlanNativePath) { 4526 // Currently we enter here in the VPlan-native path for non-induction 4527 // PHIs where all control flow is uniform. We simply widen these PHIs. 4528 // Create a vector phi with no operands - the vector phi operands will be 4529 // set at the end of vector code generation. 4530 Type *VecTy = (State.VF.isScalar()) 4531 ? PN->getType() 4532 : VectorType::get(PN->getType(), State.VF); 4533 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4534 State.set(PhiR, VecPhi, 0); 4535 OrigPHIsToFix.push_back(P); 4536 4537 return; 4538 } 4539 4540 assert(PN->getParent() == OrigLoop->getHeader() && 4541 "Non-header phis should have been handled elsewhere"); 4542 4543 // In order to support recurrences we need to be able to vectorize Phi nodes. 4544 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4545 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4546 // this value when we vectorize all of the instructions that use the PHI. 4547 4548 assert(!Legal->isReductionVariable(P) && 4549 "reductions should be handled elsewhere"); 4550 4551 setDebugLocFromInst(P); 4552 4553 // This PHINode must be an induction variable. 4554 // Make sure that we know about it. 4555 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4556 4557 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4558 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4559 4560 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4561 // which can be found from the original scalar operations. 4562 switch (II.getKind()) { 4563 case InductionDescriptor::IK_NoInduction: 4564 llvm_unreachable("Unknown induction"); 4565 case InductionDescriptor::IK_IntInduction: 4566 case InductionDescriptor::IK_FpInduction: 4567 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4568 case InductionDescriptor::IK_PtrInduction: { 4569 // Handle the pointer induction variable case. 
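    // Two strategies are used below (descriptive note): if the pointer
    // induction is scalar after vectorization, emit one "next.gep" per
    // part/lane; otherwise build a single pointer phi advanced by
    // VF * UF * Step per vector iteration and derive per-part vector GEPs
    // from it.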
4570 assert(P->getType()->isPointerTy() && "Unexpected type."); 4571 4572 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4573 // This is the normalized GEP that starts counting at zero. 4574 Value *PtrInd = 4575 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4576 // Determine the number of scalars we need to generate for each unroll 4577 // iteration. If the instruction is uniform, we only need to generate the 4578 // first lane. Otherwise, we generate all VF values. 4579 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4580 assert((IsUniform || !State.VF.isScalable()) && 4581 "Cannot scalarize a scalable VF"); 4582 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 4583 4584 for (unsigned Part = 0; Part < UF; ++Part) { 4585 Value *PartStart = 4586 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4587 4588 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4589 Value *Idx = Builder.CreateAdd( 4590 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4591 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4592 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), 4593 DL, II, State.CFG.PrevBB); 4594 SclrGep->setName("next.gep"); 4595 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4596 } 4597 } 4598 return; 4599 } 4600 assert(isa<SCEVConstant>(II.getStep()) && 4601 "Induction step not a SCEV constant!"); 4602 Type *PhiType = II.getStep()->getType(); 4603 4604 // Build a pointer phi 4605 Value *ScalarStartValue = II.getStartValue(); 4606 Type *ScStValueType = ScalarStartValue->getType(); 4607 PHINode *NewPointerPhi = 4608 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4609 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4610 4611 // A pointer induction, performed by using a gep 4612 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4613 Instruction *InductionLoc = LoopLatch->getTerminator(); 4614 const SCEV *ScalarStep = II.getStep(); 4615 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4616 Value *ScalarStepValue = 4617 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4618 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4619 Value *NumUnrolledElems = 4620 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4621 Value *InductionGEP = GetElementPtrInst::Create( 4622 II.getElementType(), NewPointerPhi, 4623 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4624 InductionLoc); 4625 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4626 4627 // Create UF many actual address geps that use the pointer 4628 // phi as base and a vectorized version of the step value 4629 // (<step*0, ..., step*N>) as offset. 4630 for (unsigned Part = 0; Part < State.UF; ++Part) { 4631 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4632 Value *StartOffsetScalar = 4633 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4634 Value *StartOffset = 4635 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4636 // Create a vector of consecutive numbers from zero to VF. 
4637 StartOffset = 4638 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4639 4640 Value *GEP = Builder.CreateGEP( 4641 II.getElementType(), NewPointerPhi, 4642 Builder.CreateMul( 4643 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4644 "vector.gep")); 4645 State.set(PhiR, GEP, Part); 4646 } 4647 } 4648 } 4649 } 4650 4651 /// A helper function for checking whether an integer division-related 4652 /// instruction may divide by zero (in which case it must be predicated if 4653 /// executed conditionally in the scalar code). 4654 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4655 /// Non-zero divisors that are non compile-time constants will not be 4656 /// converted into multiplication, so we will still end up scalarizing 4657 /// the division, but can do so w/o predication. 4658 static bool mayDivideByZero(Instruction &I) { 4659 assert((I.getOpcode() == Instruction::UDiv || 4660 I.getOpcode() == Instruction::SDiv || 4661 I.getOpcode() == Instruction::URem || 4662 I.getOpcode() == Instruction::SRem) && 4663 "Unexpected instruction"); 4664 Value *Divisor = I.getOperand(1); 4665 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4666 return !CInt || CInt->isZero(); 4667 } 4668 4669 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4670 VPUser &ArgOperands, 4671 VPTransformState &State) { 4672 assert(!isa<DbgInfoIntrinsic>(I) && 4673 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4674 setDebugLocFromInst(&I); 4675 4676 Module *M = I.getParent()->getParent()->getParent(); 4677 auto *CI = cast<CallInst>(&I); 4678 4679 SmallVector<Type *, 4> Tys; 4680 for (Value *ArgOperand : CI->args()) 4681 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4682 4683 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4684 4685 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4686 // version of the instruction. 4687 // Is it beneficial to perform intrinsic call compared to lib call? 4688 bool NeedToScalarize = false; 4689 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4690 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4691 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4692 assert((UseVectorIntrinsic || !NeedToScalarize) && 4693 "Instruction should be scalarized elsewhere."); 4694 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4695 "Either the intrinsic cost or vector call cost must be valid"); 4696 4697 for (unsigned Part = 0; Part < UF; ++Part) { 4698 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4699 SmallVector<Value *, 4> Args; 4700 for (auto &I : enumerate(ArgOperands.operands())) { 4701 // Some intrinsics have a scalar argument - don't replace it with a 4702 // vector. 4703 Value *Arg; 4704 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4705 Arg = State.get(I.value(), Part); 4706 else { 4707 Arg = State.get(I.value(), VPIteration(0, 0)); 4708 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4709 TysForDecl.push_back(Arg->getType()); 4710 } 4711 Args.push_back(Arg); 4712 } 4713 4714 Function *VectorF; 4715 if (UseVectorIntrinsic) { 4716 // Use vector version of the intrinsic. 
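      // E.g. a scalar call to llvm.fmuladd.f32 becomes, for VF = 4, a call to
      // llvm.fmuladd.v4f32 (a sketch; the exact mangled name depends on the
      // overloaded types collected in TysForDecl).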
4717 if (VF.isVector()) 4718 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4719 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4720 assert(VectorF && "Can't retrieve vector intrinsic."); 4721 } else { 4722 // Use vector version of the function call. 4723 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4724 #ifndef NDEBUG 4725 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4726 "Can't create vector function."); 4727 #endif 4728 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4729 } 4730 SmallVector<OperandBundleDef, 1> OpBundles; 4731 CI->getOperandBundlesAsDefs(OpBundles); 4732 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4733 4734 if (isa<FPMathOperator>(V)) 4735 V->copyFastMathFlags(CI); 4736 4737 State.set(Def, V, Part); 4738 addMetadata(V, &I); 4739 } 4740 } 4741 4742 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4743 // We should not collect Scalars more than once per VF. Right now, this 4744 // function is called from collectUniformsAndScalars(), which already does 4745 // this check. Collecting Scalars for VF=1 does not make any sense. 4746 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4747 "This function should not be visited twice for the same VF"); 4748 4749 SmallSetVector<Instruction *, 8> Worklist; 4750 4751 // These sets are used to seed the analysis with pointers used by memory 4752 // accesses that will remain scalar. 4753 SmallSetVector<Instruction *, 8> ScalarPtrs; 4754 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4755 auto *Latch = TheLoop->getLoopLatch(); 4756 4757 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4758 // The pointer operands of loads and stores will be scalar as long as the 4759 // memory access is not a gather or scatter operation. The value operand of a 4760 // store will remain scalar if the store is scalarized. 4761 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4762 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4763 assert(WideningDecision != CM_Unknown && 4764 "Widening decision should be ready at this moment"); 4765 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4766 if (Ptr == Store->getValueOperand()) 4767 return WideningDecision == CM_Scalarize; 4768 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4769 "Ptr is neither a value or pointer operand"); 4770 return WideningDecision != CM_GatherScatter; 4771 }; 4772 4773 // A helper that returns true if the given value is a bitcast or 4774 // getelementptr instruction contained in the loop. 4775 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4776 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4777 isa<GetElementPtrInst>(V)) && 4778 !TheLoop->isLoopInvariant(V); 4779 }; 4780 4781 // A helper that evaluates a memory access's use of a pointer. If the use will 4782 // be a scalar use and the pointer is only used by memory accesses, we place 4783 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4784 // PossibleNonScalarPtrs. 4785 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4786 // We only care about bitcast and getelementptr instructions contained in 4787 // the loop. 4788 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4789 return; 4790 4791 // If the pointer has already been identified as scalar (e.g., if it was 4792 // also identified as uniform), there's nothing to do. 
4793     auto *I = cast<Instruction>(Ptr);
4794     if (Worklist.count(I))
4795       return;
4796
4797     // If the use of the pointer will be a scalar use, and all users of the
4798     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4799     // place the pointer in PossibleNonScalarPtrs.
4800     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4801           return isa<LoadInst>(U) || isa<StoreInst>(U);
4802         }))
4803       ScalarPtrs.insert(I);
4804     else
4805       PossibleNonScalarPtrs.insert(I);
4806   };
4807
4808   // We seed the scalars analysis with two classes of instructions: (1)
4809   // instructions marked uniform-after-vectorization and (2) bitcast,
4810   // getelementptr and (pointer) phi instructions used by memory accesses
4811   // requiring a scalar use.
4812   //
4813   // (1) Add to the worklist all instructions that have been identified as
4814   // uniform-after-vectorization.
4815   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4816
4817   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4818   // memory accesses requiring a scalar use. The pointer operands of loads and
4819   // stores will be scalar as long as the memory access is not a gather or
4820   // scatter operation. The value operand of a store will remain scalar if the
4821   // store is scalarized.
4822   for (auto *BB : TheLoop->blocks())
4823     for (auto &I : *BB) {
4824       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4825         evaluatePtrUse(Load, Load->getPointerOperand());
4826       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4827         evaluatePtrUse(Store, Store->getPointerOperand());
4828         evaluatePtrUse(Store, Store->getValueOperand());
4829       }
4830     }
4831   for (auto *I : ScalarPtrs)
4832     if (!PossibleNonScalarPtrs.count(I)) {
4833       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4834       Worklist.insert(I);
4835     }
4836
4837   // Insert the forced scalars.
4838   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4839   // induction variable when the PHI user is scalarized.
4840   auto ForcedScalar = ForcedScalars.find(VF);
4841   if (ForcedScalar != ForcedScalars.end())
4842     for (auto *I : ForcedScalar->second)
4843       Worklist.insert(I);
4844
4845   // Expand the worklist by looking through any bitcasts and getelementptr
4846   // instructions we've already identified as scalar. This is similar to the
4847   // expansion step in collectLoopUniforms(); however, here we're only
4848   // expanding to include additional bitcasts and getelementptr instructions.
4849   unsigned Idx = 0;
4850   while (Idx != Worklist.size()) {
4851     Instruction *Dst = Worklist[Idx++];
4852     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4853       continue;
4854     auto *Src = cast<Instruction>(Dst->getOperand(0));
4855     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4856           auto *J = cast<Instruction>(U);
4857           return !TheLoop->contains(J) || Worklist.count(J) ||
4858                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4859                   isScalarUse(J, Src));
4860         })) {
4861       Worklist.insert(Src);
4862       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4863     }
4864   }
4865
4866   // An induction variable will remain scalar if all users of the induction
4867   // variable and induction variable update remain scalar.
4868   for (auto &Induction : Legal->getInductionVars()) {
4869     auto *Ind = Induction.first;
4870     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4871
4872     // If tail-folding is applied, the primary induction variable will be used
4873     // to feed a vector compare.
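    // (That compare produces the header mask, so the induction variable must
    // not be treated as scalar and is skipped here.)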
4874 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4875 continue; 4876 4877 // Returns true if \p Indvar is a pointer induction that is used directly by 4878 // load/store instruction \p I. 4879 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4880 Instruction *I) { 4881 return Induction.second.getKind() == 4882 InductionDescriptor::IK_PtrInduction && 4883 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4884 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4885 }; 4886 4887 // Determine if all users of the induction variable are scalar after 4888 // vectorization. 4889 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4890 auto *I = cast<Instruction>(U); 4891 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4892 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4893 }); 4894 if (!ScalarInd) 4895 continue; 4896 4897 // Determine if all users of the induction variable update instruction are 4898 // scalar after vectorization. 4899 auto ScalarIndUpdate = 4900 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4901 auto *I = cast<Instruction>(U); 4902 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4903 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4904 }); 4905 if (!ScalarIndUpdate) 4906 continue; 4907 4908 // The induction variable and its update instruction will remain scalar. 4909 Worklist.insert(Ind); 4910 Worklist.insert(IndUpdate); 4911 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4912 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4913 << "\n"); 4914 } 4915 4916 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4917 } 4918 4919 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 4920 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4921 return false; 4922 switch(I->getOpcode()) { 4923 default: 4924 break; 4925 case Instruction::Load: 4926 case Instruction::Store: { 4927 if (!Legal->isMaskRequired(I)) 4928 return false; 4929 auto *Ptr = getLoadStorePointerOperand(I); 4930 auto *Ty = getLoadStoreType(I); 4931 const Align Alignment = getLoadStoreAlignment(I); 4932 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4933 TTI.isLegalMaskedGather(Ty, Alignment)) 4934 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4935 TTI.isLegalMaskedScatter(Ty, Alignment)); 4936 } 4937 case Instruction::UDiv: 4938 case Instruction::SDiv: 4939 case Instruction::SRem: 4940 case Instruction::URem: 4941 return mayDivideByZero(*I); 4942 } 4943 return false; 4944 } 4945 4946 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4947 Instruction *I, ElementCount VF) { 4948 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4949 assert(getWideningDecision(I, VF) == CM_Unknown && 4950 "Decision should not be set yet."); 4951 auto *Group = getInterleavedAccessGroup(I); 4952 assert(Group && "Must have a group."); 4953 4954 // If the instruction's allocated size doesn't equal it's type size, it 4955 // requires padding and will be scalarized. 4956 auto &DL = I->getModule()->getDataLayout(); 4957 auto *ScalarTy = getLoadStoreType(I); 4958 if (hasIrregularType(ScalarTy, DL)) 4959 return false; 4960 4961 // Check if masking is required. 
4962   // A Group may need masking for one of two reasons: it resides in a block that
4963   // needs predication, or it was decided to use masking to deal with gaps
4964   // (either a gap at the end of a load-access that may result in a speculative
4965   // load, or any gaps in a store-access).
4966   bool PredicatedAccessRequiresMasking =
4967       blockNeedsPredicationForAnyReason(I->getParent()) &&
4968       Legal->isMaskRequired(I);
4969   bool LoadAccessWithGapsRequiresEpilogMasking =
4970       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4971       !isScalarEpilogueAllowed();
4972   bool StoreAccessWithGapsRequiresMasking =
4973       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4974   if (!PredicatedAccessRequiresMasking &&
4975       !LoadAccessWithGapsRequiresEpilogMasking &&
4976       !StoreAccessWithGapsRequiresMasking)
4977     return true;
4978
4979   // If masked interleaving is required, we expect that the user/target has
4980   // enabled it, because otherwise it either wouldn't have been created or
4981   // it should have been invalidated by the CostModel.
4982   assert(useMaskedInterleavedAccesses(TTI) &&
4983          "Masked interleave-groups for predicated accesses are not enabled.");
4984
4985   if (Group->isReverse())
4986     return false;
4987
4988   auto *Ty = getLoadStoreType(I);
4989   const Align Alignment = getLoadStoreAlignment(I);
4990   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4991                           : TTI.isLegalMaskedStore(Ty, Alignment);
4992 }
4993
4994 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4995     Instruction *I, ElementCount VF) {
4996   // Get and ensure we have a valid memory instruction.
4997   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4998
4999   auto *Ptr = getLoadStorePointerOperand(I);
5000   auto *ScalarTy = getLoadStoreType(I);
5001
5002   // First of all, in order to be widened, the pointer must be consecutive.
5003   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
5004     return false;
5005
5006   // If the instruction is a store located in a predicated block, it will be
5007   // scalarized.
5008   if (isScalarWithPredication(I))
5009     return false;
5010
5011   // If the instruction's allocated size doesn't equal its type size, it
5012   // requires padding and will be scalarized.
5013   auto &DL = I->getModule()->getDataLayout();
5014   if (hasIrregularType(ScalarTy, DL))
5015     return false;
5016
5017   return true;
5018 }
5019
5020 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5021   // We should not collect Uniforms more than once per VF. Right now,
5022   // this function is called from collectUniformsAndScalars(), which
5023   // already does this check. Collecting Uniforms for VF=1 does not make any
5024   // sense.
5025
5026   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5027          "This function should not be visited twice for the same VF");
5028
5029   // Create the entry for this VF up front; even if no uniform value is found,
5030   // we will not analyze it again, as Uniforms.count(VF) will return 1.
5031   Uniforms[VF].clear();
5032
5033   // We now know that the loop is vectorizable!
5034   // Collect instructions inside the loop that will remain uniform after
5035   // vectorization.
5036
5037   // Global values, params and instructions outside of the current loop are out
5038   // of scope.
5039   auto isOutOfScope = [&](Value *V) -> bool {
5040     Instruction *I = dyn_cast<Instruction>(V);
5041     return (!I || !TheLoop->contains(I));
5042   };
5043
5044   // Worklist containing uniform instructions demanding lane 0.
5045 SetVector<Instruction *> Worklist; 5046 BasicBlock *Latch = TheLoop->getLoopLatch(); 5047 5048 // Add uniform instructions demanding lane 0 to the worklist. Instructions 5049 // that are scalar with predication must not be considered uniform after 5050 // vectorization, because that would create an erroneous replicating region 5051 // where only a single instance out of VF should be formed. 5052 // TODO: optimize such seldom cases if found important, see PR40816. 5053 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5054 if (isOutOfScope(I)) { 5055 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5056 << *I << "\n"); 5057 return; 5058 } 5059 if (isScalarWithPredication(I)) { 5060 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5061 << *I << "\n"); 5062 return; 5063 } 5064 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5065 Worklist.insert(I); 5066 }; 5067 5068 // Start with the conditional branch. If the branch condition is an 5069 // instruction contained in the loop that is only used by the branch, it is 5070 // uniform. 5071 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5072 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5073 addToWorklistIfAllowed(Cmp); 5074 5075 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5076 InstWidening WideningDecision = getWideningDecision(I, VF); 5077 assert(WideningDecision != CM_Unknown && 5078 "Widening decision should be ready at this moment"); 5079 5080 // A uniform memory op is itself uniform. We exclude uniform stores 5081 // here as they demand the last lane, not the first one. 5082 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5083 assert(WideningDecision == CM_Scalarize); 5084 return true; 5085 } 5086 5087 return (WideningDecision == CM_Widen || 5088 WideningDecision == CM_Widen_Reverse || 5089 WideningDecision == CM_Interleave); 5090 }; 5091 5092 5093 // Returns true if Ptr is the pointer operand of a memory access instruction 5094 // I, and I is known to not require scalarization. 5095 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5096 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5097 }; 5098 5099 // Holds a list of values which are known to have at least one uniform use. 5100 // Note that there may be other uses which aren't uniform. A "uniform use" 5101 // here is something which only demands lane 0 of the unrolled iterations; 5102 // it does not imply that all lanes produce the same value (e.g. this is not 5103 // the usual meaning of uniform) 5104 SetVector<Value *> HasUniformUse; 5105 5106 // Scan the loop for instructions which are either a) known to have only 5107 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5108 for (auto *BB : TheLoop->blocks()) 5109 for (auto &I : *BB) { 5110 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5111 switch (II->getIntrinsicID()) { 5112 case Intrinsic::sideeffect: 5113 case Intrinsic::experimental_noalias_scope_decl: 5114 case Intrinsic::assume: 5115 case Intrinsic::lifetime_start: 5116 case Intrinsic::lifetime_end: 5117 if (TheLoop->hasLoopInvariantOperands(&I)) 5118 addToWorklistIfAllowed(&I); 5119 break; 5120 default: 5121 break; 5122 } 5123 } 5124 5125 // ExtractValue instructions must be uniform, because the operands are 5126 // known to be loop-invariant. 
5127 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5128 assert(isOutOfScope(EVI->getAggregateOperand()) && 5129 "Expected aggregate value to be loop invariant"); 5130 addToWorklistIfAllowed(EVI); 5131 continue; 5132 } 5133 5134 // If there's no pointer operand, there's nothing to do. 5135 auto *Ptr = getLoadStorePointerOperand(&I); 5136 if (!Ptr) 5137 continue; 5138 5139 // A uniform memory op is itself uniform. We exclude uniform stores 5140 // here as they demand the last lane, not the first one. 5141 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5142 addToWorklistIfAllowed(&I); 5143 5144 if (isUniformDecision(&I, VF)) { 5145 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5146 HasUniformUse.insert(Ptr); 5147 } 5148 } 5149 5150 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5151 // demanding) users. Since loops are assumed to be in LCSSA form, this 5152 // disallows uses outside the loop as well. 5153 for (auto *V : HasUniformUse) { 5154 if (isOutOfScope(V)) 5155 continue; 5156 auto *I = cast<Instruction>(V); 5157 auto UsersAreMemAccesses = 5158 llvm::all_of(I->users(), [&](User *U) -> bool { 5159 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5160 }); 5161 if (UsersAreMemAccesses) 5162 addToWorklistIfAllowed(I); 5163 } 5164 5165 // Expand Worklist in topological order: whenever a new instruction 5166 // is added , its users should be already inside Worklist. It ensures 5167 // a uniform instruction will only be used by uniform instructions. 5168 unsigned idx = 0; 5169 while (idx != Worklist.size()) { 5170 Instruction *I = Worklist[idx++]; 5171 5172 for (auto OV : I->operand_values()) { 5173 // isOutOfScope operands cannot be uniform instructions. 5174 if (isOutOfScope(OV)) 5175 continue; 5176 // First order recurrence Phi's should typically be considered 5177 // non-uniform. 5178 auto *OP = dyn_cast<PHINode>(OV); 5179 if (OP && Legal->isFirstOrderRecurrence(OP)) 5180 continue; 5181 // If all the users of the operand are uniform, then add the 5182 // operand into the uniform worklist. 5183 auto *OI = cast<Instruction>(OV); 5184 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5185 auto *J = cast<Instruction>(U); 5186 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5187 })) 5188 addToWorklistIfAllowed(OI); 5189 } 5190 } 5191 5192 // For an instruction to be added into Worklist above, all its users inside 5193 // the loop should also be in Worklist. However, this condition cannot be 5194 // true for phi nodes that form a cyclic dependence. We must process phi 5195 // nodes separately. An induction variable will remain uniform if all users 5196 // of the induction variable and induction variable update remain uniform. 5197 // The code below handles both pointer and non-pointer induction variables. 5198 for (auto &Induction : Legal->getInductionVars()) { 5199 auto *Ind = Induction.first; 5200 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5201 5202 // Determine if all users of the induction variable are uniform after 5203 // vectorization. 5204 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5205 auto *I = cast<Instruction>(U); 5206 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5207 isVectorizedMemAccessUse(I, Ind); 5208 }); 5209 if (!UniformInd) 5210 continue; 5211 5212 // Determine if all users of the induction variable update instruction are 5213 // uniform after vectorization. 
5214 auto UniformIndUpdate = 5215 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5216 auto *I = cast<Instruction>(U); 5217 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5218 isVectorizedMemAccessUse(I, IndUpdate); 5219 }); 5220 if (!UniformIndUpdate) 5221 continue; 5222 5223 // The induction variable and its update instruction will remain uniform. 5224 addToWorklistIfAllowed(Ind); 5225 addToWorklistIfAllowed(IndUpdate); 5226 } 5227 5228 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5229 } 5230 5231 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5232 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5233 5234 if (Legal->getRuntimePointerChecking()->Need) { 5235 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5236 "runtime pointer checks needed. Enable vectorization of this " 5237 "loop with '#pragma clang loop vectorize(enable)' when " 5238 "compiling with -Os/-Oz", 5239 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5240 return true; 5241 } 5242 5243 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5244 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5245 "runtime SCEV checks needed. Enable vectorization of this " 5246 "loop with '#pragma clang loop vectorize(enable)' when " 5247 "compiling with -Os/-Oz", 5248 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5249 return true; 5250 } 5251 5252 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5253 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5254 reportVectorizationFailure("Runtime stride check for small trip count", 5255 "runtime stride == 1 checks needed. Enable vectorization of " 5256 "this loop without such check by compiling with -Os/-Oz", 5257 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5258 return true; 5259 } 5260 5261 return false; 5262 } 5263 5264 ElementCount 5265 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5266 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5267 return ElementCount::getScalable(0); 5268 5269 if (Hints->isScalableVectorizationDisabled()) { 5270 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5271 "ScalableVectorizationDisabled", ORE, TheLoop); 5272 return ElementCount::getScalable(0); 5273 } 5274 5275 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5276 5277 auto MaxScalableVF = ElementCount::getScalable( 5278 std::numeric_limits<ElementCount::ScalarTy>::max()); 5279 5280 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5281 // FIXME: While for scalable vectors this is currently sufficient, this should 5282 // be replaced by a more detailed mechanism that filters out specific VFs, 5283 // instead of invalidating vectorization for a whole set of VFs based on the 5284 // MaxVF. 5285 5286 // Disable scalable vectorization if the loop contains unsupported reductions. 5287 if (!canVectorizeReductions(MaxScalableVF)) { 5288 reportVectorizationInfo( 5289 "Scalable vectorization not supported for the reduction " 5290 "operations found in this loop.", 5291 "ScalableVFUnfeasible", ORE, TheLoop); 5292 return ElementCount::getScalable(0); 5293 } 5294 5295 // Disable scalable vectorization if the loop contains any instructions 5296 // with element types not supported for scalable vectors. 
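  // (For example, a target might support i32 and float elements in scalable
  // vectors but not i128 or fp128; this is a target query, so the exact set
  // varies.)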
5297 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5298 return !Ty->isVoidTy() && 5299 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5300 })) { 5301 reportVectorizationInfo("Scalable vectorization is not supported " 5302 "for all element types found in this loop.", 5303 "ScalableVFUnfeasible", ORE, TheLoop); 5304 return ElementCount::getScalable(0); 5305 } 5306 5307 if (Legal->isSafeForAnyVectorWidth()) 5308 return MaxScalableVF; 5309 5310 // Limit MaxScalableVF by the maximum safe dependence distance. 5311 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5312 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5313 MaxVScale = 5314 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5315 MaxScalableVF = ElementCount::getScalable( 5316 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5317 if (!MaxScalableVF) 5318 reportVectorizationInfo( 5319 "Max legal vector width too small, scalable vectorization " 5320 "unfeasible.", 5321 "ScalableVFUnfeasible", ORE, TheLoop); 5322 5323 return MaxScalableVF; 5324 } 5325 5326 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5327 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5328 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5329 unsigned SmallestType, WidestType; 5330 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5331 5332 // Get the maximum safe dependence distance in bits computed by LAA. 5333 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5334 // the memory accesses that is most restrictive (involved in the smallest 5335 // dependence distance). 5336 unsigned MaxSafeElements = 5337 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5338 5339 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5340 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5341 5342 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5343 << ".\n"); 5344 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5345 << ".\n"); 5346 5347 // First analyze the UserVF, fall back if the UserVF should be ignored. 5348 if (UserVF) { 5349 auto MaxSafeUserVF = 5350 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5351 5352 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5353 // If `VF=vscale x N` is safe, then so is `VF=N` 5354 if (UserVF.isScalable()) 5355 return FixedScalableVFPair( 5356 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5357 else 5358 return UserVF; 5359 } 5360 5361 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5362 5363 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5364 // is better to ignore the hint and let the compiler choose a suitable VF. 
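// Illustrative example: with '#pragma clang loop vectorize_width(16)' on a
// loop whose MaxSafeFixedVF is 8, the fixed-width hint is clamped to 8 by
// the branch below, whereas a scalable hint such as
// 'vectorize_width(16, scalable)' that exceeds MaxSafeScalableVF is dropped
// entirely and the cost model picks the VF itself.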
5365 if (!UserVF.isScalable()) { 5366 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5367 << " is unsafe, clamping to max safe VF=" 5368 << MaxSafeFixedVF << ".\n"); 5369 ORE->emit([&]() { 5370 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5371 TheLoop->getStartLoc(), 5372 TheLoop->getHeader()) 5373 << "User-specified vectorization factor " 5374 << ore::NV("UserVectorizationFactor", UserVF) 5375 << " is unsafe, clamping to maximum safe vectorization factor " 5376 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5377 }); 5378 return MaxSafeFixedVF; 5379 } 5380 5381 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5382 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5383 << " is ignored because scalable vectors are not " 5384 "available.\n"); 5385 ORE->emit([&]() { 5386 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5387 TheLoop->getStartLoc(), 5388 TheLoop->getHeader()) 5389 << "User-specified vectorization factor " 5390 << ore::NV("UserVectorizationFactor", UserVF) 5391 << " is ignored because the target does not support scalable " 5392 "vectors. The compiler will pick a more suitable value."; 5393 }); 5394 } else { 5395 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5396 << " is unsafe. Ignoring scalable UserVF.\n"); 5397 ORE->emit([&]() { 5398 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5399 TheLoop->getStartLoc(), 5400 TheLoop->getHeader()) 5401 << "User-specified vectorization factor " 5402 << ore::NV("UserVectorizationFactor", UserVF) 5403 << " is unsafe. Ignoring the hint to let the compiler pick a " 5404 "more suitable value."; 5405 }); 5406 } 5407 } 5408 5409 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5410 << " / " << WidestType << " bits.\n"); 5411 5412 FixedScalableVFPair Result(ElementCount::getFixed(1), 5413 ElementCount::getScalable(0)); 5414 if (auto MaxVF = 5415 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5416 MaxSafeFixedVF, FoldTailByMasking)) 5417 Result.FixedVF = MaxVF; 5418 5419 if (auto MaxVF = 5420 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5421 MaxSafeScalableVF, FoldTailByMasking)) 5422 if (MaxVF.isScalable()) { 5423 Result.ScalableVF = MaxVF; 5424 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5425 << "\n"); 5426 } 5427 5428 return Result; 5429 } 5430 5431 FixedScalableVFPair 5432 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5433 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5434 // TODO: It may by useful to do since it's still likely to be dynamically 5435 // uniform if the target can skip. 5436 reportVectorizationFailure( 5437 "Not inserting runtime ptr check for divergent target", 5438 "runtime pointer checks needed. 
Not enabled for divergent target",
5439 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5440 return FixedScalableVFPair::getNone();
5441 }
5442
5443 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5444 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5445 if (TC == 1) {
5446 reportVectorizationFailure("Single iteration (non) loop",
5447 "loop trip count is one, irrelevant for vectorization",
5448 "SingleIterationLoop", ORE, TheLoop);
5449 return FixedScalableVFPair::getNone();
5450 }
5451
5452 switch (ScalarEpilogueStatus) {
5453 case CM_ScalarEpilogueAllowed:
5454 return computeFeasibleMaxVF(TC, UserVF, false);
5455 case CM_ScalarEpilogueNotAllowedUsePredicate:
5456 LLVM_FALLTHROUGH;
5457 case CM_ScalarEpilogueNotNeededUsePredicate:
5458 LLVM_DEBUG(
5459 dbgs() << "LV: vector predicate hint/switch found.\n"
5460 << "LV: Not allowing scalar epilogue, creating predicated "
5461 << "vector loop.\n");
5462 break;
5463 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5464 // fallthrough as a special case of OptForSize
5465 case CM_ScalarEpilogueNotAllowedOptSize:
5466 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5467 LLVM_DEBUG(
5468 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5469 else
5470 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5471 << "count.\n");
5472
5473 // Bail if runtime checks are required, which are not good when optimising
5474 // for size.
5475 if (runtimeChecksRequired())
5476 return FixedScalableVFPair::getNone();
5477
5478 break;
5479 }
5480
5481 // The only loops we can vectorize without a scalar epilogue are loops with
5482 // a bottom-test and a single exiting block. We'd have to handle the fact
5483 // that not every instruction executes on the last iteration. This will
5484 // require a lane mask which varies through the vector loop body. (TODO)
5485 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5486 // If there was a tail-folding hint/switch, but we can't fold the tail by
5487 // masking, fallback to a vectorization with a scalar epilogue.
5488 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5489 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5490 "scalar epilogue instead.\n");
5491 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5492 return computeFeasibleMaxVF(TC, UserVF, false);
5493 }
5494 return FixedScalableVFPair::getNone();
5495 }
5496
5497 // Now try the tail folding.
5498
5499 // Invalidate interleave groups that require an epilogue if we can't mask
5500 // the interleave-group.
5501 if (!useMaskedInterleavedAccesses(TTI)) {
5502 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5503 "No decisions should have been taken at this point");
5504 // Note: There is no need to invalidate any cost modeling decisions here, as
5505 // none were taken so far.
5506 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5507 }
5508
5509 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5510 // Avoid tail folding if the trip count is known to be a multiple of any VF
5511 // we chose.
5512 // FIXME: The condition below pessimises the case for fixed-width vectors,
5513 // when scalable VFs are also candidates for vectorization.
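// Worked example (illustrative numbers): if the exit count is known to be
// 64, the max fixed VF is 8 and the user requested IC=2 (and there is no
// scalable VF candidate, per the condition below), then 64 % (8 * 2) == 0,
// so no scalar tail is needed and MaxFactors is returned without folding
// the tail by masking.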
5514 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5515 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5516 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5517 "MaxFixedVF must be a power of 2"); 5518 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5519 : MaxFixedVF.getFixedValue(); 5520 ScalarEvolution *SE = PSE.getSE(); 5521 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5522 const SCEV *ExitCount = SE->getAddExpr( 5523 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5524 const SCEV *Rem = SE->getURemExpr( 5525 SE->applyLoopGuards(ExitCount, TheLoop), 5526 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5527 if (Rem->isZero()) { 5528 // Accept MaxFixedVF if we do not have a tail. 5529 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5530 return MaxFactors; 5531 } 5532 } 5533 5534 // For scalable vectors, don't use tail folding as this is currently not yet 5535 // supported. The code is likely to have ended up here if the tripcount is 5536 // low, in which case it makes sense not to use scalable vectors. 5537 if (MaxFactors.ScalableVF.isVector()) 5538 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5539 5540 // If we don't know the precise trip count, or if the trip count that we 5541 // found modulo the vectorization factor is not zero, try to fold the tail 5542 // by masking. 5543 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5544 if (Legal->prepareToFoldTailByMasking()) { 5545 FoldTailByMasking = true; 5546 return MaxFactors; 5547 } 5548 5549 // If there was a tail-folding hint/switch, but we can't fold the tail by 5550 // masking, fallback to a vectorization with a scalar epilogue. 5551 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5552 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5553 "scalar epilogue instead.\n"); 5554 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5555 return MaxFactors; 5556 } 5557 5558 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5559 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5560 return FixedScalableVFPair::getNone(); 5561 } 5562 5563 if (TC == 0) { 5564 reportVectorizationFailure( 5565 "Unable to calculate the loop count due to complex control flow", 5566 "unable to calculate the loop count due to complex control flow", 5567 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5568 return FixedScalableVFPair::getNone(); 5569 } 5570 5571 reportVectorizationFailure( 5572 "Cannot optimize for size and vectorize at the same time.", 5573 "cannot optimize for size and vectorize at the same time. " 5574 "Enable vectorization of this loop with '#pragma clang loop " 5575 "vectorize(enable)' when compiling with -Os/-Oz", 5576 "NoTailLoopWithOptForSize", ORE, TheLoop); 5577 return FixedScalableVFPair::getNone(); 5578 } 5579 5580 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5581 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5582 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5583 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5584 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5585 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5586 : TargetTransformInfo::RGK_FixedWidthVector); 5587 5588 // Convenience function to return the minimum of two ElementCounts. 
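// (For example, MinVF(ElementCount::getFixed(4), ElementCount::getFixed(8))
// yields the fixed count 4; mixing a fixed and a scalable count is not
// allowed and would trip the assertion in the helper below.)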
5589 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5590 assert((LHS.isScalable() == RHS.isScalable()) && 5591 "Scalable flags must match"); 5592 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5593 }; 5594 5595 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5596 // Note that both WidestRegister and WidestType may not be a powers of 2. 5597 auto MaxVectorElementCount = ElementCount::get( 5598 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5599 ComputeScalableMaxVF); 5600 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5601 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5602 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5603 5604 if (!MaxVectorElementCount) { 5605 LLVM_DEBUG(dbgs() << "LV: The target has no " 5606 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5607 << " vector registers.\n"); 5608 return ElementCount::getFixed(1); 5609 } 5610 5611 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5612 if (ConstTripCount && 5613 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5614 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5615 // If loop trip count (TC) is known at compile time there is no point in 5616 // choosing VF greater than TC (as done in the loop below). Select maximum 5617 // power of two which doesn't exceed TC. 5618 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5619 // when the TC is less than or equal to the known number of lanes. 5620 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5621 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5622 "exceeding the constant trip count: " 5623 << ClampedConstTripCount << "\n"); 5624 return ElementCount::getFixed(ClampedConstTripCount); 5625 } 5626 5627 ElementCount MaxVF = MaxVectorElementCount; 5628 if (TTI.shouldMaximizeVectorBandwidth() || 5629 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5630 auto MaxVectorElementCountMaxBW = ElementCount::get( 5631 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5632 ComputeScalableMaxVF); 5633 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5634 5635 // Collect all viable vectorization factors larger than the default MaxVF 5636 // (i.e. MaxVectorElementCount). 5637 SmallVector<ElementCount, 8> VFs; 5638 for (ElementCount VS = MaxVectorElementCount * 2; 5639 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5640 VFs.push_back(VS); 5641 5642 // For each VF calculate its register usage. 5643 auto RUs = calculateRegisterUsage(VFs); 5644 5645 // Select the largest VF which doesn't require more registers than existing 5646 // ones. 
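// Illustrative example (assumed numbers): if MaxVectorElementCount is 4 and
// maximizing bandwidth added VFs = {8, 16}, and the register-usage estimate
// says VF=16 needs 40 vector registers while VF=8 needs 24 on a target with
// 32, the scan below (from the largest VF downwards) settles on MaxVF=8.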
5647 for (int i = RUs.size() - 1; i >= 0; --i) { 5648 bool Selected = true; 5649 for (auto &pair : RUs[i].MaxLocalUsers) { 5650 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5651 if (pair.second > TargetNumRegisters) 5652 Selected = false; 5653 } 5654 if (Selected) { 5655 MaxVF = VFs[i]; 5656 break; 5657 } 5658 } 5659 if (ElementCount MinVF = 5660 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5661 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5662 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5663 << ") with target's minimum: " << MinVF << '\n'); 5664 MaxVF = MinVF; 5665 } 5666 } 5667 } 5668 return MaxVF; 5669 } 5670 5671 bool LoopVectorizationCostModel::isMoreProfitable( 5672 const VectorizationFactor &A, const VectorizationFactor &B) const { 5673 InstructionCost CostA = A.Cost; 5674 InstructionCost CostB = B.Cost; 5675 5676 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5677 5678 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5679 MaxTripCount) { 5680 // If we are folding the tail and the trip count is a known (possibly small) 5681 // constant, the trip count will be rounded up to an integer number of 5682 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5683 // which we compare directly. When not folding the tail, the total cost will 5684 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5685 // approximated with the per-lane cost below instead of using the tripcount 5686 // as here. 5687 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5688 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5689 return RTCostA < RTCostB; 5690 } 5691 5692 // Improve estimate for the vector width if it is scalable. 5693 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5694 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5695 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 5696 if (A.Width.isScalable()) 5697 EstimatedWidthA *= VScale.getValue(); 5698 if (B.Width.isScalable()) 5699 EstimatedWidthB *= VScale.getValue(); 5700 } 5701 5702 // Assume vscale may be larger than 1 (or the value being tuned for), 5703 // so that scalable vectorization is slightly favorable over fixed-width 5704 // vectorization. 5705 if (A.Width.isScalable() && !B.Width.isScalable()) 5706 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5707 5708 // To avoid the need for FP division: 5709 // (CostA / A.Width) < (CostB / B.Width) 5710 // <=> (CostA * B.Width) < (CostB * A.Width) 5711 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5712 } 5713 5714 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5715 const ElementCountSet &VFCandidates) { 5716 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5717 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5718 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5719 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5720 "Expected Scalar VF to be a candidate"); 5721 5722 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5723 VectorizationFactor ChosenFactor = ScalarCost; 5724 5725 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5726 if (ForceVectorization && VFCandidates.size() > 1) { 5727 // Ignore scalar width, because the user explicitly wants vectorization. 
5728 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5729 // evaluation. 5730 ChosenFactor.Cost = InstructionCost::getMax(); 5731 } 5732 5733 SmallVector<InstructionVFPair> InvalidCosts; 5734 for (const auto &i : VFCandidates) { 5735 // The cost for scalar VF=1 is already calculated, so ignore it. 5736 if (i.isScalar()) 5737 continue; 5738 5739 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5740 VectorizationFactor Candidate(i, C.first); 5741 5742 #ifndef NDEBUG 5743 unsigned AssumedMinimumVscale = 1; 5744 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 5745 AssumedMinimumVscale = VScale.getValue(); 5746 unsigned Width = 5747 Candidate.Width.isScalable() 5748 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5749 : Candidate.Width.getFixedValue(); 5750 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5751 << " costs: " << (Candidate.Cost / Width)); 5752 if (i.isScalable()) 5753 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5754 << AssumedMinimumVscale << ")"); 5755 LLVM_DEBUG(dbgs() << ".\n"); 5756 #endif 5757 5758 if (!C.second && !ForceVectorization) { 5759 LLVM_DEBUG( 5760 dbgs() << "LV: Not considering vector loop of width " << i 5761 << " because it will not generate any vector instructions.\n"); 5762 continue; 5763 } 5764 5765 // If profitable add it to ProfitableVF list. 5766 if (isMoreProfitable(Candidate, ScalarCost)) 5767 ProfitableVFs.push_back(Candidate); 5768 5769 if (isMoreProfitable(Candidate, ChosenFactor)) 5770 ChosenFactor = Candidate; 5771 } 5772 5773 // Emit a report of VFs with invalid costs in the loop. 5774 if (!InvalidCosts.empty()) { 5775 // Group the remarks per instruction, keeping the instruction order from 5776 // InvalidCosts. 5777 std::map<Instruction *, unsigned> Numbering; 5778 unsigned I = 0; 5779 for (auto &Pair : InvalidCosts) 5780 if (!Numbering.count(Pair.first)) 5781 Numbering[Pair.first] = I++; 5782 5783 // Sort the list, first on instruction(number) then on VF. 5784 llvm::sort(InvalidCosts, 5785 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5786 if (Numbering[A.first] != Numbering[B.first]) 5787 return Numbering[A.first] < Numbering[B.first]; 5788 ElementCountComparator ECC; 5789 return ECC(A.second, B.second); 5790 }); 5791 5792 // For a list of ordered instruction-vf pairs: 5793 // [(load, vf1), (load, vf2), (store, vf1)] 5794 // Group the instructions together to emit separate remarks for: 5795 // load (vf1, vf2) 5796 // store (vf1) 5797 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5798 auto Subset = ArrayRef<InstructionVFPair>(); 5799 do { 5800 if (Subset.empty()) 5801 Subset = Tail.take_front(1); 5802 5803 Instruction *I = Subset.front().first; 5804 5805 // If the next instruction is different, or if there are no other pairs, 5806 // emit a remark for the collated subset. e.g. 5807 // [(load, vf1), (load, vf2))] 5808 // to emit: 5809 // remark: invalid costs for 'load' at VF=(vf, vf2) 5810 if (Subset == Tail || Tail[Subset.size()].first != I) { 5811 std::string OutString; 5812 raw_string_ostream OS(OutString); 5813 assert(!Subset.empty() && "Unexpected empty range"); 5814 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5815 for (auto &Pair : Subset) 5816 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5817 << Pair.second; 5818 OS << "):"; 5819 if (auto *CI = dyn_cast<CallInst>(I)) 5820 OS << " call to " << CI->getCalledFunction()->getName(); 5821 else 5822 OS << " " << I->getOpcodeName(); 5823 OS.flush(); 5824 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5825 Tail = Tail.drop_front(Subset.size()); 5826 Subset = {}; 5827 } else 5828 // Grow the subset by one element 5829 Subset = Tail.take_front(Subset.size() + 1); 5830 } while (!Tail.empty()); 5831 } 5832 5833 if (!EnableCondStoresVectorization && NumPredStores) { 5834 reportVectorizationFailure("There are conditional stores.", 5835 "store that is conditionally executed prevents vectorization", 5836 "ConditionalStore", ORE, TheLoop); 5837 ChosenFactor = ScalarCost; 5838 } 5839 5840 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5841 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5842 << "LV: Vectorization seems to be not beneficial, " 5843 << "but was forced by a user.\n"); 5844 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5845 return ChosenFactor; 5846 } 5847 5848 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5849 const Loop &L, ElementCount VF) const { 5850 // Cross iteration phis such as reductions need special handling and are 5851 // currently unsupported. 5852 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5853 return Legal->isFirstOrderRecurrence(&Phi) || 5854 Legal->isReductionVariable(&Phi); 5855 })) 5856 return false; 5857 5858 // Phis with uses outside of the loop require special handling and are 5859 // currently unsupported. 5860 for (auto &Entry : Legal->getInductionVars()) { 5861 // Look for uses of the value of the induction at the last iteration. 5862 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5863 for (User *U : PostInc->users()) 5864 if (!L.contains(cast<Instruction>(U))) 5865 return false; 5866 // Look for uses of penultimate value of the induction. 5867 for (User *U : Entry.first->users()) 5868 if (!L.contains(cast<Instruction>(U))) 5869 return false; 5870 } 5871 5872 // Induction variables that are widened require special handling that is 5873 // currently not supported. 5874 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5875 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5876 this->isProfitableToScalarize(Entry.first, VF)); 5877 })) 5878 return false; 5879 5880 // Epilogue vectorization code has not been auditted to ensure it handles 5881 // non-latch exits properly. It may be fine, but it needs auditted and 5882 // tested. 5883 if (L.getExitingBlock() != L.getLoopLatch()) 5884 return false; 5885 5886 return true; 5887 } 5888 5889 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5890 const ElementCount VF) const { 5891 // FIXME: We need a much better cost-model to take different parameters such 5892 // as register pressure, code size increase and cost of extra branches into 5893 // account. For now we apply a very crude heuristic and only consider loops 5894 // with vectorization factors larger than a certain value. 5895 // We also consider epilogue vectorization unprofitable for targets that don't 5896 // consider interleaving beneficial (eg. MVE). 
5897 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5898 return false; 5899 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5900 return true; 5901 return false; 5902 } 5903 5904 VectorizationFactor 5905 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5906 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5907 VectorizationFactor Result = VectorizationFactor::Disabled(); 5908 if (!EnableEpilogueVectorization) { 5909 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5910 return Result; 5911 } 5912 5913 if (!isScalarEpilogueAllowed()) { 5914 LLVM_DEBUG( 5915 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5916 "allowed.\n";); 5917 return Result; 5918 } 5919 5920 // Not really a cost consideration, but check for unsupported cases here to 5921 // simplify the logic. 5922 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5923 LLVM_DEBUG( 5924 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5925 "not a supported candidate.\n";); 5926 return Result; 5927 } 5928 5929 if (EpilogueVectorizationForceVF > 1) { 5930 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5931 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5932 if (LVP.hasPlanWithVF(ForcedEC)) 5933 return {ForcedEC, 0}; 5934 else { 5935 LLVM_DEBUG( 5936 dbgs() 5937 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5938 return Result; 5939 } 5940 } 5941 5942 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5943 TheLoop->getHeader()->getParent()->hasMinSize()) { 5944 LLVM_DEBUG( 5945 dbgs() 5946 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5947 return Result; 5948 } 5949 5950 auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5951 if (MainLoopVF.isScalable()) 5952 LLVM_DEBUG( 5953 dbgs() << "LEV: Epilogue vectorization using scalable vectors not " 5954 "yet supported. Converting to fixed-width (VF=" 5955 << FixedMainLoopVF << ") instead\n"); 5956 5957 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 5958 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5959 "this loop\n"); 5960 return Result; 5961 } 5962 5963 for (auto &NextVF : ProfitableVFs) 5964 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 5965 (Result.Width.getFixedValue() == 1 || 5966 isMoreProfitable(NextVF, Result)) && 5967 LVP.hasPlanWithVF(NextVF.Width)) 5968 Result = NextVF; 5969 5970 if (Result != VectorizationFactor::Disabled()) 5971 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5972 << Result.Width.getFixedValue() << "\n";); 5973 return Result; 5974 } 5975 5976 std::pair<unsigned, unsigned> 5977 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5978 unsigned MinWidth = -1U; 5979 unsigned MaxWidth = 8; 5980 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5981 for (Type *T : ElementTypesInLoop) { 5982 MinWidth = std::min<unsigned>( 5983 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5984 MaxWidth = std::max<unsigned>( 5985 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5986 } 5987 return {MinWidth, MaxWidth}; 5988 } 5989 5990 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5991 ElementTypesInLoop.clear(); 5992 // For each block. 5993 for (BasicBlock *BB : TheLoop->blocks()) { 5994 // For each instruction in the loop. 
5995 for (Instruction &I : BB->instructionsWithoutDebug()) { 5996 Type *T = I.getType(); 5997 5998 // Skip ignored values. 5999 if (ValuesToIgnore.count(&I)) 6000 continue; 6001 6002 // Only examine Loads, Stores and PHINodes. 6003 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6004 continue; 6005 6006 // Examine PHI nodes that are reduction variables. Update the type to 6007 // account for the recurrence type. 6008 if (auto *PN = dyn_cast<PHINode>(&I)) { 6009 if (!Legal->isReductionVariable(PN)) 6010 continue; 6011 const RecurrenceDescriptor &RdxDesc = 6012 Legal->getReductionVars().find(PN)->second; 6013 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6014 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6015 RdxDesc.getRecurrenceType(), 6016 TargetTransformInfo::ReductionFlags())) 6017 continue; 6018 T = RdxDesc.getRecurrenceType(); 6019 } 6020 6021 // Examine the stored values. 6022 if (auto *ST = dyn_cast<StoreInst>(&I)) 6023 T = ST->getValueOperand()->getType(); 6024 6025 // Ignore loaded pointer types and stored pointer types that are not 6026 // vectorizable. 6027 // 6028 // FIXME: The check here attempts to predict whether a load or store will 6029 // be vectorized. We only know this for certain after a VF has 6030 // been selected. Here, we assume that if an access can be 6031 // vectorized, it will be. We should also look at extending this 6032 // optimization to non-pointer types. 6033 // 6034 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6035 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6036 continue; 6037 6038 ElementTypesInLoop.insert(T); 6039 } 6040 } 6041 } 6042 6043 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6044 unsigned LoopCost) { 6045 // -- The interleave heuristics -- 6046 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6047 // There are many micro-architectural considerations that we can't predict 6048 // at this level. For example, frontend pressure (on decode or fetch) due to 6049 // code size, or the number and capabilities of the execution ports. 6050 // 6051 // We use the following heuristics to select the interleave count: 6052 // 1. If the code has reductions, then we interleave to break the cross 6053 // iteration dependency. 6054 // 2. If the loop is really small, then we interleave to reduce the loop 6055 // overhead. 6056 // 3. We don't interleave if we think that we will spill registers to memory 6057 // due to the increased register pressure. 6058 6059 if (!isScalarEpilogueAllowed()) 6060 return 1; 6061 6062 // We used the distance for the interleave count. 6063 if (Legal->getMaxSafeDepDistBytes() != -1U) 6064 return 1; 6065 6066 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6067 const bool HasReductions = !Legal->getReductionVars().empty(); 6068 // Do not interleave loops with a relatively small known or estimated trip 6069 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6070 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6071 // because with the above conditions interleaving can expose ILP and break 6072 // cross iteration dependences for reductions. 
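// Illustrative example (assumed defaults): with an estimated trip count of
// 8 and TinyTripCountInterleaveThreshold at its default of 128, the check
// below returns an interleave count of 1, unless the loop has scalar
// reductions (VF == 1) and the InterleaveSmallLoopScalarReduction option is
// enabled.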
6073 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6074 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6075 return 1; 6076 6077 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6078 // We divide by these constants so assume that we have at least one 6079 // instruction that uses at least one register. 6080 for (auto& pair : R.MaxLocalUsers) { 6081 pair.second = std::max(pair.second, 1U); 6082 } 6083 6084 // We calculate the interleave count using the following formula. 6085 // Subtract the number of loop invariants from the number of available 6086 // registers. These registers are used by all of the interleaved instances. 6087 // Next, divide the remaining registers by the number of registers that is 6088 // required by the loop, in order to estimate how many parallel instances 6089 // fit without causing spills. All of this is rounded down if necessary to be 6090 // a power of two. We want power of two interleave count to simplify any 6091 // addressing operations or alignment considerations. 6092 // We also want power of two interleave counts to ensure that the induction 6093 // variable of the vector loop wraps to zero, when tail is folded by masking; 6094 // this currently happens when OptForSize, in which case IC is set to 1 above. 6095 unsigned IC = UINT_MAX; 6096 6097 for (auto& pair : R.MaxLocalUsers) { 6098 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6099 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6100 << " registers of " 6101 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6102 if (VF.isScalar()) { 6103 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6104 TargetNumRegisters = ForceTargetNumScalarRegs; 6105 } else { 6106 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6107 TargetNumRegisters = ForceTargetNumVectorRegs; 6108 } 6109 unsigned MaxLocalUsers = pair.second; 6110 unsigned LoopInvariantRegs = 0; 6111 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6112 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6113 6114 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6115 // Don't count the induction variable as interleaved. 6116 if (EnableIndVarRegisterHeur) { 6117 TmpIC = 6118 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6119 std::max(1U, (MaxLocalUsers - 1))); 6120 } 6121 6122 IC = std::min(IC, TmpIC); 6123 } 6124 6125 // Clamp the interleave ranges to reasonable counts. 6126 unsigned MaxInterleaveCount = 6127 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6128 6129 // Check if the user has overridden the max. 6130 if (VF.isScalar()) { 6131 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6132 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6133 } else { 6134 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6135 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6136 } 6137 6138 // If trip count is known or estimated compile time constant, limit the 6139 // interleave count to be less than the trip count divided by VF, provided it 6140 // is at least 1. 6141 // 6142 // For scalable vectors we can't know if interleaving is beneficial. It may 6143 // not be beneficial for small loops if none of the lanes in the second vector 6144 // iterations is enabled. However, for larger loops, there is likely to be a 6145 // similar benefit as for fixed-width vectors. 
For now, we choose to leave 6146 // the InterleaveCount as if vscale is '1', although if some information about 6147 // the vector is known (e.g. min vector size), we can make a better decision. 6148 if (BestKnownTC) { 6149 MaxInterleaveCount = 6150 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6151 // Make sure MaxInterleaveCount is greater than 0. 6152 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6153 } 6154 6155 assert(MaxInterleaveCount > 0 && 6156 "Maximum interleave count must be greater than 0"); 6157 6158 // Clamp the calculated IC to be between the 1 and the max interleave count 6159 // that the target and trip count allows. 6160 if (IC > MaxInterleaveCount) 6161 IC = MaxInterleaveCount; 6162 else 6163 // Make sure IC is greater than 0. 6164 IC = std::max(1u, IC); 6165 6166 assert(IC > 0 && "Interleave count must be greater than 0."); 6167 6168 // If we did not calculate the cost for VF (because the user selected the VF) 6169 // then we calculate the cost of VF here. 6170 if (LoopCost == 0) { 6171 InstructionCost C = expectedCost(VF).first; 6172 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6173 LoopCost = *C.getValue(); 6174 } 6175 6176 assert(LoopCost && "Non-zero loop cost expected"); 6177 6178 // Interleave if we vectorized this loop and there is a reduction that could 6179 // benefit from interleaving. 6180 if (VF.isVector() && HasReductions) { 6181 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6182 return IC; 6183 } 6184 6185 // Note that if we've already vectorized the loop we will have done the 6186 // runtime check and so interleaving won't require further checks. 6187 bool InterleavingRequiresRuntimePointerCheck = 6188 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6189 6190 // We want to interleave small loops in order to reduce the loop overhead and 6191 // potentially expose ILP opportunities. 6192 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6193 << "LV: IC is " << IC << '\n' 6194 << "LV: VF is " << VF << '\n'); 6195 const bool AggressivelyInterleaveReductions = 6196 TTI.enableAggressiveInterleaving(HasReductions); 6197 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6198 // We assume that the cost overhead is 1 and we use the cost model 6199 // to estimate the cost of the loop and interleave until the cost of the 6200 // loop overhead is about 5% of the cost of the loop. 6201 unsigned SmallIC = 6202 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6203 6204 // Interleave until store/load ports (estimated by max interleave count) are 6205 // saturated. 6206 unsigned NumStores = Legal->getNumStores(); 6207 unsigned NumLoads = Legal->getNumLoads(); 6208 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6209 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6210 6211 // There is little point in interleaving for reductions containing selects 6212 // and compares when VF=1 since it may just create more overhead than it's 6213 // worth for loops with small trip counts. This is because we still have to 6214 // do the final reduction after the loop. 
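// For example (illustrative): a scalar select-cmp recurrence such as
//   r = (a[i] == 42) ? 1 : r;
// gains little from interleaving at VF=1, because every extra copy of the
// recurrence still has to be merged after the loop, so the check below
// returns an interleave count of 1 for such loops.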
6215 bool HasSelectCmpReductions = 6216 HasReductions && 6217 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6218 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6219 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6220 RdxDesc.getRecurrenceKind()); 6221 }); 6222 if (HasSelectCmpReductions) { 6223 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6224 return 1; 6225 } 6226 6227 // If we have a scalar reduction (vector reductions are already dealt with 6228 // by this point), we can increase the critical path length if the loop 6229 // we're interleaving is inside another loop. For tree-wise reductions 6230 // set the limit to 2, and for ordered reductions it's best to disable 6231 // interleaving entirely. 6232 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6233 bool HasOrderedReductions = 6234 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6235 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6236 return RdxDesc.isOrdered(); 6237 }); 6238 if (HasOrderedReductions) { 6239 LLVM_DEBUG( 6240 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6241 return 1; 6242 } 6243 6244 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6245 SmallIC = std::min(SmallIC, F); 6246 StoresIC = std::min(StoresIC, F); 6247 LoadsIC = std::min(LoadsIC, F); 6248 } 6249 6250 if (EnableLoadStoreRuntimeInterleave && 6251 std::max(StoresIC, LoadsIC) > SmallIC) { 6252 LLVM_DEBUG( 6253 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6254 return std::max(StoresIC, LoadsIC); 6255 } 6256 6257 // If there are scalar reductions and TTI has enabled aggressive 6258 // interleaving for reductions, we will interleave to expose ILP. 6259 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6260 AggressivelyInterleaveReductions) { 6261 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6262 // Interleave no less than SmallIC but not as aggressive as the normal IC 6263 // to satisfy the rare situation when resources are too limited. 6264 return std::max(IC / 2, SmallIC); 6265 } else { 6266 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6267 return SmallIC; 6268 } 6269 } 6270 6271 // Interleave if this is a large loop (small loops are already dealt with by 6272 // this point) that could benefit from interleaving. 6273 if (AggressivelyInterleaveReductions) { 6274 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6275 return IC; 6276 } 6277 6278 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6279 return 1; 6280 } 6281 6282 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6283 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6284 // This function calculates the register usage by measuring the highest number 6285 // of values that are alive at a single location. Obviously, this is a very 6286 // rough estimation. We scan the loop in a topological order in order and 6287 // assign a number to each instruction. We use RPO to ensure that defs are 6288 // met before their users. We assume that each instruction that has in-loop 6289 // users starts an interval. We record every time that an in-loop value is 6290 // used, so we have a list of the first and last occurrences of each 6291 // instruction. Next, we transpose this data structure into a multi map that 6292 // holds the list of intervals that *end* at a specific location. This multi 6293 // map allows us to perform a linear search. 
We scan the instructions linearly 6294 // and record each time that a new interval starts, by placing it in a set. 6295 // If we find this value in the multi-map then we remove it from the set. 6296 // The max register usage is the maximum size of the set. 6297 // We also search for instructions that are defined outside the loop, but are 6298 // used inside the loop. We need this number separately from the max-interval 6299 // usage number because when we unroll, loop-invariant values do not take 6300 // more register. 6301 LoopBlocksDFS DFS(TheLoop); 6302 DFS.perform(LI); 6303 6304 RegisterUsage RU; 6305 6306 // Each 'key' in the map opens a new interval. The values 6307 // of the map are the index of the 'last seen' usage of the 6308 // instruction that is the key. 6309 using IntervalMap = DenseMap<Instruction *, unsigned>; 6310 6311 // Maps instruction to its index. 6312 SmallVector<Instruction *, 64> IdxToInstr; 6313 // Marks the end of each interval. 6314 IntervalMap EndPoint; 6315 // Saves the list of instruction indices that are used in the loop. 6316 SmallPtrSet<Instruction *, 8> Ends; 6317 // Saves the list of values that are used in the loop but are 6318 // defined outside the loop, such as arguments and constants. 6319 SmallPtrSet<Value *, 8> LoopInvariants; 6320 6321 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6322 for (Instruction &I : BB->instructionsWithoutDebug()) { 6323 IdxToInstr.push_back(&I); 6324 6325 // Save the end location of each USE. 6326 for (Value *U : I.operands()) { 6327 auto *Instr = dyn_cast<Instruction>(U); 6328 6329 // Ignore non-instruction values such as arguments, constants, etc. 6330 if (!Instr) 6331 continue; 6332 6333 // If this instruction is outside the loop then record it and continue. 6334 if (!TheLoop->contains(Instr)) { 6335 LoopInvariants.insert(Instr); 6336 continue; 6337 } 6338 6339 // Overwrite previous end points. 6340 EndPoint[Instr] = IdxToInstr.size(); 6341 Ends.insert(Instr); 6342 } 6343 } 6344 } 6345 6346 // Saves the list of intervals that end with the index in 'key'. 6347 using InstrList = SmallVector<Instruction *, 2>; 6348 DenseMap<unsigned, InstrList> TransposeEnds; 6349 6350 // Transpose the EndPoints to a list of values that end at each index. 6351 for (auto &Interval : EndPoint) 6352 TransposeEnds[Interval.second].push_back(Interval.first); 6353 6354 SmallPtrSet<Instruction *, 8> OpenIntervals; 6355 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6356 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6357 6358 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6359 6360 // A lambda that gets the register usage for the given type and VF. 6361 const auto &TTICapture = TTI; 6362 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6363 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6364 return 0; 6365 InstructionCost::CostType RegUsage = 6366 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6367 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6368 "Nonsensical values for register usage."); 6369 return RegUsage; 6370 }; 6371 6372 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6373 Instruction *I = IdxToInstr[i]; 6374 6375 // Remove all of the instructions that end at this location. 6376 InstrList &List = TransposeEnds[i]; 6377 for (Instruction *ToRemove : List) 6378 OpenIntervals.erase(ToRemove); 6379 6380 // Ignore instructions that are never used within the loop. 
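// Illustrative sketch (hypothetical values): for a straight-line chain
//   %a = load ... ; %b = add %a, 1 ; %c = mul %a, %b ; store %c
// every instruction except the store has an in-loop user, so %a, %b and %c
// all enter Ends; %a stays in OpenIntervals until the scan reaches its last
// user %c, at which point %a and %b are both live, giving a maximum of two
// simultaneously live values for this snippet. The store, having no in-loop
// users, never enters Ends and is skipped by the check below.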
6381 if (!Ends.count(I)) 6382 continue; 6383 6384 // Skip ignored values. 6385 if (ValuesToIgnore.count(I)) 6386 continue; 6387 6388 // For each VF find the maximum usage of registers. 6389 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6390 // Count the number of live intervals. 6391 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6392 6393 if (VFs[j].isScalar()) { 6394 for (auto Inst : OpenIntervals) { 6395 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6396 if (RegUsage.find(ClassID) == RegUsage.end()) 6397 RegUsage[ClassID] = 1; 6398 else 6399 RegUsage[ClassID] += 1; 6400 } 6401 } else { 6402 collectUniformsAndScalars(VFs[j]); 6403 for (auto Inst : OpenIntervals) { 6404 // Skip ignored values for VF > 1. 6405 if (VecValuesToIgnore.count(Inst)) 6406 continue; 6407 if (isScalarAfterVectorization(Inst, VFs[j])) { 6408 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6409 if (RegUsage.find(ClassID) == RegUsage.end()) 6410 RegUsage[ClassID] = 1; 6411 else 6412 RegUsage[ClassID] += 1; 6413 } else { 6414 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6415 if (RegUsage.find(ClassID) == RegUsage.end()) 6416 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6417 else 6418 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6419 } 6420 } 6421 } 6422 6423 for (auto& pair : RegUsage) { 6424 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6425 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6426 else 6427 MaxUsages[j][pair.first] = pair.second; 6428 } 6429 } 6430 6431 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6432 << OpenIntervals.size() << '\n'); 6433 6434 // Add the current instruction to the list of open intervals. 6435 OpenIntervals.insert(I); 6436 } 6437 6438 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6439 SmallMapVector<unsigned, unsigned, 4> Invariant; 6440 6441 for (auto Inst : LoopInvariants) { 6442 unsigned Usage = 6443 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6444 unsigned ClassID = 6445 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6446 if (Invariant.find(ClassID) == Invariant.end()) 6447 Invariant[ClassID] = Usage; 6448 else 6449 Invariant[ClassID] += Usage; 6450 } 6451 6452 LLVM_DEBUG({ 6453 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6454 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6455 << " item\n"; 6456 for (const auto &pair : MaxUsages[i]) { 6457 dbgs() << "LV(REG): RegisterClass: " 6458 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6459 << " registers\n"; 6460 } 6461 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6462 << " item\n"; 6463 for (const auto &pair : Invariant) { 6464 dbgs() << "LV(REG): RegisterClass: " 6465 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6466 << " registers\n"; 6467 } 6468 }); 6469 6470 RU.LoopInvariantRegs = Invariant; 6471 RU.MaxLocalUsers = MaxUsages[i]; 6472 RUs[i] = RU; 6473 } 6474 6475 return RUs; 6476 } 6477 6478 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6479 // TODO: Cost model for emulated masked load/store is completely 6480 // broken. This hack guides the cost model to use an artificially 6481 // high enough value to practically disable vectorization with such 6482 // operations, except where previously deployed legality hack allowed 6483 // using very low cost values. 
This is to avoid regressions coming simply
6484 // from moving "masked load/store" check from legality to cost model.
6485 // Masked Load/Gather emulation was previously never allowed.
6486 // Only limited Masked Store/Scatter emulation was allowed.
6487 assert(isPredicatedInst(I) &&
6488 "Expecting a scalar emulated instruction");
6489 return isa<LoadInst>(I) ||
6490 (isa<StoreInst>(I) &&
6491 NumPredStores > NumberOfStoresToPredicate);
6492 }
6493
6494 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6495 // If we aren't vectorizing the loop, or if we've already collected the
6496 // instructions to scalarize, there's nothing to do. Collection may already
6497 // have occurred if we have a user-selected VF and are now computing the
6498 // expected cost for interleaving.
6499 if (VF.isScalar() || VF.isZero() ||
6500 InstsToScalarize.find(VF) != InstsToScalarize.end())
6501 return;
6502
6503 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6504 // not profitable to scalarize any instructions, the presence of VF in the
6505 // map will indicate that we've analyzed it already.
6506 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6507
6508 // Find all the instructions that are scalar with predication in the loop and
6509 // determine if it would be better to not if-convert the blocks they are in.
6510 // If so, we also record the instructions to scalarize.
6511 for (BasicBlock *BB : TheLoop->blocks()) {
6512 if (!blockNeedsPredicationForAnyReason(BB))
6513 continue;
6514 for (Instruction &I : *BB)
6515 if (isScalarWithPredication(&I)) {
6516 ScalarCostsTy ScalarCosts;
6517 // Do not apply discount if scalable, because that would lead to
6518 // invalid scalarization costs.
6519 // Do not apply discount logic if hacked cost is needed
6520 // for emulated masked memrefs.
6521 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6522 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6523 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6524 // Remember that BB will remain after vectorization.
6525 PredicatedBBsAfterVectorization.insert(BB);
6526 }
6527 }
6528 }
6529
6530 int LoopVectorizationCostModel::computePredInstDiscount(
6531 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6532 assert(!isUniformAfterVectorization(PredInst, VF) &&
6533 "Instruction marked uniform-after-vectorization will be predicated");
6534
6535 // Initialize the discount to zero, meaning that the scalar version and the
6536 // vector version cost the same.
6537 InstructionCost Discount = 0;
6538
6539 // Holds instructions to analyze. The instructions we visit are mapped in
6540 // ScalarCosts. Those instructions are the ones that would be scalarized if
6541 // we find that the scalar version costs less.
6542 SmallVector<Instruction *, 8> Worklist;
6543
6544 // Returns true if the given instruction can be scalarized.
6545 auto canBeScalarized = [&](Instruction *I) -> bool {
6546 // We only attempt to scalarize instructions forming a single-use chain
6547 // from the original predicated block that would otherwise be vectorized.
6548 // Although not strictly necessary, we give up on instructions we know will
6549 // already be scalar to avoid traversing chains that are unlikely to be
6550 // beneficial.
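// Illustrative example: for a predicated 'sdiv' whose divisor is computed
// by a single-use 'add' in the same block, the 'add' may be folded into the
// scalarized chain alongside the division, whereas an operand with other
// users, or one already known to remain scalar, stops the walk right here.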
6551 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6552 isScalarAfterVectorization(I, VF)) 6553 return false; 6554 6555 // If the instruction is scalar with predication, it will be analyzed 6556 // separately. We ignore it within the context of PredInst. 6557 if (isScalarWithPredication(I)) 6558 return false; 6559 6560 // If any of the instruction's operands are uniform after vectorization, 6561 // the instruction cannot be scalarized. This prevents, for example, a 6562 // masked load from being scalarized. 6563 // 6564 // We assume we will only emit a value for lane zero of an instruction 6565 // marked uniform after vectorization, rather than VF identical values. 6566 // Thus, if we scalarize an instruction that uses a uniform, we would 6567 // create uses of values corresponding to the lanes we aren't emitting code 6568 // for. This behavior can be changed by allowing getScalarValue to clone 6569 // the lane zero values for uniforms rather than asserting. 6570 for (Use &U : I->operands()) 6571 if (auto *J = dyn_cast<Instruction>(U.get())) 6572 if (isUniformAfterVectorization(J, VF)) 6573 return false; 6574 6575 // Otherwise, we can scalarize the instruction. 6576 return true; 6577 }; 6578 6579 // Compute the expected cost discount from scalarizing the entire expression 6580 // feeding the predicated instruction. We currently only consider expressions 6581 // that are single-use instruction chains. 6582 Worklist.push_back(PredInst); 6583 while (!Worklist.empty()) { 6584 Instruction *I = Worklist.pop_back_val(); 6585 6586 // If we've already analyzed the instruction, there's nothing to do. 6587 if (ScalarCosts.find(I) != ScalarCosts.end()) 6588 continue; 6589 6590 // Compute the cost of the vector instruction. Note that this cost already 6591 // includes the scalarization overhead of the predicated instruction. 6592 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6593 6594 // Compute the cost of the scalarized instruction. This cost is the cost of 6595 // the instruction as if it wasn't if-converted and instead remained in the 6596 // predicated block. We will scale this cost by block probability after 6597 // computing the scalarization overhead. 6598 InstructionCost ScalarCost = 6599 VF.getFixedValue() * 6600 getInstructionCost(I, ElementCount::getFixed(1)).first; 6601 6602 // Compute the scalarization overhead of needed insertelement instructions 6603 // and phi nodes. 6604 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6605 ScalarCost += TTI.getScalarizationOverhead( 6606 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6607 APInt::getAllOnes(VF.getFixedValue()), true, false); 6608 ScalarCost += 6609 VF.getFixedValue() * 6610 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6611 } 6612 6613 // Compute the scalarization overhead of needed extractelement 6614 // instructions. For each of the instruction's operands, if the operand can 6615 // be scalarized, add it to the worklist; otherwise, account for the 6616 // overhead. 
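// Putting the pieces together, a worked example with illustrative numbers:
// if VF is 4, an instruction's vector cost is 10 and its per-lane scalar
// cost is 2, then ScalarCost starts at 4 * 2 = 8, grows by any
// insert/extract overhead, and is finally divided by
// getReciprocalPredBlockProb() (i.e. halved for a block assumed to run 50%
// of the time); a positive VectorCost - ScalarCost then adds to Discount,
// tipping the decision toward scalarizing the chain.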
6617 for (Use &U : I->operands()) 6618 if (auto *J = dyn_cast<Instruction>(U.get())) { 6619 assert(VectorType::isValidElementType(J->getType()) && 6620 "Instruction has non-scalar type"); 6621 if (canBeScalarized(J)) 6622 Worklist.push_back(J); 6623 else if (needsExtract(J, VF)) { 6624 ScalarCost += TTI.getScalarizationOverhead( 6625 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6626 APInt::getAllOnes(VF.getFixedValue()), false, true); 6627 } 6628 } 6629 6630 // Scale the total scalar cost by block probability. 6631 ScalarCost /= getReciprocalPredBlockProb(); 6632 6633 // Compute the discount. A non-negative discount means the vector version 6634 // of the instruction costs more, and scalarizing would be beneficial. 6635 Discount += VectorCost - ScalarCost; 6636 ScalarCosts[I] = ScalarCost; 6637 } 6638 6639 return *Discount.getValue(); 6640 } 6641 6642 LoopVectorizationCostModel::VectorizationCostTy 6643 LoopVectorizationCostModel::expectedCost( 6644 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6645 VectorizationCostTy Cost; 6646 6647 // For each block. 6648 for (BasicBlock *BB : TheLoop->blocks()) { 6649 VectorizationCostTy BlockCost; 6650 6651 // For each instruction in the old loop. 6652 for (Instruction &I : BB->instructionsWithoutDebug()) { 6653 // Skip ignored values. 6654 if (ValuesToIgnore.count(&I) || 6655 (VF.isVector() && VecValuesToIgnore.count(&I))) 6656 continue; 6657 6658 VectorizationCostTy C = getInstructionCost(&I, VF); 6659 6660 // Check if we should override the cost. 6661 if (C.first.isValid() && 6662 ForceTargetInstructionCost.getNumOccurrences() > 0) 6663 C.first = InstructionCost(ForceTargetInstructionCost); 6664 6665 // Keep a list of instructions with invalid costs. 6666 if (Invalid && !C.first.isValid()) 6667 Invalid->emplace_back(&I, VF); 6668 6669 BlockCost.first += C.first; 6670 BlockCost.second |= C.second; 6671 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6672 << " for VF " << VF << " For instruction: " << I 6673 << '\n'); 6674 } 6675 6676 // If we are vectorizing a predicated block, it will have been 6677 // if-converted. This means that the block's instructions (aside from 6678 // stores and instructions that may divide by zero) will now be 6679 // unconditionally executed. For the scalar case, we may not always execute 6680 // the predicated block, if it is an if-else block. Thus, scale the block's 6681 // cost by the probability of executing it. blockNeedsPredication from 6682 // Legal is used so as to not include all blocks in tail folded loops. 6683 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6684 BlockCost.first /= getReciprocalPredBlockProb(); 6685 6686 Cost.first += BlockCost.first; 6687 Cost.second |= BlockCost.second; 6688 } 6689 6690 return Cost; 6691 } 6692 6693 /// Gets Address Access SCEV after verifying that the access pattern 6694 /// is loop invariant except the induction variable dependence. 6695 /// 6696 /// This SCEV can be sent to the Target in order to estimate the address 6697 /// calculation cost. 6698 static const SCEV *getAddressAccessSCEV( 6699 Value *Ptr, 6700 LoopVectorizationLegality *Legal, 6701 PredicatedScalarEvolution &PSE, 6702 const Loop *TheLoop) { 6703 6704 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6705 if (!Gep) 6706 return nullptr; 6707 6708 // We are looking for a gep with all loop invariant indices except for one 6709 // which should be an induction variable. 
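// Illustrative example: a GEP such as
//   %addr = getelementptr inbounds [100 x i32], [100 x i32]* %A, i64 %inv, i64 %i
// qualifies when %inv is loop-invariant and %i is a recognized induction,
// whereas an index loaded from memory inside the loop (a gather-style
// subscript) makes the loop below bail out with nullptr, and no address
// SCEV is handed to the target for costing.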
6710 auto SE = PSE.getSE(); 6711 unsigned NumOperands = Gep->getNumOperands(); 6712 for (unsigned i = 1; i < NumOperands; ++i) { 6713 Value *Opd = Gep->getOperand(i); 6714 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6715 !Legal->isInductionVariable(Opd)) 6716 return nullptr; 6717 } 6718 6719 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6720 return PSE.getSCEV(Ptr); 6721 } 6722 6723 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6724 return Legal->hasStride(I->getOperand(0)) || 6725 Legal->hasStride(I->getOperand(1)); 6726 } 6727 6728 InstructionCost 6729 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6730 ElementCount VF) { 6731 assert(VF.isVector() && 6732 "Scalarization cost of instruction implies vectorization."); 6733 if (VF.isScalable()) 6734 return InstructionCost::getInvalid(); 6735 6736 Type *ValTy = getLoadStoreType(I); 6737 auto SE = PSE.getSE(); 6738 6739 unsigned AS = getLoadStoreAddressSpace(I); 6740 Value *Ptr = getLoadStorePointerOperand(I); 6741 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6742 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6743 // that it is being called from this specific place. 6744 6745 // Figure out whether the access is strided and get the stride value 6746 // if it's known in compile time 6747 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6748 6749 // Get the cost of the scalar memory instruction and address computation. 6750 InstructionCost Cost = 6751 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6752 6753 // Don't pass *I here, since it is scalar but will actually be part of a 6754 // vectorized loop where the user of it is a vectorized instruction. 6755 const Align Alignment = getLoadStoreAlignment(I); 6756 Cost += VF.getKnownMinValue() * 6757 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6758 AS, TTI::TCK_RecipThroughput); 6759 6760 // Get the overhead of the extractelement and insertelement instructions 6761 // we might create due to scalarization. 6762 Cost += getScalarizationOverhead(I, VF); 6763 6764 // If we have a predicated load/store, it will need extra i1 extracts and 6765 // conditional branches, but may not be executed for each vector lane. Scale 6766 // the cost by the probability of executing the predicated block. 6767 if (isPredicatedInst(I)) { 6768 Cost /= getReciprocalPredBlockProb(); 6769 6770 // Add the cost of an i1 extract and a branch 6771 auto *Vec_i1Ty = 6772 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6773 Cost += TTI.getScalarizationOverhead( 6774 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6775 /*Insert=*/false, /*Extract=*/true); 6776 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6777 6778 if (useEmulatedMaskMemRefHack(I)) 6779 // Artificially setting to a high enough value to practically disable 6780 // vectorization with such operations. 
6781 Cost = 3000000; 6782 } 6783 6784 return Cost; 6785 } 6786 6787 InstructionCost 6788 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6789 ElementCount VF) { 6790 Type *ValTy = getLoadStoreType(I); 6791 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6792 Value *Ptr = getLoadStorePointerOperand(I); 6793 unsigned AS = getLoadStoreAddressSpace(I); 6794 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6795 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6796 6797 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6798 "Stride should be 1 or -1 for consecutive memory access"); 6799 const Align Alignment = getLoadStoreAlignment(I); 6800 InstructionCost Cost = 0; 6801 if (Legal->isMaskRequired(I)) 6802 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6803 CostKind); 6804 else 6805 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6806 CostKind, I); 6807 6808 bool Reverse = ConsecutiveStride < 0; 6809 if (Reverse) 6810 Cost += 6811 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6812 return Cost; 6813 } 6814 6815 InstructionCost 6816 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6817 ElementCount VF) { 6818 assert(Legal->isUniformMemOp(*I)); 6819 6820 Type *ValTy = getLoadStoreType(I); 6821 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6822 const Align Alignment = getLoadStoreAlignment(I); 6823 unsigned AS = getLoadStoreAddressSpace(I); 6824 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6825 if (isa<LoadInst>(I)) { 6826 return TTI.getAddressComputationCost(ValTy) + 6827 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6828 CostKind) + 6829 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6830 } 6831 StoreInst *SI = cast<StoreInst>(I); 6832 6833 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6834 return TTI.getAddressComputationCost(ValTy) + 6835 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6836 CostKind) + 6837 (isLoopInvariantStoreValue 6838 ? 0 6839 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6840 VF.getKnownMinValue() - 1)); 6841 } 6842 6843 InstructionCost 6844 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6845 ElementCount VF) { 6846 Type *ValTy = getLoadStoreType(I); 6847 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6848 const Align Alignment = getLoadStoreAlignment(I); 6849 const Value *Ptr = getLoadStorePointerOperand(I); 6850 6851 return TTI.getAddressComputationCost(VectorTy) + 6852 TTI.getGatherScatterOpCost( 6853 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6854 TargetTransformInfo::TCK_RecipThroughput, I); 6855 } 6856 6857 InstructionCost 6858 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6859 ElementCount VF) { 6860 // TODO: Once we have support for interleaving with scalable vectors 6861 // we can calculate the cost properly here. 
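// For illustration (a made-up example), a factor-3 interleave group with
// members at indices 0 and 2 can come from a loop such as:
//   for (i = 0; i < n; i++)
//     sum += A[3 * i] + A[3 * i + 2]; // gap at index 1
// Below, the whole group is costed as one wide access of VF * factor elements
// via TTI::getInterleavedMemoryOpCost, passing the member indices so the
// target can account for the de-interleaving shuffles and any gaps.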
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}

Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for one of the following patterns, and for the minimal
  // acceptable cost among them:
  //   reduce(mul(ext(A), ext(B))) or
  //   reduce(mul(A, B)) or
  //   reduce(ext(A)) or
  //   reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
  Instruction *RetI = I;
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
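  // As an illustrative sketch (IR names invented), for an in-loop reduction
  //   %ext = sext i8 %a to i32
  //   %mul = mul i32 %ext, %b
  //   %add = add i32 %mul, %phi
  // RetI has by now been advanced from the ext through the mul to the final
  // add, and InLoopReductionImmediateChains is walked back from that add to
  // the reduction phi so its RecurrenceDescriptor can be queried.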
6940 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6941 Instruction *ReductionPhi = LastChain; 6942 while (!isa<PHINode>(ReductionPhi)) 6943 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6944 6945 const RecurrenceDescriptor &RdxDesc = 6946 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6947 6948 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6949 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6950 6951 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6952 // normal fmul instruction to the cost of the fadd reduction. 6953 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6954 BaseCost += 6955 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6956 6957 // If we're using ordered reductions then we can just return the base cost 6958 // here, since getArithmeticReductionCost calculates the full ordered 6959 // reduction cost when FP reassociation is not allowed. 6960 if (useOrderedReductions(RdxDesc)) 6961 return BaseCost; 6962 6963 // Get the operand that was not the reduction chain and match it to one of the 6964 // patterns, returning the better cost if it is found. 6965 Instruction *RedOp = RetI->getOperand(1) == LastChain 6966 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6967 : dyn_cast<Instruction>(RetI->getOperand(1)); 6968 6969 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6970 6971 Instruction *Op0, *Op1; 6972 if (RedOp && 6973 match(RedOp, 6974 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6975 match(Op0, m_ZExtOrSExt(m_Value())) && 6976 Op0->getOpcode() == Op1->getOpcode() && 6977 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6978 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6979 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6980 6981 // Matched reduce(ext(mul(ext(A), ext(B))) 6982 // Note that the extend opcodes need to all match, or if A==B they will have 6983 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6984 // which is equally fine. 6985 bool IsUnsigned = isa<ZExtInst>(Op0); 6986 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6987 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6988 6989 InstructionCost ExtCost = 6990 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6991 TTI::CastContextHint::None, CostKind, Op0); 6992 InstructionCost MulCost = 6993 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6994 InstructionCost Ext2Cost = 6995 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6996 TTI::CastContextHint::None, CostKind, RedOp); 6997 6998 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6999 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7000 CostKind); 7001 7002 if (RedCost.isValid() && 7003 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7004 return I == RetI ? 
RedCost : 0; 7005 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7006 !TheLoop->isLoopInvariant(RedOp)) { 7007 // Matched reduce(ext(A)) 7008 bool IsUnsigned = isa<ZExtInst>(RedOp); 7009 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7010 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7011 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7012 CostKind); 7013 7014 InstructionCost ExtCost = 7015 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7016 TTI::CastContextHint::None, CostKind, RedOp); 7017 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7018 return I == RetI ? RedCost : 0; 7019 } else if (RedOp && 7020 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7021 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7022 Op0->getOpcode() == Op1->getOpcode() && 7023 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7024 bool IsUnsigned = isa<ZExtInst>(Op0); 7025 Type *Op0Ty = Op0->getOperand(0)->getType(); 7026 Type *Op1Ty = Op1->getOperand(0)->getType(); 7027 Type *LargestOpTy = 7028 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 7029 : Op0Ty; 7030 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 7031 7032 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 7033 // different sizes. We take the largest type as the ext to reduce, and add 7034 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 7035 InstructionCost ExtCost0 = TTI.getCastInstrCost( 7036 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 7037 TTI::CastContextHint::None, CostKind, Op0); 7038 InstructionCost ExtCost1 = TTI.getCastInstrCost( 7039 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 7040 TTI::CastContextHint::None, CostKind, Op1); 7041 InstructionCost MulCost = 7042 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7043 7044 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7045 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7046 CostKind); 7047 InstructionCost ExtraExtCost = 0; 7048 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 7049 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 7050 ExtraExtCost = TTI.getCastInstrCost( 7051 ExtraExtOp->getOpcode(), ExtType, 7052 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 7053 TTI::CastContextHint::None, CostKind, ExtraExtOp); 7054 } 7055 7056 if (RedCost.isValid() && 7057 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 7058 return I == RetI ? RedCost : 0; 7059 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7060 // Matched reduce(mul()) 7061 InstructionCost MulCost = 7062 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7063 7064 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7065 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7066 CostKind); 7067 7068 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7069 return I == RetI ? RedCost : 0; 7070 } 7071 } 7072 7073 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7074 } 7075 7076 InstructionCost 7077 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7078 ElementCount VF) { 7079 // Calculate scalar cost only. Vectorization cost should be ready at this 7080 // moment. 
7081 if (VF.isScalar()) { 7082 Type *ValTy = getLoadStoreType(I); 7083 const Align Alignment = getLoadStoreAlignment(I); 7084 unsigned AS = getLoadStoreAddressSpace(I); 7085 7086 return TTI.getAddressComputationCost(ValTy) + 7087 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7088 TTI::TCK_RecipThroughput, I); 7089 } 7090 return getWideningCost(I, VF); 7091 } 7092 7093 LoopVectorizationCostModel::VectorizationCostTy 7094 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7095 ElementCount VF) { 7096 // If we know that this instruction will remain uniform, check the cost of 7097 // the scalar version. 7098 if (isUniformAfterVectorization(I, VF)) 7099 VF = ElementCount::getFixed(1); 7100 7101 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7102 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7103 7104 // Forced scalars do not have any scalarization overhead. 7105 auto ForcedScalar = ForcedScalars.find(VF); 7106 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7107 auto InstSet = ForcedScalar->second; 7108 if (InstSet.count(I)) 7109 return VectorizationCostTy( 7110 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7111 VF.getKnownMinValue()), 7112 false); 7113 } 7114 7115 Type *VectorTy; 7116 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7117 7118 bool TypeNotScalarized = false; 7119 if (VF.isVector() && VectorTy->isVectorTy()) { 7120 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7121 if (NumParts) 7122 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7123 else 7124 C = InstructionCost::getInvalid(); 7125 } 7126 return VectorizationCostTy(C, TypeNotScalarized); 7127 } 7128 7129 InstructionCost 7130 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7131 ElementCount VF) const { 7132 7133 // There is no mechanism yet to create a scalable scalarization loop, 7134 // so this is currently Invalid. 7135 if (VF.isScalable()) 7136 return InstructionCost::getInvalid(); 7137 7138 if (VF.isScalar()) 7139 return 0; 7140 7141 InstructionCost Cost = 0; 7142 Type *RetTy = ToVectorTy(I->getType(), VF); 7143 if (!RetTy->isVoidTy() && 7144 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7145 Cost += TTI.getScalarizationOverhead( 7146 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7147 false); 7148 7149 // Some targets keep addresses scalar. 7150 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7151 return Cost; 7152 7153 // Some targets support efficient element stores. 7154 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7155 return Cost; 7156 7157 // Collect operands to consider. 7158 CallInst *CI = dyn_cast<CallInst>(I); 7159 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 7160 7161 // Skip operands that do not require extraction/scalarization and do not incur 7162 // any overhead. 7163 SmallVector<Type *> Tys; 7164 for (auto *V : filterExtractingOperands(Ops, VF)) 7165 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7166 return Cost + TTI.getOperandsScalarizationOverhead( 7167 filterExtractingOperands(Ops, VF), Tys); 7168 } 7169 7170 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7171 if (VF.isScalar()) 7172 return; 7173 NumPredStores = 0; 7174 for (BasicBlock *BB : TheLoop->blocks()) { 7175 // For each instruction in the old loop. 
7176 for (Instruction &I : *BB) { 7177 Value *Ptr = getLoadStorePointerOperand(&I); 7178 if (!Ptr) 7179 continue; 7180 7181 // TODO: We should generate better code and update the cost model for 7182 // predicated uniform stores. Today they are treated as any other 7183 // predicated store (see added test cases in 7184 // invariant-store-vectorization.ll). 7185 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7186 NumPredStores++; 7187 7188 if (Legal->isUniformMemOp(I)) { 7189 // TODO: Avoid replicating loads and stores instead of 7190 // relying on instcombine to remove them. 7191 // Load: Scalar load + broadcast 7192 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7193 InstructionCost Cost; 7194 if (isa<StoreInst>(&I) && VF.isScalable() && 7195 isLegalGatherOrScatter(&I)) { 7196 Cost = getGatherScatterCost(&I, VF); 7197 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7198 } else { 7199 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7200 "Cannot yet scalarize uniform stores"); 7201 Cost = getUniformMemOpCost(&I, VF); 7202 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7203 } 7204 continue; 7205 } 7206 7207 // We assume that widening is the best solution when possible. 7208 if (memoryInstructionCanBeWidened(&I, VF)) { 7209 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7210 int ConsecutiveStride = Legal->isConsecutivePtr( 7211 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7212 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7213 "Expected consecutive stride."); 7214 InstWidening Decision = 7215 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7216 setWideningDecision(&I, VF, Decision, Cost); 7217 continue; 7218 } 7219 7220 // Choose between Interleaving, Gather/Scatter or Scalarization. 7221 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7222 unsigned NumAccesses = 1; 7223 if (isAccessInterleaved(&I)) { 7224 auto Group = getInterleavedAccessGroup(&I); 7225 assert(Group && "Fail to get an interleaved access group."); 7226 7227 // Make one decision for the whole group. 7228 if (getWideningDecision(&I, VF) != CM_Unknown) 7229 continue; 7230 7231 NumAccesses = Group->getNumMembers(); 7232 if (interleavedAccessCanBeWidened(&I, VF)) 7233 InterleaveCost = getInterleaveGroupCost(&I, VF); 7234 } 7235 7236 InstructionCost GatherScatterCost = 7237 isLegalGatherOrScatter(&I) 7238 ? getGatherScatterCost(&I, VF) * NumAccesses 7239 : InstructionCost::getInvalid(); 7240 7241 InstructionCost ScalarizationCost = 7242 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7243 7244 // Choose better solution for the current VF, 7245 // write down this decision and use it during vectorization. 7246 InstructionCost Cost; 7247 InstWidening Decision; 7248 if (InterleaveCost <= GatherScatterCost && 7249 InterleaveCost < ScalarizationCost) { 7250 Decision = CM_Interleave; 7251 Cost = InterleaveCost; 7252 } else if (GatherScatterCost < ScalarizationCost) { 7253 Decision = CM_GatherScatter; 7254 Cost = GatherScatterCost; 7255 } else { 7256 Decision = CM_Scalarize; 7257 Cost = ScalarizationCost; 7258 } 7259 // If the instructions belongs to an interleave group, the whole group 7260 // receives the same decision. The whole group receives the cost, but 7261 // the cost will actually be assigned to one instruction. 
7262 if (auto Group = getInterleavedAccessGroup(&I)) 7263 setWideningDecision(Group, VF, Decision, Cost); 7264 else 7265 setWideningDecision(&I, VF, Decision, Cost); 7266 } 7267 } 7268 7269 // Make sure that any load of address and any other address computation 7270 // remains scalar unless there is gather/scatter support. This avoids 7271 // inevitable extracts into address registers, and also has the benefit of 7272 // activating LSR more, since that pass can't optimize vectorized 7273 // addresses. 7274 if (TTI.prefersVectorizedAddressing()) 7275 return; 7276 7277 // Start with all scalar pointer uses. 7278 SmallPtrSet<Instruction *, 8> AddrDefs; 7279 for (BasicBlock *BB : TheLoop->blocks()) 7280 for (Instruction &I : *BB) { 7281 Instruction *PtrDef = 7282 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7283 if (PtrDef && TheLoop->contains(PtrDef) && 7284 getWideningDecision(&I, VF) != CM_GatherScatter) 7285 AddrDefs.insert(PtrDef); 7286 } 7287 7288 // Add all instructions used to generate the addresses. 7289 SmallVector<Instruction *, 4> Worklist; 7290 append_range(Worklist, AddrDefs); 7291 while (!Worklist.empty()) { 7292 Instruction *I = Worklist.pop_back_val(); 7293 for (auto &Op : I->operands()) 7294 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7295 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7296 AddrDefs.insert(InstOp).second) 7297 Worklist.push_back(InstOp); 7298 } 7299 7300 for (auto *I : AddrDefs) { 7301 if (isa<LoadInst>(I)) { 7302 // Setting the desired widening decision should ideally be handled in 7303 // by cost functions, but since this involves the task of finding out 7304 // if the loaded register is involved in an address computation, it is 7305 // instead changed here when we know this is the case. 7306 InstWidening Decision = getWideningDecision(I, VF); 7307 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7308 // Scalarize a widened load of address. 7309 setWideningDecision( 7310 I, VF, CM_Scalarize, 7311 (VF.getKnownMinValue() * 7312 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7313 else if (auto Group = getInterleavedAccessGroup(I)) { 7314 // Scalarize an interleave group of address loads. 7315 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7316 if (Instruction *Member = Group->getMember(I)) 7317 setWideningDecision( 7318 Member, VF, CM_Scalarize, 7319 (VF.getKnownMinValue() * 7320 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7321 } 7322 } 7323 } else 7324 // Make sure I gets scalarized and a cost estimate without 7325 // scalarization overhead. 
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result, we
    // don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
          TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
7405 return 0; 7406 // Note: We currently assume zero cost for an unconditional branch inside 7407 // a predicated block since it will become a fall-through, although we 7408 // may decide in the future to call TTI for all branches. 7409 } 7410 case Instruction::PHI: { 7411 auto *Phi = cast<PHINode>(I); 7412 7413 // First-order recurrences are replaced by vector shuffles inside the loop. 7414 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7415 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7416 return TTI.getShuffleCost( 7417 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7418 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7419 7420 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7421 // converted into select instructions. We require N - 1 selects per phi 7422 // node, where N is the number of incoming values. 7423 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7424 return (Phi->getNumIncomingValues() - 1) * 7425 TTI.getCmpSelInstrCost( 7426 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7427 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7428 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7429 7430 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7431 } 7432 case Instruction::UDiv: 7433 case Instruction::SDiv: 7434 case Instruction::URem: 7435 case Instruction::SRem: 7436 // If we have a predicated instruction, it may not be executed for each 7437 // vector lane. Get the scalarization cost and scale this amount by the 7438 // probability of executing the predicated block. If the instruction is not 7439 // predicated, we fall through to the next case. 7440 if (VF.isVector() && isScalarWithPredication(I)) { 7441 InstructionCost Cost = 0; 7442 7443 // These instructions have a non-void type, so account for the phi nodes 7444 // that we will create. This cost is likely to be zero. The phi node 7445 // cost, if any, should be scaled by the block probability because it 7446 // models a copy at the end of each predicated block. 7447 Cost += VF.getKnownMinValue() * 7448 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7449 7450 // The cost of the non-predicated instruction. 7451 Cost += VF.getKnownMinValue() * 7452 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7453 7454 // The cost of insertelement and extractelement instructions needed for 7455 // scalarization. 7456 Cost += getScalarizationOverhead(I, VF); 7457 7458 // Scale the cost by the probability of executing the predicated blocks. 7459 // This assumes the predicated block for each vector lane is equally 7460 // likely. 7461 return Cost / getReciprocalPredBlockProb(); 7462 } 7463 LLVM_FALLTHROUGH; 7464 case Instruction::Add: 7465 case Instruction::FAdd: 7466 case Instruction::Sub: 7467 case Instruction::FSub: 7468 case Instruction::Mul: 7469 case Instruction::FMul: 7470 case Instruction::FDiv: 7471 case Instruction::FRem: 7472 case Instruction::Shl: 7473 case Instruction::LShr: 7474 case Instruction::AShr: 7475 case Instruction::And: 7476 case Instruction::Or: 7477 case Instruction::Xor: { 7478 // Since we will replace the stride by 1 the multiplication should go away. 
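    // For illustration (assuming the loop was versioned on a symbolic stride):
    //   for (i = 0; i < n; i++)
    //     S += A[i * Stride];
    // In the specialized loop Stride is known to be 1, so the "i * Stride"
    // multiply folds away and is given zero cost below.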
7479 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7480 return 0; 7481 7482 // Detect reduction patterns 7483 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7484 return *RedCost; 7485 7486 // Certain instructions can be cheaper to vectorize if they have a constant 7487 // second vector operand. One example of this are shifts on x86. 7488 Value *Op2 = I->getOperand(1); 7489 TargetTransformInfo::OperandValueProperties Op2VP; 7490 TargetTransformInfo::OperandValueKind Op2VK = 7491 TTI.getOperandInfo(Op2, Op2VP); 7492 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7493 Op2VK = TargetTransformInfo::OK_UniformValue; 7494 7495 SmallVector<const Value *, 4> Operands(I->operand_values()); 7496 return TTI.getArithmeticInstrCost( 7497 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7498 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7499 } 7500 case Instruction::FNeg: { 7501 return TTI.getArithmeticInstrCost( 7502 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7503 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7504 TargetTransformInfo::OP_None, I->getOperand(0), I); 7505 } 7506 case Instruction::Select: { 7507 SelectInst *SI = cast<SelectInst>(I); 7508 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7509 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7510 7511 const Value *Op0, *Op1; 7512 using namespace llvm::PatternMatch; 7513 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7514 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7515 // select x, y, false --> x & y 7516 // select x, true, y --> x | y 7517 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7518 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7519 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7520 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7521 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7522 Op1->getType()->getScalarSizeInBits() == 1); 7523 7524 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7525 return TTI.getArithmeticInstrCost( 7526 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7527 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7528 } 7529 7530 Type *CondTy = SI->getCondition()->getType(); 7531 if (!ScalarCond) 7532 CondTy = VectorType::get(CondTy, VF); 7533 7534 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7535 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7536 Pred = Cmp->getPredicate(); 7537 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7538 CostKind, I); 7539 } 7540 case Instruction::ICmp: 7541 case Instruction::FCmp: { 7542 Type *ValTy = I->getOperand(0)->getType(); 7543 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7544 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7545 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7546 VectorTy = ToVectorTy(ValTy, VF); 7547 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7548 cast<CmpInst>(I)->getPredicate(), CostKind, 7549 I); 7550 } 7551 case Instruction::Store: 7552 case Instruction::Load: { 7553 ElementCount Width = VF; 7554 if (Width.isVector()) { 7555 InstWidening Decision = getWideningDecision(I, Width); 7556 assert(Decision != CM_Unknown && 7557 "CM decision should be taken at this point"); 7558 if (Decision == CM_Scalarize) 7559 Width = ElementCount::getFixed(1); 7560 } 7561 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7562 return getMemoryInstructionCost(I, VF); 7563 } 7564 case Instruction::BitCast: 7565 if (I->getType()->isPointerTy()) 7566 return 0; 7567 LLVM_FALLTHROUGH; 7568 case Instruction::ZExt: 7569 case Instruction::SExt: 7570 case Instruction::FPToUI: 7571 case Instruction::FPToSI: 7572 case Instruction::FPExt: 7573 case Instruction::PtrToInt: 7574 case Instruction::IntToPtr: 7575 case Instruction::SIToFP: 7576 case Instruction::UIToFP: 7577 case Instruction::Trunc: 7578 case Instruction::FPTrunc: { 7579 // Computes the CastContextHint from a Load/Store instruction. 7580 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7581 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7582 "Expected a load or a store!"); 7583 7584 if (VF.isScalar() || !TheLoop->contains(I)) 7585 return TTI::CastContextHint::Normal; 7586 7587 switch (getWideningDecision(I, VF)) { 7588 case LoopVectorizationCostModel::CM_GatherScatter: 7589 return TTI::CastContextHint::GatherScatter; 7590 case LoopVectorizationCostModel::CM_Interleave: 7591 return TTI::CastContextHint::Interleave; 7592 case LoopVectorizationCostModel::CM_Scalarize: 7593 case LoopVectorizationCostModel::CM_Widen: 7594 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7595 : TTI::CastContextHint::Normal; 7596 case LoopVectorizationCostModel::CM_Widen_Reverse: 7597 return TTI::CastContextHint::Reversed; 7598 case LoopVectorizationCostModel::CM_Unknown: 7599 llvm_unreachable("Instr did not go through cost modelling?"); 7600 } 7601 7602 llvm_unreachable("Unhandled case!"); 7603 }; 7604 7605 unsigned Opcode = I->getOpcode(); 7606 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7607 // For Trunc, the context is the only user, which must be a StoreInst. 7608 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7609 if (I->hasOneUse()) 7610 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7611 CCH = ComputeCCH(Store); 7612 } 7613 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7614 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7615 Opcode == Instruction::FPExt) { 7616 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7617 CCH = ComputeCCH(Load); 7618 } 7619 7620 // We optimize the truncation of induction variables having constant 7621 // integer steps. The cost of these truncations is the same as the scalar 7622 // operation. 7623 if (isOptimizableIVTruncate(I, VF)) { 7624 auto *Trunc = cast<TruncInst>(I); 7625 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7626 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7627 } 7628 7629 // Detect reduction patterns 7630 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7631 return *RedCost; 7632 7633 Type *SrcScalarTy = I->getOperand(0)->getType(); 7634 Type *SrcVecTy = 7635 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7636 if (canTruncateToMinimalBitwidth(I, VF)) { 7637 // This cast is going to be shrunk. This may remove the cast or it might 7638 // turn it into slightly different cast. For example, if MinBW == 16, 7639 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7640 // 7641 // Calculate the modified src and dest types. 7642 Type *MinVecTy = VectorTy; 7643 if (Opcode == Instruction::Trunc) { 7644 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7645 VectorTy = 7646 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7647 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7648 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7649 VectorTy = 7650 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7651 } 7652 } 7653 7654 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7655 } 7656 case Instruction::Call: { 7657 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7658 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7659 return *RedCost; 7660 bool NeedToScalarize; 7661 CallInst *CI = cast<CallInst>(I); 7662 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7663 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7664 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7665 return std::min(CallCost, IntrinsicCost); 7666 } 7667 return CallCost; 7668 } 7669 case Instruction::ExtractValue: 7670 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7671 case Instruction::Alloca: 7672 // We cannot easily widen alloca to a scalable alloca, as 7673 // the result would need to be a vector of pointers. 7674 if (VF.isScalable()) 7675 return InstructionCost::getInvalid(); 7676 LLVM_FALLTHROUGH; 7677 default: 7678 // This opcode is unknown. Assume that it is the same as 'mul'. 7679 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7680 } // end of switch. 
7681 } 7682 7683 char LoopVectorize::ID = 0; 7684 7685 static const char lv_name[] = "Loop Vectorization"; 7686 7687 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7688 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7689 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7690 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7691 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7692 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7693 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7694 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7695 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7696 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7697 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7698 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7699 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7700 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7701 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7702 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7703 7704 namespace llvm { 7705 7706 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7707 7708 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7709 bool VectorizeOnlyWhenForced) { 7710 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7711 } 7712 7713 } // end namespace llvm 7714 7715 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7716 // Check if the pointer operand of a load or store instruction is 7717 // consecutive. 7718 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7719 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7720 return false; 7721 } 7722 7723 void LoopVectorizationCostModel::collectValuesToIgnore() { 7724 // Ignore ephemeral values. 7725 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7726 7727 // Ignore type-promoting instructions we identified during reduction 7728 // detection. 7729 for (auto &Reduction : Legal->getReductionVars()) { 7730 const RecurrenceDescriptor &RedDes = Reduction.second; 7731 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7732 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7733 } 7734 // Ignore type-casting instructions we identified during induction 7735 // detection. 7736 for (auto &Induction : Legal->getInductionVars()) { 7737 const InductionDescriptor &IndDes = Induction.second; 7738 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7739 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7740 } 7741 } 7742 7743 void LoopVectorizationCostModel::collectInLoopReductions() { 7744 for (auto &Reduction : Legal->getReductionVars()) { 7745 PHINode *Phi = Reduction.first; 7746 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7747 7748 // We don't collect reductions that are type promoted (yet). 7749 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7750 continue; 7751 7752 // If the target would prefer this reduction to happen "in-loop", then we 7753 // want to record it as such. 7754 unsigned Opcode = RdxDesc.getOpcode(); 7755 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7756 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7757 TargetTransformInfo::ReductionFlags())) 7758 continue; 7759 7760 // Check that we can correctly put the reductions into the loop, by 7761 // finding the chain of operations that leads from the phi to the loop 7762 // exit value. 
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop) {
      InLoopReductionChains[Phi] = ReductionOperations;
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: We could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not
// currently have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
7845 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7846 !useMaskedInterleavedAccesses(*TTI)) { 7847 LLVM_DEBUG( 7848 dbgs() 7849 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7850 "which requires masked-interleaved support.\n"); 7851 if (CM.InterleaveInfo.invalidateGroups()) 7852 // Invalidating interleave groups also requires invalidating all decisions 7853 // based on them, which includes widening decisions and uniform and scalar 7854 // values. 7855 CM.invalidateCostModelingDecisions(); 7856 } 7857 7858 ElementCount MaxUserVF = 7859 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7860 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7861 if (!UserVF.isZero() && UserVFIsLegal) { 7862 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7863 "VF needs to be a power of two"); 7864 // Collect the instructions (and their associated costs) that will be more 7865 // profitable to scalarize. 7866 if (CM.selectUserVectorizationFactor(UserVF)) { 7867 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7868 CM.collectInLoopReductions(); 7869 buildVPlansWithVPRecipes(UserVF, UserVF); 7870 LLVM_DEBUG(printPlans(dbgs())); 7871 return {{UserVF, 0}}; 7872 } else 7873 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7874 "InvalidCost", ORE, OrigLoop); 7875 } 7876 7877 // Populate the set of Vectorization Factor Candidates. 7878 ElementCountSet VFCandidates; 7879 for (auto VF = ElementCount::getFixed(1); 7880 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7881 VFCandidates.insert(VF); 7882 for (auto VF = ElementCount::getScalable(1); 7883 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7884 VFCandidates.insert(VF); 7885 7886 for (const auto &VF : VFCandidates) { 7887 // Collect Uniform and Scalar instructions after vectorization with VF. 7888 CM.collectUniformsAndScalars(VF); 7889 7890 // Collect the instructions (and their associated costs) that will be more 7891 // profitable to scalarize. 7892 if (VF.isVector()) 7893 CM.collectInstsToScalarize(VF); 7894 } 7895 7896 CM.collectInLoopReductions(); 7897 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7898 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7899 7900 LLVM_DEBUG(printPlans(dbgs())); 7901 if (!MaxFactors.hasVector()) 7902 return VectorizationFactor::Disabled(); 7903 7904 // Select the optimal vectorization factor. 7905 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7906 7907 // Check if it is profitable to vectorize with runtime checks. 
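  // For illustration (a hypothetical loop): vectorizing
  //   for (i = 0; i < n; i++)
  //     A[i] = B[i] + C[i];
  // when A, B and C cannot be proven disjoint requires runtime checks that
  // the accessed ranges do not overlap. Each check adds setup code and
  // branches in front of the vector loop, so beyond a threshold the checks
  // are considered to outweigh the benefit and vectorization is abandoned
  // below.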
7908 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7909 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7910 bool PragmaThresholdReached = 7911 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7912 bool ThresholdReached = 7913 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7914 if ((ThresholdReached && !Hints.allowReordering()) || 7915 PragmaThresholdReached) { 7916 ORE->emit([&]() { 7917 return OptimizationRemarkAnalysisAliasing( 7918 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7919 OrigLoop->getHeader()) 7920 << "loop not vectorized: cannot prove it is safe to reorder " 7921 "memory operations"; 7922 }); 7923 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7924 Hints.emitRemarkWithHints(); 7925 return VectorizationFactor::Disabled(); 7926 } 7927 } 7928 return SelectedVF; 7929 } 7930 7931 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7932 assert(count_if(VPlans, 7933 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7934 1 && 7935 "Best VF has not a single VPlan."); 7936 7937 for (const VPlanPtr &Plan : VPlans) { 7938 if (Plan->hasVF(VF)) 7939 return *Plan.get(); 7940 } 7941 llvm_unreachable("No plan found!"); 7942 } 7943 7944 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7945 VPlan &BestVPlan, 7946 InnerLoopVectorizer &ILV, 7947 DominatorTree *DT) { 7948 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7949 << '\n'); 7950 7951 // Perform the actual loop transformation. 7952 7953 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7954 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7955 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7956 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7957 State.CanonicalIV = ILV.Induction; 7958 ILV.collectPoisonGeneratingRecipes(State); 7959 7960 ILV.printDebugTracesAtStart(); 7961 7962 //===------------------------------------------------===// 7963 // 7964 // Notice: any optimization or new instruction that go 7965 // into the code below should also be implemented in 7966 // the cost-model. 7967 // 7968 //===------------------------------------------------===// 7969 7970 // 2. Copy and widen instructions from the old loop into the new loop. 7971 BestVPlan.execute(&State); 7972 7973 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7974 // predication, updating analyses. 
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so an original exit
  // condition will be dead after vectorization if it is only used by the
  // terminator.
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // An operand of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
                                        Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *Ty = Val->getType();
  assert(!Ty->isVectorTy() && "Val must be a scalar");

  if (Ty->isFloatingPointTy()) {
    // Floating-point operations inherit FMF via the builder's flags.
    Value *MulOp = Builder.CreateFMul(StartIdx, Step);
    return Builder.CreateBinOp(BinOp, Val, MulOp);
  }
  return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
}

static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
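    // As a sketch of what is being scanned (metadata ids are illustrative),
    // loop metadata is a self-referential node such as:
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.unroll.disable"}
    // Operand 0 is the self reference; the remaining operands are checked for
    // a string beginning with "llvm.loop.unroll.disable".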
8062 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8063 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8064 if (MD) { 8065 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8066 IsUnrollMetadata = 8067 S && S->getString().startswith("llvm.loop.unroll.disable"); 8068 } 8069 MDs.push_back(LoopID->getOperand(i)); 8070 } 8071 } 8072 8073 if (!IsUnrollMetadata) { 8074 // Add runtime unroll disable metadata. 8075 LLVMContext &Context = L->getHeader()->getContext(); 8076 SmallVector<Metadata *, 1> DisableOperands; 8077 DisableOperands.push_back( 8078 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8079 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8080 MDs.push_back(DisableNode); 8081 MDNode *NewLoopID = MDNode::get(Context, MDs); 8082 // Set operand 0 to refer to the loop id itself. 8083 NewLoopID->replaceOperandWith(0, NewLoopID); 8084 L->setLoopID(NewLoopID); 8085 } 8086 } 8087 8088 //===--------------------------------------------------------------------===// 8089 // EpilogueVectorizerMainLoop 8090 //===--------------------------------------------------------------------===// 8091 8092 /// This function is partially responsible for generating the control flow 8093 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8094 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8095 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8096 Loop *Lp = createVectorLoopSkeleton(""); 8097 8098 // Generate the code to check the minimum iteration count of the vector 8099 // epilogue (see below). 8100 EPI.EpilogueIterationCountCheck = 8101 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8102 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8103 8104 // Generate the code to check any assumptions that we've made for SCEV 8105 // expressions. 8106 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8107 8108 // Generate the code that checks at runtime if arrays overlap. We put the 8109 // checks into a separate block to make the more common case of few elements 8110 // faster. 8111 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8112 8113 // Generate the iteration count check for the main loop, *after* the check 8114 // for the epilogue loop, so that the path-length is shorter for the case 8115 // that goes directly through the vector epilogue. The longer-path length for 8116 // the main loop is compensated for, by the gain from vectorizing the larger 8117 // trip count. Note: the branch will get updated later on when we vectorize 8118 // the epilogue. 8119 EPI.MainLoopIterationCountCheck = 8120 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8121 8122 // Generate the induction variable. 8123 OldInduction = Legal->getPrimaryInduction(); 8124 Type *IdxTy = Legal->getWidestInductionType(); 8125 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8126 8127 IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); 8128 Value *Step = getRuntimeVF(B, IdxTy, VF * UF); 8129 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8130 EPI.VectorTripCount = CountRoundDown; 8131 Induction = 8132 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8133 getDebugLocFromInstOrOperands(OldInduction)); 8134 8135 // Skip induction resume value creation here because they will be created in 8136 // the second pass. 
If we created them here, they wouldn't be used anyway, 8137 // because the vplan in the second pass still contains the inductions from the 8138 // original loop. 8139 8140 return completeLoopSkeleton(Lp, OrigLoopID); 8141 } 8142 8143 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8144 LLVM_DEBUG({ 8145 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8146 << "Main Loop VF:" << EPI.MainLoopVF 8147 << ", Main Loop UF:" << EPI.MainLoopUF 8148 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8149 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8150 }); 8151 } 8152 8153 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8154 DEBUG_WITH_TYPE(VerboseDebug, { 8155 dbgs() << "intermediate fn:\n" 8156 << *OrigLoop->getHeader()->getParent() << "\n"; 8157 }); 8158 } 8159 8160 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8161 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8162 assert(L && "Expected valid Loop."); 8163 assert(Bypass && "Expected valid bypass basic block."); 8164 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8165 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8166 Value *Count = getOrCreateTripCount(L); 8167 // Reuse existing vector loop preheader for TC checks. 8168 // Note that new preheader block is generated for vector loop. 8169 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8170 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8171 8172 // Generate code to check if the loop's trip count is less than VF * UF of the 8173 // main vector loop. 8174 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8175 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8176 8177 Value *CheckMinIters = Builder.CreateICmp( 8178 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8179 "min.iters.check"); 8180 8181 if (!ForEpilogue) 8182 TCCheckBlock->setName("vector.main.loop.iter.check"); 8183 8184 // Create new preheader for vector loop. 8185 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8186 DT, LI, nullptr, "vector.ph"); 8187 8188 if (ForEpilogue) { 8189 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8190 DT->getNode(Bypass)->getIDom()) && 8191 "TC check is expected to dominate Bypass"); 8192 8193 // Update dominator for Bypass & LoopExit. 8194 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8195 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8196 // For loops with multiple exits, there's no edge from the middle block 8197 // to exit blocks (as the epilogue must run) and thus no need to update 8198 // the immediate dominator of the exit blocks. 8199 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8200 8201 LoopBypassBlocks.push_back(TCCheckBlock); 8202 8203 // Save the trip count so we don't have to regenerate it in the 8204 // vec.epilog.iter.check. This is safe to do because the trip count 8205 // generated here dominates the vector epilog iter check. 
8206 EPI.TripCount = Count; 8207 } 8208 8209 ReplaceInstWithInst( 8210 TCCheckBlock->getTerminator(), 8211 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8212 8213 return TCCheckBlock; 8214 } 8215 8216 //===--------------------------------------------------------------------===// 8217 // EpilogueVectorizerEpilogueLoop 8218 //===--------------------------------------------------------------------===// 8219 8220 /// This function is partially responsible for generating the control flow 8221 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8222 BasicBlock * 8223 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8224 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8225 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8226 8227 // Now, compare the remaining count and if there aren't enough iterations to 8228 // execute the vectorized epilogue skip to the scalar part. 8229 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8230 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8231 LoopVectorPreHeader = 8232 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8233 LI, nullptr, "vec.epilog.ph"); 8234 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8235 VecEpilogueIterationCountCheck); 8236 8237 // Adjust the control flow taking the state info from the main loop 8238 // vectorization into account. 8239 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8240 "expected this to be saved from the previous pass."); 8241 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8242 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8243 8244 DT->changeImmediateDominator(LoopVectorPreHeader, 8245 EPI.MainLoopIterationCountCheck); 8246 8247 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8248 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8249 8250 if (EPI.SCEVSafetyCheck) 8251 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8252 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8253 if (EPI.MemSafetyCheck) 8254 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8255 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8256 8257 DT->changeImmediateDominator( 8258 VecEpilogueIterationCountCheck, 8259 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8260 8261 DT->changeImmediateDominator(LoopScalarPreHeader, 8262 EPI.EpilogueIterationCountCheck); 8263 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8264 // If there is an epilogue which must run, there's no edge from the 8265 // middle block to exit blocks and thus no need to update the immediate 8266 // dominator of the exit blocks. 8267 DT->changeImmediateDominator(LoopExitBlock, 8268 EPI.EpilogueIterationCountCheck); 8269 8270 // Keep track of bypass blocks, as they feed start values to the induction 8271 // phis in the scalar loop preheader. 
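  // Only checks that were actually emitted during the first pass exist at this
  // point; the SCEV and memory runtime checks are optional and may be null.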
8272 if (EPI.SCEVSafetyCheck) 8273 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8274 if (EPI.MemSafetyCheck) 8275 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8276 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8277 8278 // Generate a resume induction for the vector epilogue and put it in the 8279 // vector epilogue preheader 8280 Type *IdxTy = Legal->getWidestInductionType(); 8281 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8282 LoopVectorPreHeader->getFirstNonPHI()); 8283 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8284 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8285 EPI.MainLoopIterationCountCheck); 8286 8287 // Generate the induction variable. 8288 OldInduction = Legal->getPrimaryInduction(); 8289 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8290 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8291 Value *StartIdx = EPResumeVal; 8292 Induction = 8293 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8294 getDebugLocFromInstOrOperands(OldInduction)); 8295 8296 // Generate induction resume values. These variables save the new starting 8297 // indexes for the scalar loop. They are used to test if there are any tail 8298 // iterations left once the vector loop has completed. 8299 // Note that when the vectorized epilogue is skipped due to iteration count 8300 // check, then the resume value for the induction variable comes from 8301 // the trip count of the main vector loop, hence passing the AdditionalBypass 8302 // argument. 8303 createInductionResumeValues(Lp, CountRoundDown, 8304 {VecEpilogueIterationCountCheck, 8305 EPI.VectorTripCount} /* AdditionalBypass */); 8306 8307 AddRuntimeUnrollDisableMetaData(Lp); 8308 return completeLoopSkeleton(Lp, OrigLoopID); 8309 } 8310 8311 BasicBlock * 8312 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8313 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8314 8315 assert(EPI.TripCount && 8316 "Expected trip count to have been safed in the first pass."); 8317 assert( 8318 (!isa<Instruction>(EPI.TripCount) || 8319 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8320 "saved trip count does not dominate insertion point."); 8321 Value *TC = EPI.TripCount; 8322 IRBuilder<> Builder(Insert->getTerminator()); 8323 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8324 8325 // Generate code to check if the loop's trip count is less than VF * UF of the 8326 // vector epilogue loop. 8327 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? 
8328 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8329 8330 Value *CheckMinIters = 8331 Builder.CreateICmp(P, Count, 8332 createStepForVF(Builder, Count->getType(), 8333 EPI.EpilogueVF, EPI.EpilogueUF), 8334 "min.epilog.iters.check"); 8335 8336 ReplaceInstWithInst( 8337 Insert->getTerminator(), 8338 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8339 8340 LoopBypassBlocks.push_back(Insert); 8341 return Insert; 8342 } 8343 8344 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8345 LLVM_DEBUG({ 8346 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8347 << "Epilogue Loop VF:" << EPI.EpilogueVF 8348 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8349 }); 8350 } 8351 8352 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8353 DEBUG_WITH_TYPE(VerboseDebug, { 8354 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8355 }); 8356 } 8357 8358 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8359 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8360 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8361 bool PredicateAtRangeStart = Predicate(Range.Start); 8362 8363 for (ElementCount TmpVF = Range.Start * 2; 8364 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8365 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8366 Range.End = TmpVF; 8367 break; 8368 } 8369 8370 return PredicateAtRangeStart; 8371 } 8372 8373 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8374 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8375 /// of VF's starting at a given VF and extending it as much as possible. Each 8376 /// vectorization decision can potentially shorten this sub-range during 8377 /// buildVPlan(). 8378 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8379 ElementCount MaxVF) { 8380 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8381 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8382 VFRange SubRange = {VF, MaxVFPlusOne}; 8383 VPlans.push_back(buildVPlan(SubRange)); 8384 VF = SubRange.End; 8385 } 8386 } 8387 8388 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8389 VPlanPtr &Plan) { 8390 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8391 8392 // Look for cached value. 8393 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8394 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8395 if (ECEntryIt != EdgeMaskCache.end()) 8396 return ECEntryIt->second; 8397 8398 VPValue *SrcMask = createBlockInMask(Src, Plan); 8399 8400 // The terminator has to be a branch inst! 8401 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8402 assert(BI && "Unexpected terminator found"); 8403 8404 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8405 return EdgeMaskCache[Edge] = SrcMask; 8406 8407 // If source is an exiting block, we know the exit edge is dynamically dead 8408 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8409 // adding uses of an otherwise potentially dead instruction. 
8410 if (OrigLoop->isLoopExiting(Src)) 8411 return EdgeMaskCache[Edge] = SrcMask; 8412 8413 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8414 assert(EdgeMask && "No Edge Mask found for condition"); 8415 8416 if (BI->getSuccessor(0) != Dst) 8417 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8418 8419 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8420 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8421 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8422 // The select version does not introduce new UB if SrcMask is false and 8423 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8424 VPValue *False = Plan->getOrAddVPValue( 8425 ConstantInt::getFalse(BI->getCondition()->getType())); 8426 EdgeMask = 8427 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8428 } 8429 8430 return EdgeMaskCache[Edge] = EdgeMask; 8431 } 8432 8433 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8434 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8435 8436 // Look for cached value. 8437 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8438 if (BCEntryIt != BlockMaskCache.end()) 8439 return BCEntryIt->second; 8440 8441 // All-one mask is modelled as no-mask following the convention for masked 8442 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8443 VPValue *BlockMask = nullptr; 8444 8445 if (OrigLoop->getHeader() == BB) { 8446 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8447 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8448 8449 // Introduce the early-exit compare IV <= BTC to form header block mask. 8450 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8451 // Start by constructing the desired canonical IV in the header block. 8452 VPValue *IV = nullptr; 8453 if (Legal->getPrimaryInduction()) 8454 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8455 else { 8456 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8457 auto *IVRecipe = new VPWidenCanonicalIVRecipe(); 8458 HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi()); 8459 IV = IVRecipe; 8460 } 8461 8462 // Create the block in mask as the first non-phi instruction in the block. 8463 VPBuilder::InsertPointGuard Guard(Builder); 8464 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8465 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8466 8467 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8468 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8469 8470 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8471 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8472 // as a second argument, we only pass the IV here and extract the 8473 // tripcount from the transform state where codegen of the VP instructions 8474 // happen. 8475 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8476 } else { 8477 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8478 } 8479 return BlockMaskCache[BB] = BlockMask; 8480 } 8481 8482 // This is the block mask. We OR all incoming edges. 8483 for (auto *Predecessor : predecessors(BB)) { 8484 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8485 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8486 return BlockMaskCache[BB] = EdgeMask; 8487 8488 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
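      // First incoming edge: seed the block mask with its edge mask. Masks of
      // any remaining edges are OR'd in below.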
8489 BlockMask = EdgeMask; 8490 continue; 8491 } 8492 8493 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8494 } 8495 8496 return BlockMaskCache[BB] = BlockMask; 8497 } 8498 8499 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8500 ArrayRef<VPValue *> Operands, 8501 VFRange &Range, 8502 VPlanPtr &Plan) { 8503 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8504 "Must be called with either a load or store"); 8505 8506 auto willWiden = [&](ElementCount VF) -> bool { 8507 if (VF.isScalar()) 8508 return false; 8509 LoopVectorizationCostModel::InstWidening Decision = 8510 CM.getWideningDecision(I, VF); 8511 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8512 "CM decision should be taken at this point."); 8513 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8514 return true; 8515 if (CM.isScalarAfterVectorization(I, VF) || 8516 CM.isProfitableToScalarize(I, VF)) 8517 return false; 8518 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8519 }; 8520 8521 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8522 return nullptr; 8523 8524 VPValue *Mask = nullptr; 8525 if (Legal->isMaskRequired(I)) 8526 Mask = createBlockInMask(I->getParent(), Plan); 8527 8528 // Determine if the pointer operand of the access is either consecutive or 8529 // reverse consecutive. 8530 LoopVectorizationCostModel::InstWidening Decision = 8531 CM.getWideningDecision(I, Range.Start); 8532 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8533 bool Consecutive = 8534 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8535 8536 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8537 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8538 Consecutive, Reverse); 8539 8540 StoreInst *Store = cast<StoreInst>(I); 8541 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8542 Mask, Consecutive, Reverse); 8543 } 8544 8545 VPWidenIntOrFpInductionRecipe * 8546 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8547 ArrayRef<VPValue *> Operands) const { 8548 // Check if this is an integer or fp induction. If so, build the recipe that 8549 // produces its scalar and vector values. 8550 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { 8551 assert(II->getStartValue() == 8552 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8553 return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); 8554 } 8555 8556 return nullptr; 8557 } 8558 8559 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8560 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8561 VPlan &Plan) const { 8562 // Optimize the special case where the source is a constant integer 8563 // induction variable. Notice that we can only optimize the 'trunc' case 8564 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8565 // (c) other casts depend on pointer size. 8566 8567 // Determine whether \p K is a truncation based on an induction variable that 8568 // can be optimized. 
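  // For example (illustrative), given
  //   %iv    = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  //   %trunc = trunc i64 %iv to i32
  // the truncation can be widened directly as an i32 induction instead of
  // widening the i64 induction and truncating every element.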
8569 auto isOptimizableIVTruncate = 8570 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8571 return [=](ElementCount VF) -> bool { 8572 return CM.isOptimizableIVTruncate(K, VF); 8573 }; 8574 }; 8575 8576 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8577 isOptimizableIVTruncate(I), Range)) { 8578 8579 auto *Phi = cast<PHINode>(I->getOperand(0)); 8580 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8581 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8582 return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); 8583 } 8584 return nullptr; 8585 } 8586 8587 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8588 ArrayRef<VPValue *> Operands, 8589 VPlanPtr &Plan) { 8590 // If all incoming values are equal, the incoming VPValue can be used directly 8591 // instead of creating a new VPBlendRecipe. 8592 VPValue *FirstIncoming = Operands[0]; 8593 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8594 return FirstIncoming == Inc; 8595 })) { 8596 return Operands[0]; 8597 } 8598 8599 // We know that all PHIs in non-header blocks are converted into selects, so 8600 // we don't have to worry about the insertion order and we can just use the 8601 // builder. At this point we generate the predication tree. There may be 8602 // duplications since this is a simple recursive scan, but future 8603 // optimizations will clean it up. 8604 SmallVector<VPValue *, 2> OperandsWithMask; 8605 unsigned NumIncoming = Phi->getNumIncomingValues(); 8606 8607 for (unsigned In = 0; In < NumIncoming; In++) { 8608 VPValue *EdgeMask = 8609 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8610 assert((EdgeMask || NumIncoming == 1) && 8611 "Multiple predecessors with one having a full mask"); 8612 OperandsWithMask.push_back(Operands[In]); 8613 if (EdgeMask) 8614 OperandsWithMask.push_back(EdgeMask); 8615 } 8616 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8617 } 8618 8619 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8620 ArrayRef<VPValue *> Operands, 8621 VFRange &Range) const { 8622 8623 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8624 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8625 Range); 8626 8627 if (IsPredicated) 8628 return nullptr; 8629 8630 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8631 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8632 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8633 ID == Intrinsic::pseudoprobe || 8634 ID == Intrinsic::experimental_noalias_scope_decl)) 8635 return nullptr; 8636 8637 auto willWiden = [&](ElementCount VF) -> bool { 8638 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8639 // The following case may be scalarized depending on the VF. 8640 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8641 // version of the instruction. 8642 // Is it beneficial to perform intrinsic call compared to lib call? 8643 bool NeedToScalarize = false; 8644 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8645 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8646 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8647 return UseVectorIntrinsic || !NeedToScalarize; 8648 }; 8649 8650 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8651 return nullptr; 8652 8653 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8654 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8655 } 8656 8657 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8658 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8659 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8660 // Instruction should be widened, unless it is scalar after vectorization, 8661 // scalarization is profitable or it is predicated. 8662 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8663 return CM.isScalarAfterVectorization(I, VF) || 8664 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8665 }; 8666 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8667 Range); 8668 } 8669 8670 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8671 ArrayRef<VPValue *> Operands) const { 8672 auto IsVectorizableOpcode = [](unsigned Opcode) { 8673 switch (Opcode) { 8674 case Instruction::Add: 8675 case Instruction::And: 8676 case Instruction::AShr: 8677 case Instruction::BitCast: 8678 case Instruction::FAdd: 8679 case Instruction::FCmp: 8680 case Instruction::FDiv: 8681 case Instruction::FMul: 8682 case Instruction::FNeg: 8683 case Instruction::FPExt: 8684 case Instruction::FPToSI: 8685 case Instruction::FPToUI: 8686 case Instruction::FPTrunc: 8687 case Instruction::FRem: 8688 case Instruction::FSub: 8689 case Instruction::ICmp: 8690 case Instruction::IntToPtr: 8691 case Instruction::LShr: 8692 case Instruction::Mul: 8693 case Instruction::Or: 8694 case Instruction::PtrToInt: 8695 case Instruction::SDiv: 8696 case Instruction::Select: 8697 case Instruction::SExt: 8698 case Instruction::Shl: 8699 case Instruction::SIToFP: 8700 case Instruction::SRem: 8701 case Instruction::Sub: 8702 case Instruction::Trunc: 8703 case Instruction::UDiv: 8704 case Instruction::UIToFP: 8705 case Instruction::URem: 8706 case Instruction::Xor: 8707 case Instruction::ZExt: 8708 return true; 8709 } 8710 return false; 8711 }; 8712 8713 if (!IsVectorizableOpcode(I->getOpcode())) 8714 return nullptr; 8715 8716 // Success: widen this instruction. 
8717 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8718 } 8719 8720 void VPRecipeBuilder::fixHeaderPhis() { 8721 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8722 for (VPWidenPHIRecipe *R : PhisToFix) { 8723 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8724 VPRecipeBase *IncR = 8725 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8726 R->addOperand(IncR->getVPSingleValue()); 8727 } 8728 } 8729 8730 VPBasicBlock *VPRecipeBuilder::handleReplication( 8731 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8732 VPlanPtr &Plan) { 8733 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8734 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8735 Range); 8736 8737 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8738 [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, 8739 Range); 8740 8741 // Even if the instruction is not marked as uniform, there are certain 8742 // intrinsic calls that can be effectively treated as such, so we check for 8743 // them here. Conservatively, we only do this for scalable vectors, since 8744 // for fixed-width VFs we can always fall back on full scalarization. 8745 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8746 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8747 case Intrinsic::assume: 8748 case Intrinsic::lifetime_start: 8749 case Intrinsic::lifetime_end: 8750 // For scalable vectors if one of the operands is variant then we still 8751 // want to mark as uniform, which will generate one instruction for just 8752 // the first lane of the vector. We can't scalarize the call in the same 8753 // way as for fixed-width vectors because we don't know how many lanes 8754 // there are. 8755 // 8756 // The reasons for doing it this way for scalable vectors are: 8757 // 1. For the assume intrinsic generating the instruction for the first 8758 // lane is still be better than not generating any at all. For 8759 // example, the input may be a splat across all lanes. 8760 // 2. For the lifetime start/end intrinsics the pointer operand only 8761 // does anything useful when the input comes from a stack object, 8762 // which suggests it should always be uniform. For non-stack objects 8763 // the effect is to poison the object, which still allows us to 8764 // remove the call. 8765 IsUniform = true; 8766 break; 8767 default: 8768 break; 8769 } 8770 } 8771 8772 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8773 IsUniform, IsPredicated); 8774 setRecipe(I, Recipe); 8775 Plan->addVPValue(I, Recipe); 8776 8777 // Find if I uses a predicated instruction. If so, it will use its scalar 8778 // value. Avoid hoisting the insert-element which packs the scalar value into 8779 // a vector value, as that happens iff all users use the vector value. 8780 for (VPValue *Op : Recipe->operands()) { 8781 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8782 if (!PredR) 8783 continue; 8784 auto *RepR = 8785 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8786 assert(RepR->isPredicated() && 8787 "expected Replicate recipe to be predicated"); 8788 RepR->setAlsoPack(false); 8789 } 8790 8791 // Finalize the recipe for Instr, first if it is not predicated. 
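  // Unpredicated recipes are simply appended to the current VPBasicBlock;
  // predicated ones are wrapped below in a triangle-shaped replicate region
  // (pred.*.entry -> pred.*.if -> pred.*.continue) built by
  // createReplicateRegion().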
8792 if (!IsPredicated) { 8793 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8794 VPBB->appendRecipe(Recipe); 8795 return VPBB; 8796 } 8797 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8798 8799 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8800 assert(SingleSucc && "VPBB must have a single successor when handling " 8801 "predicated replication."); 8802 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8803 // Record predicated instructions for above packing optimizations. 8804 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8805 VPBlockUtils::insertBlockAfter(Region, VPBB); 8806 auto *RegSucc = new VPBasicBlock(); 8807 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8808 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8809 return RegSucc; 8810 } 8811 8812 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8813 VPRecipeBase *PredRecipe, 8814 VPlanPtr &Plan) { 8815 // Instructions marked for predication are replicated and placed under an 8816 // if-then construct to prevent side-effects. 8817 8818 // Generate recipes to compute the block mask for this region. 8819 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8820 8821 // Build the triangular if-then region. 8822 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8823 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8824 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8825 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8826 auto *PHIRecipe = Instr->getType()->isVoidTy() 8827 ? nullptr 8828 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8829 if (PHIRecipe) { 8830 Plan->removeVPValueFor(Instr); 8831 Plan->addVPValue(Instr, PHIRecipe); 8832 } 8833 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8834 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8835 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8836 8837 // Note: first set Entry as region entry and then connect successors starting 8838 // from it in order, to propagate the "parent" of each VPBasicBlock. 8839 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8840 VPBlockUtils::connectBlocks(Pred, Exit); 8841 8842 return Region; 8843 } 8844 8845 VPRecipeOrVPValueTy 8846 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8847 ArrayRef<VPValue *> Operands, 8848 VFRange &Range, VPlanPtr &Plan) { 8849 // First, check for specific widening recipes that deal with calls, memory 8850 // operations, inductions and Phi nodes. 
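  // The checks below are tried in order: calls, loads/stores, phis (header
  // inductions/reductions/recurrences and non-header blends) and optimizable
  // induction truncates. Anything left falls through to the generic widening
  // recipes or, if shouldWiden() declines, to replication in the caller.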
8851 if (auto *CI = dyn_cast<CallInst>(Instr)) 8852 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8853 8854 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8855 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8856 8857 VPRecipeBase *Recipe; 8858 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8859 if (Phi->getParent() != OrigLoop->getHeader()) 8860 return tryToBlend(Phi, Operands, Plan); 8861 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8862 return toVPRecipeResult(Recipe); 8863 8864 VPWidenPHIRecipe *PhiRecipe = nullptr; 8865 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8866 VPValue *StartV = Operands[0]; 8867 if (Legal->isReductionVariable(Phi)) { 8868 const RecurrenceDescriptor &RdxDesc = 8869 Legal->getReductionVars().find(Phi)->second; 8870 assert(RdxDesc.getRecurrenceStartValue() == 8871 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8872 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8873 CM.isInLoopReduction(Phi), 8874 CM.useOrderedReductions(RdxDesc)); 8875 } else { 8876 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8877 } 8878 8879 // Record the incoming value from the backedge, so we can add the incoming 8880 // value from the backedge after all recipes have been created. 8881 recordRecipeOf(cast<Instruction>( 8882 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8883 PhisToFix.push_back(PhiRecipe); 8884 } else { 8885 // TODO: record start and backedge value for remaining pointer induction 8886 // phis. 8887 assert(Phi->getType()->isPointerTy() && 8888 "only pointer phis should be handled here"); 8889 PhiRecipe = new VPWidenPHIRecipe(Phi); 8890 } 8891 8892 return toVPRecipeResult(PhiRecipe); 8893 } 8894 8895 if (isa<TruncInst>(Instr) && 8896 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8897 Range, *Plan))) 8898 return toVPRecipeResult(Recipe); 8899 8900 if (!shouldWiden(Instr, Range)) 8901 return nullptr; 8902 8903 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8904 return toVPRecipeResult(new VPWidenGEPRecipe( 8905 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8906 8907 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8908 bool InvariantCond = 8909 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8910 return toVPRecipeResult(new VPWidenSelectRecipe( 8911 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8912 } 8913 8914 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8915 } 8916 8917 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8918 ElementCount MaxVF) { 8919 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8920 8921 // Collect instructions from the original loop that will become trivially dead 8922 // in the vectorized loop. We don't need to vectorize these instructions. For 8923 // example, original induction update instructions can become dead because we 8924 // separately emit induction "steps" when generating code for the new loop. 8925 // Similarly, we create a new latch condition when setting up the structure 8926 // of the new loop, so the old one can become dead. 8927 SmallPtrSet<Instruction *, 4> DeadInstructions; 8928 collectTriviallyDeadInstructions(DeadInstructions); 8929 8930 // Add assume instructions we need to drop to DeadInstructions, to prevent 8931 // them from being added to the VPlan. 8932 // TODO: We only need to drop assumes in blocks that get flattend. 
If the 8933 // control flow is preserved, we should keep them. 8934 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8935 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8936 8937 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8938 // Dead instructions do not need sinking. Remove them from SinkAfter. 8939 for (Instruction *I : DeadInstructions) 8940 SinkAfter.erase(I); 8941 8942 // Cannot sink instructions after dead instructions (there won't be any 8943 // recipes for them). Instead, find the first non-dead previous instruction. 8944 for (auto &P : Legal->getSinkAfter()) { 8945 Instruction *SinkTarget = P.second; 8946 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8947 (void)FirstInst; 8948 while (DeadInstructions.contains(SinkTarget)) { 8949 assert( 8950 SinkTarget != FirstInst && 8951 "Must find a live instruction (at least the one feeding the " 8952 "first-order recurrence PHI) before reaching beginning of the block"); 8953 SinkTarget = SinkTarget->getPrevNode(); 8954 assert(SinkTarget != P.first && 8955 "sink source equals target, no sinking required"); 8956 } 8957 P.second = SinkTarget; 8958 } 8959 8960 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8961 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8962 VFRange SubRange = {VF, MaxVFPlusOne}; 8963 VPlans.push_back( 8964 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8965 VF = SubRange.End; 8966 } 8967 } 8968 8969 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8970 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8971 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8972 8973 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8974 8975 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8976 8977 // --------------------------------------------------------------------------- 8978 // Pre-construction: record ingredients whose recipes we'll need to further 8979 // process after constructing the initial VPlan. 8980 // --------------------------------------------------------------------------- 8981 8982 // Mark instructions we'll need to sink later and their targets as 8983 // ingredients whose recipe we'll need to record. 8984 for (auto &Entry : SinkAfter) { 8985 RecipeBuilder.recordRecipeOf(Entry.first); 8986 RecipeBuilder.recordRecipeOf(Entry.second); 8987 } 8988 for (auto &Reduction : CM.getInLoopReductionChains()) { 8989 PHINode *Phi = Reduction.first; 8990 RecurKind Kind = 8991 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8992 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8993 8994 RecipeBuilder.recordRecipeOf(Phi); 8995 for (auto &R : ReductionOperations) { 8996 RecipeBuilder.recordRecipeOf(R); 8997 // For min/max reducitons, where we have a pair of icmp/select, we also 8998 // need to record the ICmp recipe, so it can be removed later. 
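      // A min/max recurrence is matched as a compare/select pair, e.g.
      // (illustrative):
      //   %cmp      = icmp slt i32 %x, %min
      //   %min.next = select i1 %cmp, i32 %x, i32 %min
      // Recording the compare lets adjustRecipesForReductions() erase its
      // widened recipe once the pair is replaced by a reduction recipe.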
8999 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9000 "Only min/max recurrences allowed for inloop reductions"); 9001 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9002 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9003 } 9004 } 9005 9006 // For each interleave group which is relevant for this (possibly trimmed) 9007 // Range, add it to the set of groups to be later applied to the VPlan and add 9008 // placeholders for its members' Recipes which we'll be replacing with a 9009 // single VPInterleaveRecipe. 9010 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9011 auto applyIG = [IG, this](ElementCount VF) -> bool { 9012 return (VF.isVector() && // Query is illegal for VF == 1 9013 CM.getWideningDecision(IG->getInsertPos(), VF) == 9014 LoopVectorizationCostModel::CM_Interleave); 9015 }; 9016 if (!getDecisionAndClampRange(applyIG, Range)) 9017 continue; 9018 InterleaveGroups.insert(IG); 9019 for (unsigned i = 0; i < IG->getFactor(); i++) 9020 if (Instruction *Member = IG->getMember(i)) 9021 RecipeBuilder.recordRecipeOf(Member); 9022 }; 9023 9024 // --------------------------------------------------------------------------- 9025 // Build initial VPlan: Scan the body of the loop in a topological order to 9026 // visit each basic block after having visited its predecessor basic blocks. 9027 // --------------------------------------------------------------------------- 9028 9029 // Create initial VPlan skeleton, with separate header and latch blocks. 9030 VPBasicBlock *HeaderVPBB = new VPBasicBlock(); 9031 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 9032 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 9033 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 9034 auto Plan = std::make_unique<VPlan>(TopRegion); 9035 9036 // Scan the body of the loop in a topological order to visit each basic block 9037 // after having visited its predecessor basic blocks. 9038 LoopBlocksDFS DFS(OrigLoop); 9039 DFS.perform(LI); 9040 9041 VPBasicBlock *VPBB = HeaderVPBB; 9042 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9043 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9044 // Relevant instructions from basic block BB will be grouped into VPRecipe 9045 // ingredients and fill a new VPBasicBlock. 9046 unsigned VPBBsForBB = 0; 9047 VPBB->setName(BB->getName()); 9048 Builder.setInsertPoint(VPBB); 9049 9050 // Introduce each ingredient into VPlan. 9051 // TODO: Model and preserve debug instrinsics in VPlan. 9052 for (Instruction &I : BB->instructionsWithoutDebug()) { 9053 Instruction *Instr = &I; 9054 9055 // First filter out irrelevant instructions, to ensure no recipes are 9056 // built for them. 9057 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9058 continue; 9059 9060 SmallVector<VPValue *, 4> Operands; 9061 auto *Phi = dyn_cast<PHINode>(Instr); 9062 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9063 Operands.push_back(Plan->getOrAddVPValue( 9064 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9065 } else { 9066 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9067 Operands = {OpRange.begin(), OpRange.end()}; 9068 } 9069 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9070 Instr, Operands, Range, Plan)) { 9071 // If Instr can be simplified to an existing VPValue, use it. 
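        // This happens, for instance, when tryToBlend() finds that all
        // incoming values of a non-header phi are identical and returns the
        // existing VPValue instead of creating a VPBlendRecipe.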
9072 if (RecipeOrValue.is<VPValue *>()) { 9073 auto *VPV = RecipeOrValue.get<VPValue *>(); 9074 Plan->addVPValue(Instr, VPV); 9075 // If the re-used value is a recipe, register the recipe for the 9076 // instruction, in case the recipe for Instr needs to be recorded. 9077 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9078 RecipeBuilder.setRecipe(Instr, R); 9079 continue; 9080 } 9081 // Otherwise, add the new recipe. 9082 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9083 for (auto *Def : Recipe->definedValues()) { 9084 auto *UV = Def->getUnderlyingValue(); 9085 Plan->addVPValue(UV, Def); 9086 } 9087 9088 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9089 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9090 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9091 // of the header block. That can happen for truncates of induction 9092 // variables. Those recipes are moved to the phi section of the header 9093 // block after applying SinkAfter, which relies on the original 9094 // position of the trunc. 9095 assert(isa<TruncInst>(Instr)); 9096 InductionsToMove.push_back( 9097 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9098 } 9099 RecipeBuilder.setRecipe(Instr, Recipe); 9100 VPBB->appendRecipe(Recipe); 9101 continue; 9102 } 9103 9104 // Otherwise, if all widening options failed, Instruction is to be 9105 // replicated. This may create a successor for VPBB. 9106 VPBasicBlock *NextVPBB = 9107 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9108 if (NextVPBB != VPBB) { 9109 VPBB = NextVPBB; 9110 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9111 : ""); 9112 } 9113 } 9114 9115 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9116 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9117 } 9118 9119 // Fold the last, empty block into its predecessor. 9120 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9121 assert(VPBB && "expected to fold last (empty) block"); 9122 // After here, VPBB should not be used. 9123 VPBB = nullptr; 9124 9125 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9126 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9127 "entry block must be set to a VPRegionBlock having a non-empty entry " 9128 "VPBasicBlock"); 9129 RecipeBuilder.fixHeaderPhis(); 9130 9131 // --------------------------------------------------------------------------- 9132 // Transform initial VPlan: Apply previously taken decisions, in order, to 9133 // bring the VPlan to its final state. 9134 // --------------------------------------------------------------------------- 9135 9136 // Apply Sink-After legal constraints. 9137 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9138 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9139 if (Region && Region->isReplicator()) { 9140 assert(Region->getNumSuccessors() == 1 && 9141 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9142 assert(R->getParent()->size() == 1 && 9143 "A recipe in an original replicator region must be the only " 9144 "recipe in its block"); 9145 return Region; 9146 } 9147 return nullptr; 9148 }; 9149 for (auto &Entry : SinkAfter) { 9150 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9151 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9152 9153 auto *TargetRegion = GetReplicateRegion(Target); 9154 auto *SinkRegion = GetReplicateRegion(Sink); 9155 if (!SinkRegion) { 9156 // If the sink source is not a replicate region, sink the recipe directly. 
9157 if (TargetRegion) { 9158 // The target is in a replication region, make sure to move Sink to 9159 // the block after it, not into the replication region itself. 9160 VPBasicBlock *NextBlock = 9161 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9162 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9163 } else 9164 Sink->moveAfter(Target); 9165 continue; 9166 } 9167 9168 // The sink source is in a replicate region. Unhook the region from the CFG. 9169 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9170 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9171 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9172 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9173 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9174 9175 if (TargetRegion) { 9176 // The target recipe is also in a replicate region, move the sink region 9177 // after the target region. 9178 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9179 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9180 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9181 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9182 } else { 9183 // The sink source is in a replicate region, we need to move the whole 9184 // replicate region, which should only contain a single recipe in the 9185 // main block. 9186 auto *SplitBlock = 9187 Target->getParent()->splitAt(std::next(Target->getIterator())); 9188 9189 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9190 9191 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9192 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9193 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9194 } 9195 } 9196 9197 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9198 9199 // Now that sink-after is done, move induction recipes for optimized truncates 9200 // to the phi section of the header block. 9201 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9202 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9203 9204 // Adjust the recipes for any inloop reductions. 9205 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9206 RecipeBuilder, Range.Start); 9207 9208 // Introduce a recipe to combine the incoming and previous values of a 9209 // first-order recurrence. 9210 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9211 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9212 if (!RecurPhi) 9213 continue; 9214 9215 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9216 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9217 auto *Region = GetReplicateRegion(PrevRecipe); 9218 if (Region) 9219 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9220 if (Region || PrevRecipe->isPhi()) 9221 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9222 else 9223 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9224 9225 auto *RecurSplice = cast<VPInstruction>( 9226 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9227 {RecurPhi, RecurPhi->getBackedgeValue()})); 9228 9229 RecurPhi->replaceAllUsesWith(RecurSplice); 9230 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9231 // all users. 9232 RecurSplice->setOperand(0, RecurPhi); 9233 } 9234 9235 // Interleave memory: for each Interleave Group we marked earlier as relevant 9236 // for this VPlan, replace the Recipes widening its memory instructions with a 9237 // single VPInterleaveRecipe at its insertion point. 
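  // For instance (illustrative), an interleave group of factor 2 over
  //   %a = load i32, i32* %p       ; member 0
  //   %b = load i32, i32* %p1      ; member 1, %p1 == %p + 1
  // becomes a single wide load of 2*VF elements followed by shuffles that
  // de-interleave the even and odd lanes.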
9238 for (auto IG : InterleaveGroups) { 9239 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9240 RecipeBuilder.getRecipe(IG->getInsertPos())); 9241 SmallVector<VPValue *, 4> StoredValues; 9242 for (unsigned i = 0; i < IG->getFactor(); ++i) 9243 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9244 auto *StoreR = 9245 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9246 StoredValues.push_back(StoreR->getStoredValue()); 9247 } 9248 9249 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9250 Recipe->getMask()); 9251 VPIG->insertBefore(Recipe); 9252 unsigned J = 0; 9253 for (unsigned i = 0; i < IG->getFactor(); ++i) 9254 if (Instruction *Member = IG->getMember(i)) { 9255 if (!Member->getType()->isVoidTy()) { 9256 VPValue *OriginalV = Plan->getVPValue(Member); 9257 Plan->removeVPValueFor(Member); 9258 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9259 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9260 J++; 9261 } 9262 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9263 } 9264 } 9265 9266 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9267 // in ways that accessing values using original IR values is incorrect. 9268 Plan->disableValue2VPValue(); 9269 9270 VPlanTransforms::sinkScalarOperands(*Plan); 9271 VPlanTransforms::mergeReplicateRegions(*Plan); 9272 9273 std::string PlanName; 9274 raw_string_ostream RSO(PlanName); 9275 ElementCount VF = Range.Start; 9276 Plan->addVF(VF); 9277 RSO << "Initial VPlan for VF={" << VF; 9278 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9279 Plan->addVF(VF); 9280 RSO << "," << VF; 9281 } 9282 RSO << "},UF>=1"; 9283 RSO.flush(); 9284 Plan->setName(PlanName); 9285 9286 // Fold Exit block into its predecessor if possible. 9287 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9288 // VPBasicBlock as exit. 9289 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); 9290 9291 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9292 return Plan; 9293 } 9294 9295 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9296 // Outer loop handling: They may require CFG and instruction level 9297 // transformations before even evaluating whether vectorization is profitable. 9298 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9299 // the vectorization pipeline. 9300 assert(!OrigLoop->isInnermost()); 9301 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9302 9303 // Create new empty VPlan 9304 auto Plan = std::make_unique<VPlan>(); 9305 9306 // Build hierarchical CFG 9307 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9308 HCFGBuilder.buildHierarchicalCFG(); 9309 9310 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9311 VF *= 2) 9312 Plan->addVF(VF); 9313 9314 if (EnableVPlanPredication) { 9315 VPlanPredicator VPP(*Plan); 9316 VPP.predicate(); 9317 9318 // Avoid running transformation to recipes until masked code generation in 9319 // VPlan-native path is in place. 9320 return Plan; 9321 } 9322 9323 SmallPtrSet<Instruction *, 1> DeadInstructions; 9324 VPlanTransforms::VPInstructionsToVPRecipes( 9325 OrigLoop, Plan, 9326 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9327 DeadInstructions, *PSE.getSE()); 9328 return Plan; 9329 } 9330 9331 // Adjust the recipes for reductions. 
For in-loop reductions the chain of 9332 // instructions leading from the loop exit instr to the phi need to be converted 9333 // to reductions, with one operand being vector and the other being the scalar 9334 // reduction chain. For other reductions, a select is introduced between the phi 9335 // and live-out recipes when folding the tail. 9336 void LoopVectorizationPlanner::adjustRecipesForReductions( 9337 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 9338 ElementCount MinVF) { 9339 for (auto &Reduction : CM.getInLoopReductionChains()) { 9340 PHINode *Phi = Reduction.first; 9341 const RecurrenceDescriptor &RdxDesc = 9342 Legal->getReductionVars().find(Phi)->second; 9343 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9344 9345 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) 9346 continue; 9347 9348 // ReductionOperations are orders top-down from the phi's use to the 9349 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 9350 // which of the two operands will remain scalar and which will be reduced. 9351 // For minmax the chain will be the select instructions. 9352 Instruction *Chain = Phi; 9353 for (Instruction *R : ReductionOperations) { 9354 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9355 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9356 9357 VPValue *ChainOp = Plan->getVPValue(Chain); 9358 unsigned FirstOpId; 9359 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9360 "Only min/max recurrences allowed for inloop reductions"); 9361 // Recognize a call to the llvm.fmuladd intrinsic. 9362 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9363 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9364 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9365 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9366 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9367 "Expected to replace a VPWidenSelectSC"); 9368 FirstOpId = 1; 9369 } else { 9370 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9371 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9372 "Expected to replace a VPWidenSC"); 9373 FirstOpId = 0; 9374 } 9375 unsigned VecOpId = 9376 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9377 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9378 9379 auto *CondOp = CM.foldTailByMasking() 9380 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9381 : nullptr; 9382 9383 if (IsFMulAdd) { 9384 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9385 // need to create an fmul recipe to use as the vector operand for the 9386 // fadd reduction. 
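        // That is, for an in-loop reduction with
        //   %sum.next = call float @llvm.fmuladd.f32(float %a, float %b,
        //                                            float %sum)
        // (illustrative), the multiply %a * %b becomes a separate FMul recipe
        // whose result is then reduced into the chain by the fadd reduction.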
9387 VPInstruction *FMulRecipe = new VPInstruction( 9388 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9389 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9390 WidenRecipe->getParent()->insert(FMulRecipe, 9391 WidenRecipe->getIterator()); 9392 VecOp = FMulRecipe; 9393 } 9394 VPReductionRecipe *RedRecipe = 9395 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9396 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9397 Plan->removeVPValueFor(R); 9398 Plan->addVPValue(R, RedRecipe); 9399 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9400 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9401 WidenRecipe->eraseFromParent(); 9402 9403 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9404 VPRecipeBase *CompareRecipe = 9405 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9406 assert(isa<VPWidenRecipe>(CompareRecipe) && 9407 "Expected to replace a VPWidenSC"); 9408 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9409 "Expected no remaining users"); 9410 CompareRecipe->eraseFromParent(); 9411 } 9412 Chain = R; 9413 } 9414 } 9415 9416 // If tail is folded by masking, introduce selects between the phi 9417 // and the live-out instruction of each reduction, at the end of the latch. 9418 if (CM.foldTailByMasking()) { 9419 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9420 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9421 if (!PhiR || PhiR->isInLoop()) 9422 continue; 9423 Builder.setInsertPoint(LatchVPBB); 9424 VPValue *Cond = 9425 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9426 VPValue *Red = PhiR->getBackedgeValue(); 9427 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9428 } 9429 } 9430 } 9431 9432 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9433 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9434 VPSlotTracker &SlotTracker) const { 9435 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9436 IG->getInsertPos()->printAsOperand(O, false); 9437 O << ", "; 9438 getAddr()->printAsOperand(O, SlotTracker); 9439 VPValue *Mask = getMask(); 9440 if (Mask) { 9441 O << ", "; 9442 Mask->printAsOperand(O, SlotTracker); 9443 } 9444 9445 unsigned OpIdx = 0; 9446 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9447 if (!IG->getMember(i)) 9448 continue; 9449 if (getNumStoreOperands() > 0) { 9450 O << "\n" << Indent << " store "; 9451 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9452 O << " to index " << i; 9453 } else { 9454 O << "\n" << Indent << " "; 9455 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9456 O << " = load from index " << i; 9457 } 9458 ++OpIdx; 9459 } 9460 } 9461 #endif 9462 9463 void VPWidenCallRecipe::execute(VPTransformState &State) { 9464 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9465 *this, State); 9466 } 9467 9468 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9469 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9470 State.ILV->setDebugLocFromInst(&I); 9471 9472 // The condition can be loop invariant but still defined inside the 9473 // loop. This means that we can't just use the original 'cond' value. 9474 // We have to take the 'vectorized' value and pick the first lane. 9475 // Instcombine will make this a no-op. 9476 auto *InvarCond = 9477 InvariantCond ? 
State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9478 9479 for (unsigned Part = 0; Part < State.UF; ++Part) { 9480 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9481 Value *Op0 = State.get(getOperand(1), Part); 9482 Value *Op1 = State.get(getOperand(2), Part); 9483 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9484 State.set(this, Sel, Part); 9485 State.ILV->addMetadata(Sel, &I); 9486 } 9487 } 9488 9489 void VPWidenRecipe::execute(VPTransformState &State) { 9490 auto &I = *cast<Instruction>(getUnderlyingValue()); 9491 auto &Builder = State.Builder; 9492 switch (I.getOpcode()) { 9493 case Instruction::Call: 9494 case Instruction::Br: 9495 case Instruction::PHI: 9496 case Instruction::GetElementPtr: 9497 case Instruction::Select: 9498 llvm_unreachable("This instruction is handled by a different recipe."); 9499 case Instruction::UDiv: 9500 case Instruction::SDiv: 9501 case Instruction::SRem: 9502 case Instruction::URem: 9503 case Instruction::Add: 9504 case Instruction::FAdd: 9505 case Instruction::Sub: 9506 case Instruction::FSub: 9507 case Instruction::FNeg: 9508 case Instruction::Mul: 9509 case Instruction::FMul: 9510 case Instruction::FDiv: 9511 case Instruction::FRem: 9512 case Instruction::Shl: 9513 case Instruction::LShr: 9514 case Instruction::AShr: 9515 case Instruction::And: 9516 case Instruction::Or: 9517 case Instruction::Xor: { 9518 // Just widen unops and binops. 9519 State.ILV->setDebugLocFromInst(&I); 9520 9521 for (unsigned Part = 0; Part < State.UF; ++Part) { 9522 SmallVector<Value *, 2> Ops; 9523 for (VPValue *VPOp : operands()) 9524 Ops.push_back(State.get(VPOp, Part)); 9525 9526 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9527 9528 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9529 VecOp->copyIRFlags(&I); 9530 9531 // If the instruction is vectorized and was in a basic block that needed 9532 // predication, we can't propagate poison-generating flags (nuw/nsw, 9533 // exact, etc.). The control flow has been linearized and the 9534 // instruction is no longer guarded by the predicate, which could make 9535 // the flag properties to no longer hold. 9536 if (State.MayGeneratePoisonRecipes.contains(this)) 9537 VecOp->dropPoisonGeneratingFlags(); 9538 } 9539 9540 // Use this vector value for all users of the original instruction. 9541 State.set(this, V, Part); 9542 State.ILV->addMetadata(V, &I); 9543 } 9544 9545 break; 9546 } 9547 case Instruction::ICmp: 9548 case Instruction::FCmp: { 9549 // Widen compares. Generate vector compares. 9550 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9551 auto *Cmp = cast<CmpInst>(&I); 9552 State.ILV->setDebugLocFromInst(Cmp); 9553 for (unsigned Part = 0; Part < State.UF; ++Part) { 9554 Value *A = State.get(getOperand(0), Part); 9555 Value *B = State.get(getOperand(1), Part); 9556 Value *C = nullptr; 9557 if (FCmp) { 9558 // Propagate fast math flags. 
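        // The guard restores the builder's previous fast-math flags when it
        // goes out of scope, so the compare's flags do not leak into
        // subsequently created instructions.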
9559 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9560 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9561 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9562 } else { 9563 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9564 } 9565 State.set(this, C, Part); 9566 State.ILV->addMetadata(C, &I); 9567 } 9568 9569 break; 9570 } 9571 9572 case Instruction::ZExt: 9573 case Instruction::SExt: 9574 case Instruction::FPToUI: 9575 case Instruction::FPToSI: 9576 case Instruction::FPExt: 9577 case Instruction::PtrToInt: 9578 case Instruction::IntToPtr: 9579 case Instruction::SIToFP: 9580 case Instruction::UIToFP: 9581 case Instruction::Trunc: 9582 case Instruction::FPTrunc: 9583 case Instruction::BitCast: { 9584 auto *CI = cast<CastInst>(&I); 9585 State.ILV->setDebugLocFromInst(CI); 9586 9587 /// Vectorize casts. 9588 Type *DestTy = (State.VF.isScalar()) 9589 ? CI->getType() 9590 : VectorType::get(CI->getType(), State.VF); 9591 9592 for (unsigned Part = 0; Part < State.UF; ++Part) { 9593 Value *A = State.get(getOperand(0), Part); 9594 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9595 State.set(this, Cast, Part); 9596 State.ILV->addMetadata(Cast, &I); 9597 } 9598 break; 9599 } 9600 default: 9601 // This instruction is not vectorized by simple widening. 9602 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9603 llvm_unreachable("Unhandled instruction!"); 9604 } // end of switch. 9605 } 9606 9607 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9608 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9609 // Construct a vector GEP by widening the operands of the scalar GEP as 9610 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9611 // results in a vector of pointers when at least one operand of the GEP 9612 // is vector-typed. Thus, to keep the representation compact, we only use 9613 // vector-typed operands for loop-varying values. 9614 9615 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9616 // If we are vectorizing, but the GEP has only loop-invariant operands, 9617 // the GEP we build (by only using vector-typed operands for 9618 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9619 // produce a vector of pointers, we need to either arbitrarily pick an 9620 // operand to broadcast, or broadcast a clone of the original GEP. 9621 // Here, we broadcast a clone of the original. 9622 // 9623 // TODO: If at some point we decide to scalarize instructions having 9624 // loop-invariant operands, this special case will no longer be 9625 // required. We would add the scalarization decision to 9626 // collectLoopScalars() and teach getVectorValue() to broadcast 9627 // the lane-zero scalar value. 9628 auto *Clone = State.Builder.Insert(GEP->clone()); 9629 for (unsigned Part = 0; Part < State.UF; ++Part) { 9630 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9631 State.set(this, EntryPart, Part); 9632 State.ILV->addMetadata(EntryPart, GEP); 9633 } 9634 } else { 9635 // If the GEP has at least one loop-varying operand, we are sure to 9636 // produce a vector of pointers. But if we are only unrolling, we want 9637 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9638 // produce with the code below will be scalar (if VF == 1) or vector 9639 // (otherwise). Note that for the unroll-only case, we still maintain 9640 // values in the vector mapping with initVector, as we do for other 9641 // instructions. 
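// Illustrative example (not taken from a test): with VF = 4 and a source GEP
// 'getelementptr inbounds float, float* %a, i64 %i' where %a is loop-invariant
// and %i is loop-varying, the loop below keeps %a as a scalar pointer operand,
// feeds it the widened <4 x i64> index, and produces a <4 x float*> per part.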
9642 for (unsigned Part = 0; Part < State.UF; ++Part) { 9643 // The pointer operand of the new GEP. If it's loop-invariant, we 9644 // won't broadcast it. 9645 auto *Ptr = IsPtrLoopInvariant 9646 ? State.get(getOperand(0), VPIteration(0, 0)) 9647 : State.get(getOperand(0), Part); 9648 9649 // Collect all the indices for the new GEP. If any index is 9650 // loop-invariant, we won't broadcast it. 9651 SmallVector<Value *, 4> Indices; 9652 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9653 VPValue *Operand = getOperand(I); 9654 if (IsIndexLoopInvariant[I - 1]) 9655 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9656 else 9657 Indices.push_back(State.get(Operand, Part)); 9658 } 9659 9660 // If the GEP instruction is vectorized and was in a basic block that 9661 // needed predication, we can't propagate the poison-generating 'inbounds' 9662 // flag. The control flow has been linearized and the GEP is no longer 9663 // guarded by the predicate, which could make the 'inbounds' properties to 9664 // no longer hold. 9665 bool IsInBounds = 9666 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9667 9668 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9669 // but it should be a vector, otherwise. 9670 auto *NewGEP = IsInBounds 9671 ? State.Builder.CreateInBoundsGEP( 9672 GEP->getSourceElementType(), Ptr, Indices) 9673 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9674 Ptr, Indices); 9675 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9676 "NewGEP is not a pointer vector"); 9677 State.set(this, NewGEP, Part); 9678 State.ILV->addMetadata(NewGEP, GEP); 9679 } 9680 } 9681 } 9682 9683 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9684 assert(!State.Instance && "Int or FP induction being replicated."); 9685 State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), 9686 getStartValue()->getLiveInIRValue(), 9687 getTruncInst(), getVPValue(0), State); 9688 } 9689 9690 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9691 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9692 State); 9693 } 9694 9695 void VPBlendRecipe::execute(VPTransformState &State) { 9696 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9697 // We know that all PHIs in non-header blocks are converted into 9698 // selects, so we don't have to worry about the insertion order and we 9699 // can just use the builder. 9700 // At this point we generate the predication tree. There may be 9701 // duplications since this is a simple recursive scan, but future 9702 // optimizations will clean it up. 9703 9704 unsigned NumIncoming = getNumIncomingValues(); 9705 9706 // Generate a sequence of selects of the form: 9707 // SELECT(Mask3, In3, 9708 // SELECT(Mask2, In2, 9709 // SELECT(Mask1, In1, 9710 // In0))) 9711 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9712 // are essentially undef are taken from In0. 9713 InnerLoopVectorizer::VectorParts Entry(State.UF); 9714 for (unsigned In = 0; In < NumIncoming; ++In) { 9715 for (unsigned Part = 0; Part < State.UF; ++Part) { 9716 // We might have single edge PHIs (blocks) - use an identity 9717 // 'select' for the first PHI operand. 9718 Value *In0 = State.get(getIncomingValue(In), Part); 9719 if (In == 0) 9720 Entry[Part] = In0; // Initialize with the first incoming value. 9721 else { 9722 // Select between the current value and the previous incoming edge 9723 // based on the incoming mask. 
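// Each later incoming value wraps the chain built so far in a new select
// (the previous chain becomes the false operand), growing the nest sketched
// above outwards from the innermost In0.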
9724 Value *Cond = State.get(getMask(In), Part); 9725 Entry[Part] = 9726 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9727 } 9728 } 9729 } 9730 for (unsigned Part = 0; Part < State.UF; ++Part) 9731 State.set(this, Entry[Part], Part); 9732 } 9733 9734 void VPInterleaveRecipe::execute(VPTransformState &State) { 9735 assert(!State.Instance && "Interleave group being replicated."); 9736 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9737 getStoredValues(), getMask()); 9738 } 9739 9740 void VPReductionRecipe::execute(VPTransformState &State) { 9741 assert(!State.Instance && "Reduction being replicated."); 9742 Value *PrevInChain = State.get(getChainOp(), 0); 9743 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9744 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9745 // Propagate the fast-math flags carried by the underlying instruction. 9746 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9747 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9748 for (unsigned Part = 0; Part < State.UF; ++Part) { 9749 Value *NewVecOp = State.get(getVecOp(), Part); 9750 if (VPValue *Cond = getCondOp()) { 9751 Value *NewCond = State.get(Cond, Part); 9752 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9753 Value *Iden = RdxDesc->getRecurrenceIdentity( 9754 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9755 Value *IdenVec = 9756 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9757 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9758 NewVecOp = Select; 9759 } 9760 Value *NewRed; 9761 Value *NextInChain; 9762 if (IsOrdered) { 9763 if (State.VF.isVector()) 9764 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9765 PrevInChain); 9766 else 9767 NewRed = State.Builder.CreateBinOp( 9768 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9769 NewVecOp); 9770 PrevInChain = NewRed; 9771 } else { 9772 PrevInChain = State.get(getChainOp(), Part); 9773 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9774 } 9775 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9776 NextInChain = 9777 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9778 NewRed, PrevInChain); 9779 } else if (IsOrdered) 9780 NextInChain = NewRed; 9781 else 9782 NextInChain = State.Builder.CreateBinOp( 9783 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9784 PrevInChain); 9785 State.set(this, NextInChain, Part); 9786 } 9787 } 9788 9789 void VPReplicateRecipe::execute(VPTransformState &State) { 9790 if (State.Instance) { // Generate a single instance. 9791 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9792 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9793 IsPredicated, State); 9794 // Insert scalar instance packing it into a vector. 9795 if (AlsoPack && State.VF.isVector()) { 9796 // If we're constructing lane 0, initialize to start from poison. 
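// (AlsoPack means the replicated instruction also has users that expect a
// vector value, so each scalar lane generated here is additionally inserted
// into the per-part vector value.)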
9797 if (State.Instance->Lane.isFirstLane()) {
9798 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9799 Value *Poison = PoisonValue::get(
9800 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9801 State.set(this, Poison, State.Instance->Part);
9802 }
9803 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9804 }
9805 return;
9806 }
9807
9808 // Generate scalar instances for all VF lanes of all UF parts, unless the
9809 // instruction is uniform, in which case generate only the first lane for each
9810 // of the UF parts.
9811 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9812 assert((!State.VF.isScalable() || IsUniform) &&
9813 "Can't scalarize a scalable vector");
9814 for (unsigned Part = 0; Part < State.UF; ++Part)
9815 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9816 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9817 VPIteration(Part, Lane), IsPredicated,
9818 State);
9819 }
9820
9821 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9822 assert(State.Instance && "Branch on Mask works only on single instance.");
9823
9824 unsigned Part = State.Instance->Part;
9825 unsigned Lane = State.Instance->Lane.getKnownLane();
9826
9827 Value *ConditionBit = nullptr;
9828 VPValue *BlockInMask = getMask();
9829 if (BlockInMask) {
9830 ConditionBit = State.get(BlockInMask, Part);
9831 if (ConditionBit->getType()->isVectorTy())
9832 ConditionBit = State.Builder.CreateExtractElement(
9833 ConditionBit, State.Builder.getInt32(Lane));
9834 } else // Block in mask is all-one.
9835 ConditionBit = State.Builder.getTrue();
9836
9837 // Replace the temporary unreachable terminator with a new conditional branch,
9838 // whose two destinations will be set later when they are created.
9839 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9840 assert(isa<UnreachableInst>(CurrentTerminator) &&
9841 "Expected to replace unreachable terminator with conditional branch.");
9842 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9843 CondBr->setSuccessor(0, nullptr);
9844 ReplaceInstWithInst(CurrentTerminator, CondBr);
9845 }
9846
9847 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9848 assert(State.Instance && "Predicated instruction PHI works per instance.");
9849 Instruction *ScalarPredInst =
9850 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9851 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9852 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9853 assert(PredicatingBB && "Predicated block has no single predecessor.");
9854 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9855 "operand must be VPReplicateRecipe");
9856
9857 // By current pack/unpack logic we need to generate only a single phi node: if
9858 // a vector value for the predicated instruction exists at this point it means
9859 // the instruction has vector users only, and a phi for the vector value is
9860 // needed. In this case the recipe of the predicated instruction is marked to
9861 // also do that packing, thereby "hoisting" the insert-element sequence.
9862 // Otherwise, a phi node for the scalar value is needed.
9863 unsigned Part = State.Instance->Part; 9864 if (State.hasVectorValue(getOperand(0), Part)) { 9865 Value *VectorValue = State.get(getOperand(0), Part); 9866 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9867 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9868 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9869 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9870 if (State.hasVectorValue(this, Part)) 9871 State.reset(this, VPhi, Part); 9872 else 9873 State.set(this, VPhi, Part); 9874 // NOTE: Currently we need to update the value of the operand, so the next 9875 // predicated iteration inserts its generated value in the correct vector. 9876 State.reset(getOperand(0), VPhi, Part); 9877 } else { 9878 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9879 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9880 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9881 PredicatingBB); 9882 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9883 if (State.hasScalarValue(this, *State.Instance)) 9884 State.reset(this, Phi, *State.Instance); 9885 else 9886 State.set(this, Phi, *State.Instance); 9887 // NOTE: Currently we need to update the value of the operand, so the next 9888 // predicated iteration inserts its generated value in the correct vector. 9889 State.reset(getOperand(0), Phi, *State.Instance); 9890 } 9891 } 9892 9893 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9894 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9895 9896 // Attempt to issue a wide load. 9897 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9898 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9899 9900 assert((LI || SI) && "Invalid Load/Store instruction"); 9901 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9902 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9903 9904 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9905 9906 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9907 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9908 bool CreateGatherScatter = !Consecutive; 9909 9910 auto &Builder = State.Builder; 9911 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9912 bool isMaskRequired = getMask(); 9913 if (isMaskRequired) 9914 for (unsigned Part = 0; Part < State.UF; ++Part) 9915 BlockInMaskParts[Part] = State.get(getMask(), Part); 9916 9917 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9918 // Calculate the pointer for the specific unroll-part. 9919 GetElementPtrInst *PartPtr = nullptr; 9920 9921 bool InBounds = false; 9922 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9923 InBounds = gep->isInBounds(); 9924 if (Reverse) { 9925 // If the address is consecutive but reversed, then the 9926 // wide store needs to start at the last vector element. 
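// Worked example for a fixed-width VF of 4 and Part == 1 (using the
// definitions just below): RunTimeVF == 4, NumElt == -4 and LastLane == -3,
// so the two GEPs advance the pointer to Ptr - 7 and the wide access covers
// elements Ptr[-7 .. -4], which the callers then reverse element-wise.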
9927 // RunTimeVF = VScale * VF.getKnownMinValue() 9928 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9929 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9930 // NumElt = -Part * RunTimeVF 9931 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9932 // LastLane = 1 - RunTimeVF 9933 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9934 PartPtr = 9935 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9936 PartPtr->setIsInBounds(InBounds); 9937 PartPtr = cast<GetElementPtrInst>( 9938 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9939 PartPtr->setIsInBounds(InBounds); 9940 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 9941 BlockInMaskParts[Part] = 9942 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9943 } else { 9944 Value *Increment = 9945 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9946 PartPtr = cast<GetElementPtrInst>( 9947 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9948 PartPtr->setIsInBounds(InBounds); 9949 } 9950 9951 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9952 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9953 }; 9954 9955 // Handle Stores: 9956 if (SI) { 9957 State.ILV->setDebugLocFromInst(SI); 9958 9959 for (unsigned Part = 0; Part < State.UF; ++Part) { 9960 Instruction *NewSI = nullptr; 9961 Value *StoredVal = State.get(StoredValue, Part); 9962 if (CreateGatherScatter) { 9963 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9964 Value *VectorGep = State.get(getAddr(), Part); 9965 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9966 MaskPart); 9967 } else { 9968 if (Reverse) { 9969 // If we store to reverse consecutive memory locations, then we need 9970 // to reverse the order of elements in the stored value. 9971 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9972 // We don't want to update the value in the map as it might be used in 9973 // another expression. So don't call resetVectorValue(StoredVal). 9974 } 9975 auto *VecPtr = 9976 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9977 if (isMaskRequired) 9978 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9979 BlockInMaskParts[Part]); 9980 else 9981 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9982 } 9983 State.ILV->addMetadata(NewSI, SI); 9984 } 9985 return; 9986 } 9987 9988 // Handle loads. 9989 assert(LI && "Must have a load instruction"); 9990 State.ILV->setDebugLocFromInst(LI); 9991 for (unsigned Part = 0; Part < State.UF; ++Part) { 9992 Value *NewLI; 9993 if (CreateGatherScatter) { 9994 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9995 Value *VectorGep = State.get(getAddr(), Part); 9996 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9997 nullptr, "wide.masked.gather"); 9998 State.ILV->addMetadata(NewLI, LI); 9999 } else { 10000 auto *VecPtr = 10001 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10002 if (isMaskRequired) 10003 NewLI = Builder.CreateMaskedLoad( 10004 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10005 PoisonValue::get(DataTy), "wide.masked.load"); 10006 else 10007 NewLI = 10008 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10009 10010 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
10011 State.ILV->addMetadata(NewLI, LI); 10012 if (Reverse) 10013 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10014 } 10015 10016 State.set(getVPSingleValue(), NewLI, Part); 10017 } 10018 } 10019 10020 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10021 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10022 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10023 // for predication. 10024 static ScalarEpilogueLowering getScalarEpilogueLowering( 10025 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10026 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10027 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10028 LoopVectorizationLegality &LVL) { 10029 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10030 // don't look at hints or options, and don't request a scalar epilogue. 10031 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10032 // LoopAccessInfo (due to code dependency and not being able to reliably get 10033 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10034 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10035 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10036 // back to the old way and vectorize with versioning when forced. See D81345.) 10037 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10038 PGSOQueryType::IRPass) && 10039 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10040 return CM_ScalarEpilogueNotAllowedOptSize; 10041 10042 // 2) If set, obey the directives 10043 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10044 switch (PreferPredicateOverEpilogue) { 10045 case PreferPredicateTy::ScalarEpilogue: 10046 return CM_ScalarEpilogueAllowed; 10047 case PreferPredicateTy::PredicateElseScalarEpilogue: 10048 return CM_ScalarEpilogueNotNeededUsePredicate; 10049 case PreferPredicateTy::PredicateOrDontVectorize: 10050 return CM_ScalarEpilogueNotAllowedUsePredicate; 10051 }; 10052 } 10053 10054 // 3) If set, obey the hints 10055 switch (Hints.getPredicate()) { 10056 case LoopVectorizeHints::FK_Enabled: 10057 return CM_ScalarEpilogueNotNeededUsePredicate; 10058 case LoopVectorizeHints::FK_Disabled: 10059 return CM_ScalarEpilogueAllowed; 10060 }; 10061 10062 // 4) if the TTI hook indicates this is profitable, request predication. 10063 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10064 LVL.getLAI())) 10065 return CM_ScalarEpilogueNotNeededUsePredicate; 10066 10067 return CM_ScalarEpilogueAllowed; 10068 } 10069 10070 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10071 // If Values have been set for this Def return the one relevant for \p Part. 10072 if (hasVectorValue(Def, Part)) 10073 return Data.PerPartOutput[Def][Part]; 10074 10075 if (!hasScalarValue(Def, {Part, 0})) { 10076 Value *IRV = Def->getLiveInIRValue(); 10077 Value *B = ILV->getBroadcastInstrs(IRV); 10078 set(Def, B, Part); 10079 return B; 10080 } 10081 10082 Value *ScalarValue = get(Def, {Part, 0}); 10083 // If we aren't vectorizing, we can just copy the scalar map values over 10084 // to the vector map. 
10085 if (VF.isScalar()) { 10086 set(Def, ScalarValue, Part); 10087 return ScalarValue; 10088 } 10089 10090 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10091 bool IsUniform = RepR && RepR->isUniform(); 10092 10093 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10094 // Check if there is a scalar value for the selected lane. 10095 if (!hasScalarValue(Def, {Part, LastLane})) { 10096 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10097 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10098 "unexpected recipe found to be invariant"); 10099 IsUniform = true; 10100 LastLane = 0; 10101 } 10102 10103 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10104 // Set the insert point after the last scalarized instruction or after the 10105 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10106 // will directly follow the scalar definitions. 10107 auto OldIP = Builder.saveIP(); 10108 auto NewIP = 10109 isa<PHINode>(LastInst) 10110 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10111 : std::next(BasicBlock::iterator(LastInst)); 10112 Builder.SetInsertPoint(&*NewIP); 10113 10114 // However, if we are vectorizing, we need to construct the vector values. 10115 // If the value is known to be uniform after vectorization, we can just 10116 // broadcast the scalar value corresponding to lane zero for each unroll 10117 // iteration. Otherwise, we construct the vector values using 10118 // insertelement instructions. Since the resulting vectors are stored in 10119 // State, we will only generate the insertelements once. 10120 Value *VectorValue = nullptr; 10121 if (IsUniform) { 10122 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10123 set(Def, VectorValue, Part); 10124 } else { 10125 // Initialize packing with insertelements to start from undef. 10126 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10127 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10128 set(Def, Undef, Part); 10129 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10130 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10131 VectorValue = get(Def, Part); 10132 } 10133 Builder.restoreIP(OldIP); 10134 return VectorValue; 10135 } 10136 10137 // Process the loop in the VPlan-native vectorization path. This path builds 10138 // VPlan upfront in the vectorization pipeline, which allows to apply 10139 // VPlan-to-VPlan transformations from the very beginning without modifying the 10140 // input LLVM IR. 
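// Returns true if the outer loop was handled on this path; the caller then
// treats the loop as done and does not fall through to the inner-loop
// vectorization logic in processLoop.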
10141 static bool processLoopInVPlanNativePath( 10142 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10143 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10144 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10145 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10146 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10147 LoopVectorizationRequirements &Requirements) { 10148 10149 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10150 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10151 return false; 10152 } 10153 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10154 Function *F = L->getHeader()->getParent(); 10155 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10156 10157 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10158 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10159 10160 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10161 &Hints, IAI); 10162 // Use the planner for outer loop vectorization. 10163 // TODO: CM is not used at this point inside the planner. Turn CM into an 10164 // optional argument if we don't need it in the future. 10165 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10166 Requirements, ORE); 10167 10168 // Get user vectorization factor. 10169 ElementCount UserVF = Hints.getWidth(); 10170 10171 CM.collectElementTypesForWidening(); 10172 10173 // Plan how to best vectorize, return the best VF and its cost. 10174 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10175 10176 // If we are stress testing VPlan builds, do not attempt to generate vector 10177 // code. Masked vector code generation support will follow soon. 10178 // Also, do not attempt to vectorize if no vector code will be produced. 10179 if (VPlanBuildStressTest || EnableVPlanPredication || 10180 VectorizationFactor::Disabled() == VF) 10181 return false; 10182 10183 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10184 10185 { 10186 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10187 F->getParent()->getDataLayout()); 10188 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10189 &CM, BFI, PSI, Checks); 10190 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10191 << L->getHeader()->getParent()->getName() << "\"\n"); 10192 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10193 } 10194 10195 // Mark the loop as already vectorized to avoid vectorizing again. 10196 Hints.setAlreadyVectorized(); 10197 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10198 return true; 10199 } 10200 10201 // Emit a remark if there are stores to floats that required a floating point 10202 // extension. If the vectorized loop was generated with floating point there 10203 // will be a performance penalty from the conversion overhead and the change in 10204 // the vector width. 10205 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10206 SmallVector<Instruction *, 4> Worklist; 10207 for (BasicBlock *BB : L->getBlocks()) { 10208 for (Instruction &Inst : *BB) { 10209 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10210 if (S->getValueOperand()->getType()->isFloatTy()) 10211 Worklist.push_back(S); 10212 } 10213 } 10214 } 10215 10216 // Traverse the floating point stores upwards searching, for floating point 10217 // conversions. 
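// Visited avoids re-walking operands shared by several stores, while
// EmittedRemark ensures at most one remark is emitted per offending fpext.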
10218 SmallPtrSet<const Instruction *, 4> Visited; 10219 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10220 while (!Worklist.empty()) { 10221 auto *I = Worklist.pop_back_val(); 10222 if (!L->contains(I)) 10223 continue; 10224 if (!Visited.insert(I).second) 10225 continue; 10226 10227 // Emit a remark if the floating point store required a floating 10228 // point conversion. 10229 // TODO: More work could be done to identify the root cause such as a 10230 // constant or a function return type and point the user to it. 10231 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10232 ORE->emit([&]() { 10233 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10234 I->getDebugLoc(), L->getHeader()) 10235 << "floating point conversion changes vector width. " 10236 << "Mixed floating point precision requires an up/down " 10237 << "cast that will negatively impact performance."; 10238 }); 10239 10240 for (Use &Op : I->operands()) 10241 if (auto *OpI = dyn_cast<Instruction>(Op)) 10242 Worklist.push_back(OpI); 10243 } 10244 } 10245 10246 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10247 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10248 !EnableLoopInterleaving), 10249 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10250 !EnableLoopVectorization) {} 10251 10252 bool LoopVectorizePass::processLoop(Loop *L) { 10253 assert((EnableVPlanNativePath || L->isInnermost()) && 10254 "VPlan-native path is not enabled. Only process inner loops."); 10255 10256 #ifndef NDEBUG 10257 const std::string DebugLocStr = getDebugLocString(L); 10258 #endif /* NDEBUG */ 10259 10260 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10261 << L->getHeader()->getParent()->getName() << "\" from " 10262 << DebugLocStr << "\n"); 10263 10264 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10265 10266 LLVM_DEBUG( 10267 dbgs() << "LV: Loop hints:" 10268 << " force=" 10269 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10270 ? "disabled" 10271 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10272 ? "enabled" 10273 : "?")) 10274 << " width=" << Hints.getWidth() 10275 << " interleave=" << Hints.getInterleave() << "\n"); 10276 10277 // Function containing loop 10278 Function *F = L->getHeader()->getParent(); 10279 10280 // Looking at the diagnostic output is the only way to determine if a loop 10281 // was vectorized (other than looking at the IR or machine code), so it 10282 // is important to generate an optimization remark for each loop. Most of 10283 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10284 // generated as OptimizationRemark and OptimizationRemarkMissed are 10285 // less verbose reporting vectorized loops and unvectorized loops that may 10286 // benefit from vectorization, respectively. 10287 10288 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10289 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10290 return false; 10291 } 10292 10293 PredicatedScalarEvolution PSE(*SE, *L); 10294 10295 // Check if it is legal to vectorize the loop. 
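// Legality analysis may emit analysis remarks explaining why the loop cannot
// be vectorized; Requirements records constraints (e.g. the first instruction
// that needs exact FP math) that are only checked once a decision is made.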
10296 LoopVectorizationRequirements Requirements; 10297 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10298 &Requirements, &Hints, DB, AC, BFI, PSI); 10299 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10300 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10301 Hints.emitRemarkWithHints(); 10302 return false; 10303 } 10304 10305 // Check the function attributes and profiles to find out if this function 10306 // should be optimized for size. 10307 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10308 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10309 10310 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10311 // here. They may require CFG and instruction level transformations before 10312 // even evaluating whether vectorization is profitable. Since we cannot modify 10313 // the incoming IR, we need to build VPlan upfront in the vectorization 10314 // pipeline. 10315 if (!L->isInnermost()) 10316 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10317 ORE, BFI, PSI, Hints, Requirements); 10318 10319 assert(L->isInnermost() && "Inner loop expected."); 10320 10321 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10322 // count by optimizing for size, to minimize overheads. 10323 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10324 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10325 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10326 << "This loop is worth vectorizing only if no scalar " 10327 << "iteration overheads are incurred."); 10328 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10329 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10330 else { 10331 LLVM_DEBUG(dbgs() << "\n"); 10332 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10333 } 10334 } 10335 10336 // Check the function attributes to see if implicit floats are allowed. 10337 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10338 // an integer loop and the vector instructions selected are purely integer 10339 // vector instructions? 10340 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10341 reportVectorizationFailure( 10342 "Can't vectorize when the NoImplicitFloat attribute is used", 10343 "loop not vectorized due to NoImplicitFloat attribute", 10344 "NoImplicitFloat", ORE, L); 10345 Hints.emitRemarkWithHints(); 10346 return false; 10347 } 10348 10349 // Check if the target supports potentially unsafe FP vectorization. 10350 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10351 // for the target we're vectorizing for, to make sure none of the 10352 // additional fp-math flags can help. 10353 if (Hints.isPotentiallyUnsafe() && 10354 TTI->isFPVectorizationPotentiallyUnsafe()) { 10355 reportVectorizationFailure( 10356 "Potentially unsafe FP op prevents vectorization", 10357 "loop not vectorized due to unsafe FP support.", 10358 "UnsafeFP", ORE, L); 10359 Hints.emitRemarkWithHints(); 10360 return false; 10361 } 10362 10363 bool AllowOrderedReductions; 10364 // If the flag is set, use that instead and override the TTI behaviour. 
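// Ordered (in-loop, strictly sequenced) reductions preserve the original
// floating-point evaluation order, so allowing them lets loops whose FP
// reductions must not be reassociated still be vectorized; otherwise such
// loops are rejected just below.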
10365 if (ForceOrderedReductions.getNumOccurrences() > 0) 10366 AllowOrderedReductions = ForceOrderedReductions; 10367 else 10368 AllowOrderedReductions = TTI->enableOrderedReductions(); 10369 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10370 ORE->emit([&]() { 10371 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10372 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10373 ExactFPMathInst->getDebugLoc(), 10374 ExactFPMathInst->getParent()) 10375 << "loop not vectorized: cannot prove it is safe to reorder " 10376 "floating-point operations"; 10377 }); 10378 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10379 "reorder floating-point operations\n"); 10380 Hints.emitRemarkWithHints(); 10381 return false; 10382 } 10383 10384 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10385 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10386 10387 // If an override option has been passed in for interleaved accesses, use it. 10388 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10389 UseInterleaved = EnableInterleavedMemAccesses; 10390 10391 // Analyze interleaved memory accesses. 10392 if (UseInterleaved) { 10393 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10394 } 10395 10396 // Use the cost model. 10397 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10398 F, &Hints, IAI); 10399 CM.collectValuesToIgnore(); 10400 CM.collectElementTypesForWidening(); 10401 10402 // Use the planner for vectorization. 10403 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10404 Requirements, ORE); 10405 10406 // Get user vectorization factor and interleave count. 10407 ElementCount UserVF = Hints.getWidth(); 10408 unsigned UserIC = Hints.getInterleave(); 10409 10410 // Plan how to best vectorize, return the best VF and its cost. 10411 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10412 10413 VectorizationFactor VF = VectorizationFactor::Disabled(); 10414 unsigned IC = 1; 10415 10416 if (MaybeVF) { 10417 VF = *MaybeVF; 10418 // Select the interleave count. 10419 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 10420 } 10421 10422 // Identify the diagnostic messages that should be produced. 10423 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10424 bool VectorizeLoop = true, InterleaveLoop = true; 10425 if (VF.Width.isScalar()) { 10426 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10427 VecDiagMsg = std::make_pair( 10428 "VectorizationNotBeneficial", 10429 "the cost-model indicates that vectorization is not beneficial"); 10430 VectorizeLoop = false; 10431 } 10432 10433 if (!MaybeVF && UserIC > 1) { 10434 // Tell the user interleaving was avoided up-front, despite being explicitly 10435 // requested. 10436 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10437 "interleaving should be avoided up front\n"); 10438 IntDiagMsg = std::make_pair( 10439 "InterleavingAvoided", 10440 "Ignoring UserIC, because interleaving was avoided up front"); 10441 InterleaveLoop = false; 10442 } else if (IC == 1 && UserIC <= 1) { 10443 // Tell the user interleaving is not beneficial. 
10444 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10445 IntDiagMsg = std::make_pair(
10446 "InterleavingNotBeneficial",
10447 "the cost-model indicates that interleaving is not beneficial");
10448 InterleaveLoop = false;
10449 if (UserIC == 1) {
10450 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10451 IntDiagMsg.second +=
10452 " and is explicitly disabled or interleave count is set to 1";
10453 }
10454 } else if (IC > 1 && UserIC == 1) {
10455 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10456 LLVM_DEBUG(
10457 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10458 IntDiagMsg = std::make_pair(
10459 "InterleavingBeneficialButDisabled",
10460 "the cost-model indicates that interleaving is beneficial "
10461 "but is explicitly disabled or interleave count is set to 1");
10462 InterleaveLoop = false;
10463 }
10464
10465 // Override IC if user provided an interleave count.
10466 IC = UserIC > 0 ? UserIC : IC;
10467
10468 // Emit diagnostic messages, if any.
10469 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10470 if (!VectorizeLoop && !InterleaveLoop) {
10471 // Do not vectorize or interleave the loop.
10472 ORE->emit([&]() {
10473 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10474 L->getStartLoc(), L->getHeader())
10475 << VecDiagMsg.second;
10476 });
10477 ORE->emit([&]() {
10478 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10479 L->getStartLoc(), L->getHeader())
10480 << IntDiagMsg.second;
10481 });
10482 return false;
10483 } else if (!VectorizeLoop && InterleaveLoop) {
10484 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10485 ORE->emit([&]() {
10486 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10487 L->getStartLoc(), L->getHeader())
10488 << VecDiagMsg.second;
10489 });
10490 } else if (VectorizeLoop && !InterleaveLoop) {
10491 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10492 << ") in " << DebugLocStr << '\n');
10493 ORE->emit([&]() {
10494 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10495 L->getStartLoc(), L->getHeader())
10496 << IntDiagMsg.second;
10497 });
10498 } else if (VectorizeLoop && InterleaveLoop) {
10499 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10500 << ") in " << DebugLocStr << '\n');
10501 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10502 }
10503
10504 bool DisableRuntimeUnroll = false;
10505 MDNode *OrigLoopID = L->getLoopID();
10506 {
10507 // Optimistically generate runtime checks. Drop them if they turn out to not
10508 // be profitable. Limit the scope of Checks, so the cleanup happens
10509 // immediately after vector code generation is done.
10510 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10511 F->getParent()->getDataLayout());
10512 if (!VF.Width.isScalar() || IC > 1)
10513 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10514
10515 using namespace ore;
10516 if (!VectorizeLoop) {
10517 assert(IC > 1 && "interleave count should not be 1 or 0");
10518 // If we decided that it is not legal to vectorize the loop, then
10519 // interleave it.
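// InnerLoopUnroller is an InnerLoopVectorizer fixed to VF = 1, so executing
// the best plan with it only interleaves (unrolls) the scalar loop IC times.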
10520 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10521 &CM, BFI, PSI, Checks); 10522 10523 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10524 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10525 10526 ORE->emit([&]() { 10527 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10528 L->getHeader()) 10529 << "interleaved loop (interleaved count: " 10530 << NV("InterleaveCount", IC) << ")"; 10531 }); 10532 } else { 10533 // If we decided that it is *legal* to vectorize the loop, then do it. 10534 10535 // Consider vectorizing the epilogue too if it's profitable. 10536 VectorizationFactor EpilogueVF = 10537 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10538 if (EpilogueVF.Width.isVector()) { 10539 10540 // The first pass vectorizes the main loop and creates a scalar epilogue 10541 // to be vectorized by executing the plan (potentially with a different 10542 // factor) again shortly afterwards. 10543 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10544 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10545 EPI, &LVL, &CM, BFI, PSI, Checks); 10546 10547 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10548 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10549 DT); 10550 ++LoopsVectorized; 10551 10552 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10553 formLCSSARecursively(*L, *DT, LI, SE); 10554 10555 // Second pass vectorizes the epilogue and adjusts the control flow 10556 // edges from the first pass. 10557 EPI.MainLoopVF = EPI.EpilogueVF; 10558 EPI.MainLoopUF = EPI.EpilogueUF; 10559 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10560 ORE, EPI, &LVL, &CM, BFI, PSI, 10561 Checks); 10562 10563 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10564 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10565 DT); 10566 ++LoopsEpilogueVectorized; 10567 10568 if (!MainILV.areSafetyChecksAdded()) 10569 DisableRuntimeUnroll = true; 10570 } else { 10571 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10572 &LVL, &CM, BFI, PSI, Checks); 10573 10574 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10575 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10576 ++LoopsVectorized; 10577 10578 // Add metadata to disable runtime unrolling a scalar loop when there 10579 // are no runtime checks about strides and memory. A scalar loop that is 10580 // rarely used is not worth unrolling. 10581 if (!LB.areSafetyChecksAdded()) 10582 DisableRuntimeUnroll = true; 10583 } 10584 // Report the vectorization decision. 10585 ORE->emit([&]() { 10586 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10587 L->getHeader()) 10588 << "vectorized loop (vectorization width: " 10589 << NV("VectorizationFactor", VF.Width) 10590 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10591 }); 10592 } 10593 10594 if (ORE->allowExtraAnalysis(LV_NAME)) 10595 checkMixedPrecision(L, ORE); 10596 } 10597 10598 Optional<MDNode *> RemainderLoopID = 10599 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10600 LLVMLoopVectorizeFollowupEpilogue}); 10601 if (RemainderLoopID.hasValue()) { 10602 L->setLoopID(RemainderLoopID.getValue()); 10603 } else { 10604 if (DisableRuntimeUnroll) 10605 AddRuntimeUnrollDisableMetaData(L); 10606 10607 // Mark the loop as already vectorized to avoid vectorizing again. 
10608 Hints.setAlreadyVectorized(); 10609 } 10610 10611 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10612 return true; 10613 } 10614 10615 LoopVectorizeResult LoopVectorizePass::runImpl( 10616 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10617 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10618 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10619 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10620 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10621 SE = &SE_; 10622 LI = &LI_; 10623 TTI = &TTI_; 10624 DT = &DT_; 10625 BFI = &BFI_; 10626 TLI = TLI_; 10627 AA = &AA_; 10628 AC = &AC_; 10629 GetLAA = &GetLAA_; 10630 DB = &DB_; 10631 ORE = &ORE_; 10632 PSI = PSI_; 10633 10634 // Don't attempt if 10635 // 1. the target claims to have no vector registers, and 10636 // 2. interleaving won't help ILP. 10637 // 10638 // The second condition is necessary because, even if the target has no 10639 // vector registers, loop vectorization may still enable scalar 10640 // interleaving. 10641 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10642 TTI->getMaxInterleaveFactor(1) < 2) 10643 return LoopVectorizeResult(false, false); 10644 10645 bool Changed = false, CFGChanged = false; 10646 10647 // The vectorizer requires loops to be in simplified form. 10648 // Since simplification may add new inner loops, it has to run before the 10649 // legality and profitability checks. This means running the loop vectorizer 10650 // will simplify all loops, regardless of whether anything end up being 10651 // vectorized. 10652 for (auto &L : *LI) 10653 Changed |= CFGChanged |= 10654 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10655 10656 // Build up a worklist of inner-loops to vectorize. This is necessary as 10657 // the act of vectorizing or partially unrolling a loop creates new loops 10658 // and can invalidate iterators across the loops. 10659 SmallVector<Loop *, 8> Worklist; 10660 10661 for (Loop *L : *LI) 10662 collectSupportedLoops(*L, LI, ORE, Worklist); 10663 10664 LoopsAnalyzed += Worklist.size(); 10665 10666 // Now walk the identified inner loops. 10667 while (!Worklist.empty()) { 10668 Loop *L = Worklist.pop_back_val(); 10669 10670 // For the inner loops we actually process, form LCSSA to simplify the 10671 // transform. 10672 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10673 10674 Changed |= CFGChanged |= processLoop(L); 10675 } 10676 10677 // Process each loop nest in the function. 
10678 return LoopVectorizeResult(Changed, CFGChanged);
10679 }
10680
10681 PreservedAnalyses LoopVectorizePass::run(Function &F,
10682 FunctionAnalysisManager &AM) {
10683 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10684 auto &LI = AM.getResult<LoopAnalysis>(F);
10685 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10686 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10687 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10688 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10689 auto &AA = AM.getResult<AAManager>(F);
10690 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10691 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10692 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10693
10694 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10695 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10696 [&](Loop &L) -> const LoopAccessInfo & {
10697 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10698 TLI, TTI, nullptr, nullptr, nullptr};
10699 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10700 };
10701 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10702 ProfileSummaryInfo *PSI =
10703 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10704 LoopVectorizeResult Result =
10705 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10706 if (!Result.MadeAnyChange)
10707 return PreservedAnalyses::all();
10708 PreservedAnalyses PA;
10709
10710 // We currently do not preserve loopinfo/dominator analyses with outer loop
10711 // vectorization. Until this is addressed, mark these analyses as preserved
10712 // only for non-VPlan-native path.
10713 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10714 if (!EnableVPlanNativePath) {
10715 PA.preserve<LoopAnalysis>();
10716 PA.preserve<DominatorTreeAnalysis>();
10717 }
10718
10719 if (Result.MadeCFGChange) {
10720 // Making CFG changes likely means a loop got vectorized. Indicate that
10721 // extra simplification passes should be run.
10722 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10723 // be run if runtime checks have been added.
10724 AM.getResult<ShouldRunExtraVectorPasses>(F);
10725 PA.preserve<ShouldRunExtraVectorPasses>();
10726 } else {
10727 PA.preserveSet<CFGAnalyses>();
10728 }
10729 return PA;
10730 }
10731
10732 void LoopVectorizePass::printPipeline(
10733 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10734 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10735 OS, MapClassName2PassName);
10736
10737 OS << "<";
10738 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10739 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10740 OS << ">";
10741 }
10742
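// Example invocation (illustrative; the exact textual-pipeline syntax is
// assumed to round-trip through what printPipeline emits above):
//   opt -passes='loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>' -S in.ll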