1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
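//
// Illustrative sketch of the 'wide' iteration transformation described above,
// assuming a vectorization factor of 4 and eliding the remainder handling:
//
//   for (int i = 0; i < n; ++i)          // scalar loop
//     a[i] = b[i] + c[i];
//
// becomes, conceptually,
//
//   for (int i = 0; i + 4 <= n; i += 4)  // wide loop, index advances by VF
//     a[i..i+3] = b[i..i+3] + c[i..i+3]; // one SIMD add per wide iteration
//
// with any leftover iterations executed by a scalar epilogue loop (or folded
// into the vector body under predication).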
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/iterator_range.h" 79 #include "llvm/Analysis/AssumptionCache.h" 80 #include "llvm/Analysis/BasicAliasAnalysis.h" 81 #include "llvm/Analysis/BlockFrequencyInfo.h" 82 #include "llvm/Analysis/CFG.h" 83 #include "llvm/Analysis/CodeMetrics.h" 84 #include "llvm/Analysis/DemandedBits.h" 85 #include "llvm/Analysis/GlobalsModRef.h" 86 #include "llvm/Analysis/LoopAccessAnalysis.h" 87 #include "llvm/Analysis/LoopAnalysisManager.h" 88 #include "llvm/Analysis/LoopInfo.h" 89 #include "llvm/Analysis/LoopIterator.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include 
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
If tail-folding fails, there are
210 // different fallback strategies depending on these values:
211 namespace PreferPredicateTy {
212 enum Option {
213 ScalarEpilogue = 0,
214 PredicateElseScalarEpilogue,
215 PredicateOrDontVectorize
216 };
217 } // namespace PreferPredicateTy
218
219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220 "prefer-predicate-over-epilogue",
221 cl::init(PreferPredicateTy::ScalarEpilogue),
222 cl::Hidden,
223 cl::desc("Tail-folding and predication preferences over creating a scalar "
224 "epilogue loop."),
225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226 "scalar-epilogue",
227 "Don't tail-predicate loops, create scalar epilogue"),
228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229 "predicate-else-scalar-epilogue",
230 "prefer tail-folding, create scalar epilogue if tail "
231 "folding fails."),
232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233 "predicate-dont-vectorize",
234 "prefer tail-folding, don't attempt vectorization if "
235 "tail-folding fails.")));
236
237 static cl::opt<bool> MaximizeBandwidth(
238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239 cl::desc("Maximize bandwidth when selecting vectorization factor which "
240 "will be determined by the smallest type in the loop."));
241
242 static cl::opt<bool> EnableInterleavedMemAccesses(
243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245
246 /// An interleave-group may need masking if it resides in a block that needs
247 /// predication, or in order to mask away gaps.
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251
252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254 cl::desc("We don't interleave loops with an estimated constant trip count "
255 "below this number"));
256
257 static cl::opt<unsigned> ForceTargetNumScalarRegs(
258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259 cl::desc("A flag that overrides the target's number of scalar registers."));
260
261 static cl::opt<unsigned> ForceTargetNumVectorRegs(
262 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263 cl::desc("A flag that overrides the target's number of vector registers."));
264
265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267 cl::desc("A flag that overrides the target's max interleave factor for "
268 "scalar loops."));
269
270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272 cl::desc("A flag that overrides the target's max interleave factor for "
273 "vectorized loops."));
274
275 static cl::opt<unsigned> ForceTargetInstructionCost(
276 "force-target-instruction-cost", cl::init(0), cl::Hidden,
277 cl::desc("A flag that overrides the target's expected cost for "
278 "an instruction to a single constant value. Mostly "
279 "useful for getting consistent testing."));
280
281 static cl::opt<bool> ForceTargetSupportsScalableVectors(
282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283 cl::desc(
284 "Pretend that scalable vectors are supported, even if the target does "
285 "not support them. This flag should only be used for testing."));
286
287 static cl::opt<unsigned> SmallLoopCost(
288 "small-loop-cost", cl::init(20), cl::Hidden,
289 cl::desc(
290 "The cost of a loop that is considered 'small' by the interleaver."));
291
292 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294 cl::desc("Enable the use of the block frequency analysis to access PGO "
295 "heuristics minimizing code growth in cold regions and being more "
296 "aggressive in hot regions."));
297
298 // Runtime interleave loops for load/store throughput.
299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301 cl::desc(
302 "Enable runtime interleaving until load/store ports are saturated"));
303
304 /// Interleave small loops with scalar reductions.
305 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307 cl::desc("Enable interleaving for loops with small iteration counts that "
308 "contain scalar reductions to expose ILP."));
309
310 /// The number of stores in a loop that are allowed to need predication.
311 static cl::opt<unsigned> NumberOfStoresToPredicate(
312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313 cl::desc("Max number of stores to be predicated behind an if."));
314
315 static cl::opt<bool> EnableIndVarRegisterHeur(
316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317 cl::desc("Count the induction variable only once when interleaving"));
318
319 static cl::opt<bool> EnableCondStoresVectorization(
320 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
321 cl::desc("Enable if predication of stores during vectorization."));
322
323 static cl::opt<unsigned> MaxNestedScalarReductionIC(
324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325 cl::desc("The maximum interleave count to use when interleaving a scalar "
326 "reduction in a nested loop."));
327
328 static cl::opt<bool>
329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330 cl::Hidden,
331 cl::desc("Prefer in-loop vector reductions, "
332 "overriding the target's preference."));
333
334 static cl::opt<bool> ForceOrderedReductions(
335 "force-ordered-reductions", cl::init(false), cl::Hidden,
336 cl::desc("Enable the vectorization of loops with in-order (strict) "
337 "FP reductions"));
338
339 static cl::opt<bool> PreferPredicatedReductionSelect(
340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341 cl::desc(
342 "Prefer predicating a reduction operation over an after loop select."));
343
344 cl::opt<bool> EnableVPlanNativePath(
345 "enable-vplan-native-path", cl::init(false), cl::Hidden,
346 cl::desc("Enable VPlan-native vectorization path with "
347 "support for outer loop vectorization."));
348
349 // FIXME: Remove this switch once we have divergence analysis. Currently we
350 // assume divergent non-backedge branches when this switch is true.
351 cl::opt<bool> EnableVPlanPredication(
352 "enable-vplan-predication", cl::init(false), cl::Hidden,
353 cl::desc("Enable VPlan-native vectorization path predicator with "
354 "support for outer loop vectorization."));
355
356 // This flag enables the stress testing of the VPlan H-CFG construction in the
357 // VPlan-native vectorization path. It must be used in conjunction with
358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359 // verification of the H-CFGs built.
360 static cl::opt<bool> VPlanBuildStressTest(
361 "vplan-build-stress-test", cl::init(false), cl::Hidden,
362 cl::desc(
363 "Build VPlan for every supported loop nest in the function and bail "
364 "out right after the build (stress test the VPlan H-CFG construction "
365 "in the VPlan-native vectorization path)."));
366
367 cl::opt<bool> llvm::EnableLoopInterleaving(
368 "interleave-loops", cl::init(true), cl::Hidden,
369 cl::desc("Enable loop interleaving in Loop vectorization passes"));
370 cl::opt<bool> llvm::EnableLoopVectorization(
371 "vectorize-loops", cl::init(true), cl::Hidden,
372 cl::desc("Run the Loop vectorization passes"));
373
374 cl::opt<bool> PrintVPlansInDotFormat(
375 "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376 cl::desc("Use dot format instead of plain text when dumping VPlans"));
377
378 /// A helper function that returns true if the given type is irregular. The
379 /// type is irregular if its allocated size doesn't equal the store size of an
380 /// element of the corresponding vector type.
381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382 // Determine if an array of N elements of type Ty is "bitcast compatible"
383 // with a <N x Ty> vector.
384 // This is only true if there is no padding between the array elements.
385 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386 }
387
388 /// A helper function that returns the reciprocal of the block probability of
389 /// predicated blocks. If we return X, we are assuming the predicated block
390 /// will execute once for every X iterations of the loop header.
391 ///
392 /// TODO: We should use actual block probability here, if available. Currently,
393 /// we always assume predicated blocks have a 50% chance of executing.
394 static unsigned getReciprocalPredBlockProb() { return 2; }
395
396 /// A helper function that returns an integer or floating-point constant with
397 /// value C.
398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400 : ConstantFP::get(Ty, C);
401 }
402
403 /// Returns "best known" trip count for the specified loop \p L as defined by
404 /// the following procedure:
405 /// 1) Returns exact trip count if it is known.
406 /// 2) Returns expected trip count according to profile data if any.
407 /// 3) Returns upper bound estimate if it is known.
408 /// 4) Returns None if all of the above failed.
409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410 // Check if exact trip count is known.
411 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412 return ExpectedTC;
413
414 // Check if there is an expected trip count available from profile data.
415 if (LoopVectorizeWithBlockFrequency)
416 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417 return EstimatedTC;
418
419 // Check if upper bound estimate is known.
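// Note that this is SCEV's constant *maximum* trip count, a conservative
// upper bound that may over-estimate the actual trip count.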
420 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421 return ExpectedTC;
422
423 return None;
424 }
425
426 // Forward declare GeneratedRTChecks.
427 class GeneratedRTChecks;
428
429 namespace llvm {
430
431 AnalysisKey ShouldRunExtraVectorPasses::Key;
432
433 /// InnerLoopVectorizer vectorizes loops which contain only one basic
434 /// block to a specified vectorization factor (VF).
435 /// This class performs the widening of scalars into vectors, or multiple
436 /// scalars. This class also implements the following features:
437 /// * It inserts an epilogue loop for handling loops that don't have iteration
438 /// counts that are known to be a multiple of the vectorization factor.
439 /// * It handles the code generation for reduction variables.
440 /// * Scalarization (implementation using scalars) of un-vectorizable
441 /// instructions.
442 /// InnerLoopVectorizer does not perform any vectorization-legality
443 /// checks, and relies on the caller to check for the different legality
444 /// aspects. The InnerLoopVectorizer relies on the
445 /// LoopVectorizationLegality class to provide information about the induction
446 /// and reduction variables that were found for a given vectorization factor.
447 class InnerLoopVectorizer {
448 public:
449 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
450 LoopInfo *LI, DominatorTree *DT,
451 const TargetLibraryInfo *TLI,
452 const TargetTransformInfo *TTI, AssumptionCache *AC,
453 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
454 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
455 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
456 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
457 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
458 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
459 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
460 PSI(PSI), RTChecks(RTChecks) {
461 // Query this against the original loop and save it here because the profile
462 // of the original loop header may change as the transformation happens.
463 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
464 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465 }
466
467 virtual ~InnerLoopVectorizer() = default;
468
469 /// Create a new empty loop that will contain vectorized instructions later
470 /// on, while the old loop will be used as the scalar remainder. Control flow
471 /// is generated around the vectorized (and scalar epilogue) loops consisting
472 /// of various checks and bypasses. Return the pre-header block of the new
473 /// loop and the start value for the canonical induction, if it is != 0. The
474 /// latter is the case when vectorizing the epilogue loop. In the case of
475 /// epilogue vectorization, this function is overridden to handle the more
476 /// complex control flow around the loops.
477 virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
478
479 /// Widen a single call instruction within the innermost loop.
480 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
481 VPTransformState &State);
482
483 /// Fix the vectorized code, taking care of header phis, live-outs, and more.
484 void fixVectorizedLoop(VPTransformState &State);
485
486 // Return true if any runtime check is added.
487 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
488
489 /// A type for vectorized values in the new loop.
Each value from the 490 /// original loop, when vectorized, is represented by UF vector values in the 491 /// new unrolled loop, where UF is the unroll factor. 492 using VectorParts = SmallVector<Value *, 2>; 493 494 /// Vectorize a single first-order recurrence or pointer induction PHINode in 495 /// a block. This method handles the induction variable canonicalization. It 496 /// supports both VF = 1 for unrolled loops and arbitrary length vectors. 497 void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, 498 VPTransformState &State); 499 500 /// A helper function to scalarize a single Instruction in the innermost loop. 501 /// Generates a sequence of scalar instances for each lane between \p MinLane 502 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 503 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 504 /// Instr's operands. 505 void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, 506 const VPIteration &Instance, bool IfPredicateInstr, 507 VPTransformState &State); 508 509 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 510 /// is provided, the integer induction variable will first be truncated to 511 /// the corresponding type. \p CanonicalIV is the scalar value generated for 512 /// the canonical induction variable. 513 void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, 514 VPTransformState &State, Value *CanonicalIV); 515 516 /// Construct the vector value of a scalarized value \p V one lane at a time. 517 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 518 VPTransformState &State); 519 520 /// Try to vectorize interleaved access group \p Group with the base address 521 /// given in \p Addr, optionally masking the vector operations if \p 522 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 523 /// values in the vectorized loop. 524 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 525 ArrayRef<VPValue *> VPDefs, 526 VPTransformState &State, VPValue *Addr, 527 ArrayRef<VPValue *> StoredValues, 528 VPValue *BlockInMask = nullptr); 529 530 /// Set the debug location in the builder \p Ptr using the debug location in 531 /// \p V. If \p Ptr is None then it uses the class member's Builder. 532 void setDebugLocFromInst(const Value *V, 533 Optional<IRBuilder<> *> CustomBuilder = None); 534 535 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 536 void fixNonInductionPHIs(VPTransformState &State); 537 538 /// Returns true if the reordering of FP operations is not allowed, but we are 539 /// able to vectorize with strict in-order reductions for the given RdxDesc. 540 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); 541 542 /// Create a broadcast instruction. This method generates a broadcast 543 /// instruction (shuffle) for loop invariant values and for the induction 544 /// value. If this is the induction variable then we extend it to N, N+1, ... 545 /// this is needed because each iteration in the loop corresponds to a SIMD 546 /// element. 547 virtual Value *getBroadcastInstrs(Value *V); 548 549 /// Add metadata from one instruction to another. 550 /// 551 /// This includes both the original MDs from \p From and additional ones (\see 552 /// addNewMetadata). Use this for *newly created* instructions in the vector 553 /// loop. 
554 void addMetadata(Instruction *To, Instruction *From); 555 556 /// Similar to the previous function but it adds the metadata to a 557 /// vector of instructions. 558 void addMetadata(ArrayRef<Value *> To, Instruction *From); 559 560 // Returns the resume value (bc.merge.rdx) for a reduction as 561 // generated by fixReduction. 562 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); 563 564 protected: 565 friend class LoopVectorizationPlanner; 566 567 /// A small list of PHINodes. 568 using PhiVector = SmallVector<PHINode *, 4>; 569 570 /// A type for scalarized values in the new loop. Each value from the 571 /// original loop, when scalarized, is represented by UF x VF scalar values 572 /// in the new unrolled loop, where UF is the unroll factor and VF is the 573 /// vectorization factor. 574 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 575 576 /// Set up the values of the IVs correctly when exiting the vector loop. 577 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 578 Value *CountRoundDown, Value *EndValue, 579 BasicBlock *MiddleBlock); 580 581 /// Introduce a conditional branch (on true, condition to be set later) at the 582 /// end of the header=latch connecting it to itself (across the backedge) and 583 /// to the exit block of \p L. 584 void createHeaderBranch(Loop *L); 585 586 /// Handle all cross-iteration phis in the header. 587 void fixCrossIterationPHIs(VPTransformState &State); 588 589 /// Create the exit value of first order recurrences in the middle block and 590 /// update their users. 591 void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, 592 VPTransformState &State); 593 594 /// Create code for the loop exit value of the reduction. 595 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 596 597 /// Clear NSW/NUW flags from reduction instructions if necessary. 598 void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 599 VPTransformState &State); 600 601 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 602 /// means we need to add the appropriate incoming value from the middle 603 /// block as exiting edges from the scalar epilogue loop (if present) are 604 /// already in place, and we exit the vector loop exclusively to the middle 605 /// block. 606 void fixLCSSAPHIs(VPTransformState &State); 607 608 /// Iteratively sink the scalarized operands of a predicated instruction into 609 /// the block that was created for it. 610 void sinkScalarOperands(Instruction *PredInst); 611 612 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 613 /// represented as. 614 void truncateToMinimalBitwidths(VPTransformState &State); 615 616 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 617 /// variable on which to base the steps, \p Step is the size of the step, and 618 /// \p EntryVal is the value from the original loop that maps to the steps. 619 /// Note that \p EntryVal doesn't have to be an induction variable - it 620 /// can also be a truncate instruction. 621 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 622 const InductionDescriptor &ID, VPValue *Def, 623 VPTransformState &State); 624 625 /// Create a vector induction phi node based on an existing scalar one. \p 626 /// EntryVal is the value from the original loop that maps to the vector phi 627 /// node, and \p Step is the loop-invariant step. 
If \p EntryVal is a 628 /// truncate instruction, instead of widening the original IV, we widen a 629 /// version of the IV truncated to \p EntryVal's type. 630 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 631 Value *Step, Value *Start, 632 Instruction *EntryVal, VPValue *Def, 633 VPTransformState &State); 634 635 /// Returns true if an instruction \p I should be scalarized instead of 636 /// vectorized for the chosen vectorization factor. 637 bool shouldScalarizeInstruction(Instruction *I) const; 638 639 /// Returns true if we should generate a scalar version of \p IV. 640 bool needsScalarInduction(Instruction *IV) const; 641 642 /// Returns (and creates if needed) the original loop trip count. 643 Value *getOrCreateTripCount(Loop *NewLoop); 644 645 /// Returns (and creates if needed) the trip count of the widened loop. 646 Value *getOrCreateVectorTripCount(Loop *NewLoop); 647 648 /// Returns a bitcasted value to the requested vector type. 649 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 650 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 651 const DataLayout &DL); 652 653 /// Emit a bypass check to see if the vector trip count is zero, including if 654 /// it overflows. 655 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 656 657 /// Emit a bypass check to see if all of the SCEV assumptions we've 658 /// had to make are correct. Returns the block containing the checks or 659 /// nullptr if no checks have been added. 660 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); 661 662 /// Emit bypass checks to check any memory assumptions we may have made. 663 /// Returns the block containing the checks or nullptr if no checks have been 664 /// added. 665 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 666 667 /// Compute the transformed value of Index at offset StartValue using step 668 /// StepValue. 669 /// For integer induction, returns StartValue + Index * StepValue. 670 /// For pointer induction, returns StartValue[Index * StepValue]. 671 /// FIXME: The newly created binary instructions should contain nsw/nuw 672 /// flags, which can be found from the original scalar operations. 673 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 674 const DataLayout &DL, 675 const InductionDescriptor &ID, 676 BasicBlock *VectorHeader) const; 677 678 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 679 /// vector loop preheader, middle block and scalar preheader. Also 680 /// allocate a loop object for the new vector loop and return it. 681 Loop *createVectorLoopSkeleton(StringRef Prefix); 682 683 /// Create new phi nodes for the induction variables to resume iteration count 684 /// in the scalar epilogue, from where the vectorized loop left off. 685 /// In cases where the loop skeleton is more complicated (eg. epilogue 686 /// vectorization) and the resume values can come from an additional bypass 687 /// block, the \p AdditionalBypass pair provides information about the bypass 688 /// block and the end value on the edge from bypass to this loop. 689 void createInductionResumeValues( 690 Loop *L, 691 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 692 693 /// Complete the loop skeleton by adding debug MDs, creating appropriate 694 /// conditional branches in the middle block, preparing the builder and 695 /// running the verifier. 
Take in the vector loop \p L as argument, and return
696 /// the preheader of the completed vector loop.
697 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
698
699 /// Add additional metadata to \p To that was not present on \p Orig.
700 ///
701 /// Currently this is used to add the noalias annotations based on the
702 /// inserted memchecks. Use this for instructions that are *cloned* into the
703 /// vector loop.
704 void addNewMetadata(Instruction *To, const Instruction *Orig);
705
706 /// Collect poison-generating recipes that may generate a poison value that is
707 /// used after vectorization, even when their operands are not poison. Those
708 /// recipes meet the following conditions:
709 /// * Contribute to the address computation of a recipe generating a widen
710 /// memory load/store (VPWidenMemoryInstructionRecipe or
711 /// VPInterleaveRecipe).
712 /// * Such a widen memory load/store has at least one underlying Instruction
713 /// that is in a basic block that needs predication and after vectorization
714 /// the generated instruction won't be predicated.
715 void collectPoisonGeneratingRecipes(VPTransformState &State);
716
717 /// Allow subclasses to override and print debug traces before/after vplan
718 /// execution, when trace information is requested.
719 virtual void printDebugTracesAtStart(){};
720 virtual void printDebugTracesAtEnd(){};
721
722 /// The original loop.
723 Loop *OrigLoop;
724
725 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
726 /// dynamic knowledge to simplify SCEV expressions and converts them to a
727 /// more usable form.
728 PredicatedScalarEvolution &PSE;
729
730 /// Loop Info.
731 LoopInfo *LI;
732
733 /// Dominator Tree.
734 DominatorTree *DT;
735
736 /// Alias Analysis.
737 AAResults *AA;
738
739 /// Target Library Info.
740 const TargetLibraryInfo *TLI;
741
742 /// Target Transform Info.
743 const TargetTransformInfo *TTI;
744
745 /// Assumption Cache.
746 AssumptionCache *AC;
747
748 /// Interface to emit optimization remarks.
749 OptimizationRemarkEmitter *ORE;
750
751 /// LoopVersioning. It's only set up (non-null) if memchecks were
752 /// used.
753 ///
754 /// This is currently only used to add no-alias metadata based on the
755 /// memchecks. The actual versioning is performed manually.
756 std::unique_ptr<LoopVersioning> LVer;
757
758 /// The vectorization SIMD factor to use. Each vector will have this many
759 /// vector elements.
760 ElementCount VF;
761
762 /// The vectorization unroll factor to use. Each scalar is vectorized to this
763 /// many different vector instructions.
764 unsigned UF;
765
766 /// The builder that we use.
767 IRBuilder<> Builder;
768
769 // --- Vectorization state ---
770
771 /// The vector-loop preheader.
772 BasicBlock *LoopVectorPreHeader;
773
774 /// The scalar-loop preheader.
775 BasicBlock *LoopScalarPreHeader;
776
777 /// Middle Block between the vector and the scalar.
778 BasicBlock *LoopMiddleBlock;
779
780 /// The unique ExitBlock of the scalar loop if one exists. Note that
781 /// there can be multiple exiting edges reaching this block.
782 BasicBlock *LoopExitBlock;
783
784 /// The vector loop body.
785 BasicBlock *LoopVectorBody;
786
787 /// The scalar loop body.
788 BasicBlock *LoopScalarBody;
789
790 /// A list of all bypass blocks. The first block is the entry of the loop.
791 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
792
793 /// Store instructions that were predicated.
794 SmallVector<Instruction *, 4> PredicatedInstructions;
795
796 /// Trip count of the original loop.
797 Value *TripCount = nullptr;
798
799 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
800 Value *VectorTripCount = nullptr;
801
802 /// The legality analysis.
803 LoopVectorizationLegality *Legal;
804
805 /// The profitability analysis.
806 LoopVectorizationCostModel *Cost;
807
808 // Record whether runtime checks are added.
809 bool AddedSafetyChecks = false;
810
811 // Holds the end values for each induction variable. We save the end values
812 // so we can later fix-up the external users of the induction variables.
813 DenseMap<PHINode *, Value *> IVEndValues;
814
815 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
816 // fixed up at the end of vector code generation.
817 SmallVector<PHINode *, 8> OrigPHIsToFix;
818
819 /// BFI and PSI are used to check for profile guided size optimizations.
820 BlockFrequencyInfo *BFI;
821 ProfileSummaryInfo *PSI;
822
823 // Whether this loop should be optimized for size based on profile guided size
824 // optimizations.
825 bool OptForSizeBasedOnProfile;
826
827 /// Structure to hold information about generated runtime checks, responsible
828 /// for cleaning the checks, if vectorization turns out unprofitable.
829 GeneratedRTChecks &RTChecks;
830
831 // Holds the resume values for reductions in the loops, used to set the
832 // correct start value of reduction PHIs when vectorizing the epilogue.
833 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
834 ReductionResumeValues;
835 };
836
837 class InnerLoopUnroller : public InnerLoopVectorizer {
838 public:
839 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
840 LoopInfo *LI, DominatorTree *DT,
841 const TargetLibraryInfo *TLI,
842 const TargetTransformInfo *TTI, AssumptionCache *AC,
843 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
844 LoopVectorizationLegality *LVL,
845 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
846 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
847 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
848 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
849 BFI, PSI, Check) {}
850
851 private:
852 Value *getBroadcastInstrs(Value *V) override;
853 };
854
855 /// Encapsulate information regarding vectorization of a loop and its epilogue.
856 /// This information is meant to be updated and used across two stages of
857 /// epilogue vectorization.
858 struct EpilogueLoopVectorizationInfo {
859 ElementCount MainLoopVF = ElementCount::getFixed(0);
860 unsigned MainLoopUF = 0;
861 ElementCount EpilogueVF = ElementCount::getFixed(0);
862 unsigned EpilogueUF = 0;
863 BasicBlock *MainLoopIterationCountCheck = nullptr;
864 BasicBlock *EpilogueIterationCountCheck = nullptr;
865 BasicBlock *SCEVSafetyCheck = nullptr;
866 BasicBlock *MemSafetyCheck = nullptr;
867 Value *TripCount = nullptr;
868 Value *VectorTripCount = nullptr;
869
870 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
871 ElementCount EVF, unsigned EUF)
872 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
873 assert(EUF == 1 &&
874 "A high UF for the epilogue loop is likely not beneficial.");
875 }
876 };
877
878 /// An extension of the inner loop vectorizer that creates a skeleton for a
879 /// vectorized loop that has its epilogue (residual) also vectorized.
880 /// The idea is to run the vplan on a given loop twice, firstly to setup the 881 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 882 /// from the first step and vectorize the epilogue. This is achieved by 883 /// deriving two concrete strategy classes from this base class and invoking 884 /// them in succession from the loop vectorizer planner. 885 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 886 public: 887 InnerLoopAndEpilogueVectorizer( 888 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 889 DominatorTree *DT, const TargetLibraryInfo *TLI, 890 const TargetTransformInfo *TTI, AssumptionCache *AC, 891 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 892 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 893 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 894 GeneratedRTChecks &Checks) 895 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 896 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 897 Checks), 898 EPI(EPI) {} 899 900 // Override this function to handle the more complex control flow around the 901 // three loops. 902 std::pair<BasicBlock *, Value *> 903 createVectorizedLoopSkeleton() final override { 904 return createEpilogueVectorizedLoopSkeleton(); 905 } 906 907 /// The interface for creating a vectorized skeleton using one of two 908 /// different strategies, each corresponding to one execution of the vplan 909 /// as described above. 910 virtual std::pair<BasicBlock *, Value *> 911 createEpilogueVectorizedLoopSkeleton() = 0; 912 913 /// Holds and updates state information required to vectorize the main loop 914 /// and its epilogue in two separate passes. This setup helps us avoid 915 /// regenerating and recomputing runtime safety checks. It also helps us to 916 /// shorten the iteration-count-check path length for the cases where the 917 /// iteration count of the loop is so small that the main vector loop is 918 /// completely skipped. 919 EpilogueLoopVectorizationInfo &EPI; 920 }; 921 922 /// A specialized derived class of inner loop vectorizer that performs 923 /// vectorization of *main* loops in the process of vectorizing loops and their 924 /// epilogues. 925 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 926 public: 927 EpilogueVectorizerMainLoop( 928 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 929 DominatorTree *DT, const TargetLibraryInfo *TLI, 930 const TargetTransformInfo *TTI, AssumptionCache *AC, 931 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 932 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 933 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 934 GeneratedRTChecks &Check) 935 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 936 EPI, LVL, CM, BFI, PSI, Check) {} 937 /// Implements the interface for creating a vectorized skeleton using the 938 /// *main loop* strategy (ie the first pass of vplan execution). 939 std::pair<BasicBlock *, Value *> 940 createEpilogueVectorizedLoopSkeleton() final override; 941 942 protected: 943 /// Emits an iteration count bypass check once for the main loop (when \p 944 /// ForEpilogue is false) and once for the epilogue loop (when \p 945 /// ForEpilogue is true). 
946 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
947 bool ForEpilogue);
948 void printDebugTracesAtStart() override;
949 void printDebugTracesAtEnd() override;
950 };
951
952 // A specialized derived class of inner loop vectorizer that performs
953 // vectorization of *epilogue* loops in the process of vectorizing loops and
954 // their epilogues.
955 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
956 public:
957 EpilogueVectorizerEpilogueLoop(
958 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
959 DominatorTree *DT, const TargetLibraryInfo *TLI,
960 const TargetTransformInfo *TTI, AssumptionCache *AC,
961 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
962 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
963 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
964 GeneratedRTChecks &Checks)
965 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
966 EPI, LVL, CM, BFI, PSI, Checks) {}
967 /// Implements the interface for creating a vectorized skeleton using the
968 /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
969 std::pair<BasicBlock *, Value *>
970 createEpilogueVectorizedLoopSkeleton() final override;
971
972 protected:
973 /// Emits an iteration count bypass check after the main vector loop has
974 /// finished to see if there are any iterations left to execute by either
975 /// the vector epilogue or the scalar epilogue.
976 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
977 BasicBlock *Bypass,
978 BasicBlock *Insert);
979 void printDebugTracesAtStart() override;
980 void printDebugTracesAtEnd() override;
981 };
982 } // end namespace llvm
983
984 /// Look for a meaningful debug location on the instruction or its
985 /// operands.
986 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
987 if (!I)
988 return I;
989
990 DebugLoc Empty;
991 if (I->getDebugLoc() != Empty)
992 return I;
993
994 for (Use &Op : I->operands()) {
995 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
996 if (OpInst->getDebugLoc() != Empty)
997 return OpInst;
998 }
999
1000 return I;
1001 }
1002
1003 void InnerLoopVectorizer::setDebugLocFromInst(
1004 const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1005 IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1006 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1007 const DILocation *DIL = Inst->getDebugLoc();
1008
1009 // When a FSDiscriminator is enabled, we don't need to add the multiply
1010 // factors to the discriminators.
1011 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1012 !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1013 // FIXME: For scalable vectors, assume vscale=1.
1014 auto NewDIL =
1015 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1016 if (NewDIL)
1017 B->SetCurrentDebugLocation(NewDIL.getValue());
1018 else
1019 LLVM_DEBUG(dbgs()
1020 << "Failed to create new discriminator: "
1021 << DIL->getFilename() << " Line: " << DIL->getLine());
1022 } else
1023 B->SetCurrentDebugLocation(DIL);
1024 } else
1025 B->SetCurrentDebugLocation(DebugLoc());
1026 }
1027
1028 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1029 /// is passed, the message relates to that particular instruction.
1030 #ifndef NDEBUG 1031 static void debugVectorizationMessage(const StringRef Prefix, 1032 const StringRef DebugMsg, 1033 Instruction *I) { 1034 dbgs() << "LV: " << Prefix << DebugMsg; 1035 if (I != nullptr) 1036 dbgs() << " " << *I; 1037 else 1038 dbgs() << '.'; 1039 dbgs() << '\n'; 1040 } 1041 #endif 1042 1043 /// Create an analysis remark that explains why vectorization failed 1044 /// 1045 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1046 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1047 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 1048 /// the location of the remark. \return the remark object that can be 1049 /// streamed to. 1050 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1051 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1052 Value *CodeRegion = TheLoop->getHeader(); 1053 DebugLoc DL = TheLoop->getStartLoc(); 1054 1055 if (I) { 1056 CodeRegion = I->getParent(); 1057 // If there is no debug location attached to the instruction, revert back to 1058 // using the loop's. 1059 if (I->getDebugLoc()) 1060 DL = I->getDebugLoc(); 1061 } 1062 1063 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 1064 } 1065 1066 namespace llvm { 1067 1068 /// Return a value for Step multiplied by VF. 1069 Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, 1070 int64_t Step) { 1071 assert(Ty->isIntegerTy() && "Expected an integer step"); 1072 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); 1073 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1074 } 1075 1076 /// Return the runtime value for VF. 1077 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { 1078 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1079 return VF.isScalable() ? B.CreateVScale(EC) : EC; 1080 } 1081 1082 static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) { 1083 assert(FTy->isFloatingPointTy() && "Expected floating point type!"); 1084 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); 1085 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); 1086 return B.CreateUIToFP(RuntimeVF, FTy); 1087 } 1088 1089 void reportVectorizationFailure(const StringRef DebugMsg, 1090 const StringRef OREMsg, const StringRef ORETag, 1091 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1092 Instruction *I) { 1093 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1094 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1095 ORE->emit( 1096 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1097 << "loop not vectorized: " << OREMsg); 1098 } 1099 1100 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1101 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1102 Instruction *I) { 1103 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1104 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1105 ORE->emit( 1106 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1107 << Msg); 1108 } 1109 1110 } // end namespace llvm 1111 1112 #ifndef NDEBUG 1113 /// \return string containing a file name and a line # for the given loop. 1114 static std::string getDebugLocString(const Loop *L) { 1115 std::string Result; 1116 if (L) { 1117 raw_string_ostream OS(Result); 1118 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1119 LoopDbgLoc.print(OS); 1120 else 1121 // Just print the module name. 
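// (header block -> parent Function -> parent Module).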
1122 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1123 OS.flush(); 1124 } 1125 return Result; 1126 } 1127 #endif 1128 1129 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1130 const Instruction *Orig) { 1131 // If the loop was versioned with memchecks, add the corresponding no-alias 1132 // metadata. 1133 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1134 LVer->annotateInstWithNoAlias(To, Orig); 1135 } 1136 1137 void InnerLoopVectorizer::collectPoisonGeneratingRecipes( 1138 VPTransformState &State) { 1139 1140 // Collect recipes in the backward slice of `Root` that may generate a poison 1141 // value that is used after vectorization. 1142 SmallPtrSet<VPRecipeBase *, 16> Visited; 1143 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { 1144 SmallVector<VPRecipeBase *, 16> Worklist; 1145 Worklist.push_back(Root); 1146 1147 // Traverse the backward slice of Root through its use-def chain. 1148 while (!Worklist.empty()) { 1149 VPRecipeBase *CurRec = Worklist.back(); 1150 Worklist.pop_back(); 1151 1152 if (!Visited.insert(CurRec).second) 1153 continue; 1154 1155 // Prune search if we find another recipe generating a widen memory 1156 // instruction. Widen memory instructions involved in address computation 1157 // will lead to gather/scatter instructions, which don't need to be 1158 // handled. 1159 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || 1160 isa<VPInterleaveRecipe>(CurRec) || 1161 isa<VPCanonicalIVPHIRecipe>(CurRec)) 1162 continue; 1163 1164 // This recipe contributes to the address computation of a widen 1165 // load/store. Collect recipe if its underlying instruction has 1166 // poison-generating flags. 1167 Instruction *Instr = CurRec->getUnderlyingInstr(); 1168 if (Instr && Instr->hasPoisonGeneratingFlags()) 1169 State.MayGeneratePoisonRecipes.insert(CurRec); 1170 1171 // Add new definitions to the worklist. 1172 for (VPValue *operand : CurRec->operands()) 1173 if (VPDef *OpDef = operand->getDef()) 1174 Worklist.push_back(cast<VPRecipeBase>(OpDef)); 1175 } 1176 }); 1177 1178 // Traverse all the recipes in the VPlan and collect the poison-generating 1179 // recipes in the backward slice starting at the address of a VPWidenRecipe or 1180 // VPInterleaveRecipe. 1181 auto Iter = depth_first( 1182 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); 1183 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 1184 for (VPRecipeBase &Recipe : *VPBB) { 1185 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { 1186 Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr(); 1187 VPDef *AddrDef = WidenRec->getAddr()->getDef(); 1188 if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr && 1189 Legal->blockNeedsPredication(UnderlyingInstr->getParent())) 1190 collectPoisonGeneratingInstrsInBackwardSlice( 1191 cast<VPRecipeBase>(AddrDef)); 1192 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { 1193 VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); 1194 if (AddrDef) { 1195 // Check if any member of the interleave group needs predication. 
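// Interleave groups may contain gaps, so getMember() can return null for
// some indices; only the members that exist are inspected below.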
1196 const InterleaveGroup<Instruction> *InterGroup = 1197 InterleaveRec->getInterleaveGroup(); 1198 bool NeedPredication = false; 1199 for (int I = 0, NumMembers = InterGroup->getNumMembers(); 1200 I < NumMembers; ++I) { 1201 Instruction *Member = InterGroup->getMember(I); 1202 if (Member) 1203 NeedPredication |= 1204 Legal->blockNeedsPredication(Member->getParent()); 1205 } 1206 1207 if (NeedPredication) 1208 collectPoisonGeneratingInstrsInBackwardSlice( 1209 cast<VPRecipeBase>(AddrDef)); 1210 } 1211 } 1212 } 1213 } 1214 } 1215 1216 void InnerLoopVectorizer::addMetadata(Instruction *To, 1217 Instruction *From) { 1218 propagateMetadata(To, From); 1219 addNewMetadata(To, From); 1220 } 1221 1222 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1223 Instruction *From) { 1224 for (Value *V : To) { 1225 if (Instruction *I = dyn_cast<Instruction>(V)) 1226 addMetadata(I, From); 1227 } 1228 } 1229 1230 PHINode *InnerLoopVectorizer::getReductionResumeValue( 1231 const RecurrenceDescriptor &RdxDesc) { 1232 auto It = ReductionResumeValues.find(&RdxDesc); 1233 assert(It != ReductionResumeValues.end() && 1234 "Expected to find a resume value for the reduction."); 1235 return It->second; 1236 } 1237 1238 namespace llvm { 1239 1240 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1241 // lowered. 1242 enum ScalarEpilogueLowering { 1243 1244 // The default: allowing scalar epilogues. 1245 CM_ScalarEpilogueAllowed, 1246 1247 // Vectorization with OptForSize: don't allow epilogues. 1248 CM_ScalarEpilogueNotAllowedOptSize, 1249 1250 // A special case of vectorisation with OptForSize: loops with a very small 1251 // trip count are considered for vectorization under OptForSize, thereby 1252 // making sure the cost of their loop body is dominant, free of runtime 1253 // guards and scalar iteration overheads. 1254 CM_ScalarEpilogueNotAllowedLowTripLoop, 1255 1256 // Loop hint predicate indicating an epilogue is undesired. 1257 CM_ScalarEpilogueNotNeededUsePredicate, 1258 1259 // Directive indicating we must either tail fold or not vectorize 1260 CM_ScalarEpilogueNotAllowedUsePredicate 1261 }; 1262 1263 /// ElementCountComparator creates a total ordering for ElementCount 1264 /// for the purposes of using it in a set structure. 1265 struct ElementCountComparator { 1266 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { 1267 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 1268 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 1269 } 1270 }; 1271 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; 1272 1273 /// LoopVectorizationCostModel - estimates the expected speedups due to 1274 /// vectorization. 1275 /// In many cases vectorization is not profitable. This can happen because of 1276 /// a number of reasons. In this class we mainly attempt to predict the 1277 /// expected speedup/slowdowns due to the supported instruction set. We use the 1278 /// TargetTransformInfo to query the different backends for the cost of 1279 /// different operations. 
1280 class LoopVectorizationCostModel { 1281 public: 1282 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1283 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1284 LoopVectorizationLegality *Legal, 1285 const TargetTransformInfo &TTI, 1286 const TargetLibraryInfo *TLI, DemandedBits *DB, 1287 AssumptionCache *AC, 1288 OptimizationRemarkEmitter *ORE, const Function *F, 1289 const LoopVectorizeHints *Hints, 1290 InterleavedAccessInfo &IAI) 1291 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1292 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1293 Hints(Hints), InterleaveInfo(IAI) {} 1294 1295 /// \return An upper bound for the vectorization factors (both fixed and 1296 /// scalable). If the factors are 0, vectorization and interleaving should be 1297 /// avoided up front. 1298 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1299 1300 /// \return True if runtime checks are required for vectorization, and false 1301 /// otherwise. 1302 bool runtimeChecksRequired(); 1303 1304 /// \return The most profitable vectorization factor and the cost of that VF. 1305 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO 1306 /// then this vectorization factor will be selected if vectorization is 1307 /// possible. 1308 VectorizationFactor 1309 selectVectorizationFactor(const ElementCountSet &CandidateVFs); 1310 1311 VectorizationFactor 1312 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1313 const LoopVectorizationPlanner &LVP); 1314 1315 /// Setup cost-based decisions for user vectorization factor. 1316 /// \return true if the UserVF is a feasible VF to be chosen. 1317 bool selectUserVectorizationFactor(ElementCount UserVF) { 1318 collectUniformsAndScalars(UserVF); 1319 collectInstsToScalarize(UserVF); 1320 return expectedCost(UserVF).first.isValid(); 1321 } 1322 1323 /// \return The size (in bits) of the smallest and widest types in the code 1324 /// that needs to be vectorized. We ignore values that remain scalar such as 1325 /// 64 bit loop indices. 1326 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1327 1328 /// \return The desired interleave count. 1329 /// If interleave count has been specified by metadata it will be returned. 1330 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1331 /// are the selected vectorization factor and the cost of the selected VF. 1332 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1333 1334 /// Memory access instruction may be vectorized in more than one way. 1335 /// Form of instruction after vectorization depends on cost. 1336 /// This function takes cost-based decisions for Load/Store instructions 1337 /// and collects them in a map. This decisions map is used for building 1338 /// the lists of loop-uniform and loop-scalar instructions. 1339 /// The calculated cost is saved with widening decision in order to 1340 /// avoid redundant calculations. 1341 void setCostBasedWideningDecision(ElementCount VF); 1342 1343 /// A struct that represents some properties of the register usage 1344 /// of a loop. 1345 struct RegisterUsage { 1346 /// Holds the number of loop invariant values that are used in the loop. 1347 /// The key is ClassID of target-provided register class. 1348 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1349 /// Holds the maximum number of concurrent live intervals in the loop. 1350 /// The key is ClassID of target-provided register class. 
1351 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1352 }; 1353 1354 /// \return Returns information about the register usages of the loop for the 1355 /// given vectorization factors. 1356 SmallVector<RegisterUsage, 8> 1357 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1358 1359 /// Collect values we want to ignore in the cost model. 1360 void collectValuesToIgnore(); 1361 1362 /// Collect all element types in the loop for which widening is needed. 1363 void collectElementTypesForWidening(); 1364 1365 /// Split reductions into those that happen in the loop, and those that happen 1366 /// outside. In loop reductions are collected into InLoopReductionChains. 1367 void collectInLoopReductions(); 1368 1369 /// Returns true if we should use strict in-order reductions for the given 1370 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1371 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1372 /// of FP operations. 1373 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1374 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1375 } 1376 1377 /// \returns The smallest bitwidth each instruction can be represented with. 1378 /// The vector equivalents of these instructions should be truncated to this 1379 /// type. 1380 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1381 return MinBWs; 1382 } 1383 1384 /// \returns True if it is more profitable to scalarize instruction \p I for 1385 /// vectorization factor \p VF. 1386 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1387 assert(VF.isVector() && 1388 "Profitable to scalarize relevant only for VF > 1."); 1389 1390 // Cost model is not run in the VPlan-native path - return conservative 1391 // result until this changes. 1392 if (EnableVPlanNativePath) 1393 return false; 1394 1395 auto Scalars = InstsToScalarize.find(VF); 1396 assert(Scalars != InstsToScalarize.end() && 1397 "VF not yet analyzed for scalarization profitability"); 1398 return Scalars->second.find(I) != Scalars->second.end(); 1399 } 1400 1401 /// Returns true if \p I is known to be uniform after vectorization. 1402 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1403 if (VF.isScalar()) 1404 return true; 1405 1406 // Cost model is not run in the VPlan-native path - return conservative 1407 // result until this changes. 1408 if (EnableVPlanNativePath) 1409 return false; 1410 1411 auto UniformsPerVF = Uniforms.find(VF); 1412 assert(UniformsPerVF != Uniforms.end() && 1413 "VF not yet analyzed for uniformity"); 1414 return UniformsPerVF->second.count(I); 1415 } 1416 1417 /// Returns true if \p I is known to be scalar after vectorization. 1418 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1419 if (VF.isScalar()) 1420 return true; 1421 1422 // Cost model is not run in the VPlan-native path - return conservative 1423 // result until this changes. 1424 if (EnableVPlanNativePath) 1425 return false; 1426 1427 auto ScalarsPerVF = Scalars.find(VF); 1428 assert(ScalarsPerVF != Scalars.end() && 1429 "Scalar values are not calculated for VF"); 1430 return ScalarsPerVF->second.count(I); 1431 } 1432 1433 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1434 /// for vectorization factor \p VF. 
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return true if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
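    // For example (illustrative only): with VF = 4, an i64 source type and an
    // i32 destination type become <4 x i64> and <4 x i32> below, and the
    // TTI.isTruncateFree query then decides whether the truncate is free on
    // the target.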
1514 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1515 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1516 1517 // If the truncate is free for the given types, return false. Replacing a 1518 // free truncate with an induction variable would add an induction variable 1519 // update instruction to each iteration of the loop. We exclude from this 1520 // check the primary induction variable since it will need an update 1521 // instruction regardless. 1522 Value *Op = Trunc->getOperand(0); 1523 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1524 return false; 1525 1526 // If the truncated value is not an induction variable, return false. 1527 return Legal->isInductionPhi(Op); 1528 } 1529 1530 /// Collects the instructions to scalarize for each predicated instruction in 1531 /// the loop. 1532 void collectInstsToScalarize(ElementCount VF); 1533 1534 /// Collect Uniform and Scalar values for the given \p VF. 1535 /// The sets depend on CM decision for Load/Store instructions 1536 /// that may be vectorized as interleave, gather-scatter or scalarized. 1537 void collectUniformsAndScalars(ElementCount VF) { 1538 // Do the analysis once. 1539 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1540 return; 1541 setCostBasedWideningDecision(VF); 1542 collectLoopUniforms(VF); 1543 collectLoopScalars(VF); 1544 } 1545 1546 /// Returns true if the target machine supports masked store operation 1547 /// for the given \p DataType and kind of access to \p Ptr. 1548 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1549 return Legal->isConsecutivePtr(DataType, Ptr) && 1550 TTI.isLegalMaskedStore(DataType, Alignment); 1551 } 1552 1553 /// Returns true if the target machine supports masked load operation 1554 /// for the given \p DataType and kind of access to \p Ptr. 1555 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1556 return Legal->isConsecutivePtr(DataType, Ptr) && 1557 TTI.isLegalMaskedLoad(DataType, Alignment); 1558 } 1559 1560 /// Returns true if the target machine can represent \p V as a masked gather 1561 /// or scatter operation. 1562 bool isLegalGatherOrScatter(Value *V, 1563 ElementCount VF = ElementCount::getFixed(1)) { 1564 bool LI = isa<LoadInst>(V); 1565 bool SI = isa<StoreInst>(V); 1566 if (!LI && !SI) 1567 return false; 1568 auto *Ty = getLoadStoreType(V); 1569 Align Align = getLoadStoreAlignment(V); 1570 if (VF.isVector()) 1571 Ty = VectorType::get(Ty, VF); 1572 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1573 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1574 } 1575 1576 /// Returns true if the target machine supports all of the reduction 1577 /// variables found for the given VF. 1578 bool canVectorizeReductions(ElementCount VF) const { 1579 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1580 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1581 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1582 })); 1583 } 1584 1585 /// Returns true if \p I is an instruction that will be scalarized with 1586 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1587 /// instructions include conditional stores and instructions that may divide 1588 /// by zero. 1589 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1590 1591 // Returns true if \p I is an instruction that will be predicated either 1592 // through scalar predication or masked load/store or masked gather/scatter. 
1593 // \p VF is the vectorization factor that will be used to vectorize \p I. 1594 // Superset of instructions that return true for isScalarWithPredication. 1595 bool isPredicatedInst(Instruction *I, ElementCount VF, 1596 bool IsKnownUniform = false) { 1597 // When we know the load is uniform and the original scalar loop was not 1598 // predicated we don't need to mark it as a predicated instruction. Any 1599 // vectorised blocks created when tail-folding are something artificial we 1600 // have introduced and we know there is always at least one active lane. 1601 // That's why we call Legal->blockNeedsPredication here because it doesn't 1602 // query tail-folding. 1603 if (IsKnownUniform && isa<LoadInst>(I) && 1604 !Legal->blockNeedsPredication(I->getParent())) 1605 return false; 1606 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1607 return false; 1608 // Loads and stores that need some form of masked operation are predicated 1609 // instructions. 1610 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1611 return Legal->isMaskRequired(I); 1612 return isScalarWithPredication(I, VF); 1613 } 1614 1615 /// Returns true if \p I is a memory instruction with consecutive memory 1616 /// access that can be widened. 1617 bool 1618 memoryInstructionCanBeWidened(Instruction *I, 1619 ElementCount VF = ElementCount::getFixed(1)); 1620 1621 /// Returns true if \p I is a memory instruction in an interleaved-group 1622 /// of memory accesses that can be vectorized with wide vector loads/stores 1623 /// and shuffles. 1624 bool 1625 interleavedAccessCanBeWidened(Instruction *I, 1626 ElementCount VF = ElementCount::getFixed(1)); 1627 1628 /// Check if \p Instr belongs to any interleaved access group. 1629 bool isAccessInterleaved(Instruction *Instr) { 1630 return InterleaveInfo.isInterleaved(Instr); 1631 } 1632 1633 /// Get the interleaved access group that \p Instr belongs to. 1634 const InterleaveGroup<Instruction> * 1635 getInterleavedAccessGroup(Instruction *Instr) { 1636 return InterleaveInfo.getInterleaveGroup(Instr); 1637 } 1638 1639 /// Returns true if we're required to use a scalar epilogue for at least 1640 /// the final iteration of the original loop. 1641 bool requiresScalarEpilogue(ElementCount VF) const { 1642 if (!isScalarEpilogueAllowed()) 1643 return false; 1644 // If we might exit from anywhere but the latch, must run the exiting 1645 // iteration in scalar form. 1646 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1647 return true; 1648 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1649 } 1650 1651 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1652 /// loop hint annotation. 1653 bool isScalarEpilogueAllowed() const { 1654 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1655 } 1656 1657 /// Returns true if all loop blocks should be masked to fold tail loop. 1658 bool foldTailByMasking() const { return FoldTailByMasking; } 1659 1660 /// Returns true if the instructions in this block requires predication 1661 /// for any reason, e.g. because tail folding now requires a predicate 1662 /// or because the block in the original loop was predicated. 1663 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1664 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1665 } 1666 1667 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1668 /// nodes to the chain of instructions representing the reductions. Uses a 1669 /// MapVector to ensure deterministic iteration order. 
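  /// For example (illustrative), for an in-loop integer reduction computing
  /// 'red += a[i] + b[i]', the reduction phi maps to the two 'add'
  /// instructions of its chain, listed in program order from the phi down to
  /// the value that feeds back into it.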
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
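  /// For example (illustrative), a result of {8, true} is an estimated cost of
  /// 8 where at least one operation is expected to remain a vector operation,
  /// while {8, false} is the same cost with every operation expected to be
  /// scalarized by the backend.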
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be
  /// truncated to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop, along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup of the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true.
In general, a 1875 /// scalarized instruction will be represented by VF scalar values in the 1876 /// vectorized loop, each corresponding to an iteration of the original 1877 /// scalar loop. 1878 void collectLoopUniforms(ElementCount VF); 1879 1880 /// Collect the instructions that are scalar after vectorization. An 1881 /// instruction is scalar if it is known to be uniform or will be scalarized 1882 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1883 /// to the list if they are used by a load/store instruction that is marked as 1884 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1885 /// VF values in the vectorized loop, each corresponding to an iteration of 1886 /// the original scalar loop. 1887 void collectLoopScalars(ElementCount VF); 1888 1889 /// Keeps cost model vectorization decision and cost for instructions. 1890 /// Right now it is used for memory instructions only. 1891 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1892 std::pair<InstWidening, InstructionCost>>; 1893 1894 DecisionList WideningDecisions; 1895 1896 /// Returns true if \p V is expected to be vectorized and it needs to be 1897 /// extracted. 1898 bool needsExtract(Value *V, ElementCount VF) const { 1899 Instruction *I = dyn_cast<Instruction>(V); 1900 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1901 TheLoop->isLoopInvariant(I)) 1902 return false; 1903 1904 // Assume we can vectorize V (and hence we need extraction) if the 1905 // scalars are not computed yet. This can happen, because it is called 1906 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1907 // the scalars are collected. That should be a safe assumption in most 1908 // cases, because we check if the operands have vectorizable types 1909 // beforehand in LoopVectorizationLegality. 1910 return Scalars.find(VF) == Scalars.end() || 1911 !isScalarAfterVectorization(I, VF); 1912 }; 1913 1914 /// Returns a range containing only operands needing to be extracted. 1915 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1916 ElementCount VF) const { 1917 return SmallVector<Value *, 4>(make_filter_range( 1918 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1919 } 1920 1921 /// Determines if we have the infrastructure to vectorize loop \p L and its 1922 /// epilogue, assuming the main loop is vectorized by \p VF. 1923 bool isCandidateForEpilogueVectorization(const Loop &L, 1924 const ElementCount VF) const; 1925 1926 /// Returns true if epilogue vectorization is considered profitable, and 1927 /// false otherwise. 1928 /// \p VF is the vectorization factor chosen for the original loop. 1929 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1930 1931 public: 1932 /// The loop that we evaluate. 1933 Loop *TheLoop; 1934 1935 /// Predicated scalar evolution analysis. 1936 PredicatedScalarEvolution &PSE; 1937 1938 /// Loop Info analysis. 1939 LoopInfo *LI; 1940 1941 /// Vectorization legality. 1942 LoopVectorizationLegality *Legal; 1943 1944 /// Vector target information. 1945 const TargetTransformInfo &TTI; 1946 1947 /// Target Library Info. 1948 const TargetLibraryInfo *TLI; 1949 1950 /// Demanded bits analysis. 1951 DemandedBits *DB; 1952 1953 /// Assumption cache. 1954 AssumptionCache *AC; 1955 1956 /// Interface to emit optimization remarks. 1957 OptimizationRemarkEmitter *ORE; 1958 1959 const Function *TheFunction; 1960 1961 /// Loop Vectorize Hint. 
1962 const LoopVectorizeHints *Hints; 1963 1964 /// The interleave access information contains groups of interleaved accesses 1965 /// with the same stride and close to each other. 1966 InterleavedAccessInfo &InterleaveInfo; 1967 1968 /// Values to ignore in the cost model. 1969 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1970 1971 /// Values to ignore in the cost model when VF > 1. 1972 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1973 1974 /// All element types found in the loop. 1975 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1976 1977 /// Profitable vector factors. 1978 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1979 }; 1980 } // end namespace llvm 1981 1982 /// Helper struct to manage generating runtime checks for vectorization. 1983 /// 1984 /// The runtime checks are created up-front in temporary blocks to allow better 1985 /// estimating the cost and un-linked from the existing IR. After deciding to 1986 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1987 /// temporary blocks are completely removed. 1988 class GeneratedRTChecks { 1989 /// Basic block which contains the generated SCEV checks, if any. 1990 BasicBlock *SCEVCheckBlock = nullptr; 1991 1992 /// The value representing the result of the generated SCEV checks. If it is 1993 /// nullptr, either no SCEV checks have been generated or they have been used. 1994 Value *SCEVCheckCond = nullptr; 1995 1996 /// Basic block which contains the generated memory runtime checks, if any. 1997 BasicBlock *MemCheckBlock = nullptr; 1998 1999 /// The value representing the result of the generated memory runtime checks. 2000 /// If it is nullptr, either no memory runtime checks have been generated or 2001 /// they have been used. 2002 Value *MemRuntimeCheckCond = nullptr; 2003 2004 DominatorTree *DT; 2005 LoopInfo *LI; 2006 2007 SCEVExpander SCEVExp; 2008 SCEVExpander MemCheckExp; 2009 2010 public: 2011 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 2012 const DataLayout &DL) 2013 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 2014 MemCheckExp(SE, DL, "scev.check") {} 2015 2016 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 2017 /// accurately estimate the cost of the runtime checks. The blocks are 2018 /// un-linked from the IR and is added back during vector code generation. If 2019 /// there is no vector code generation, the check blocks are removed 2020 /// completely. 2021 void Create(Loop *L, const LoopAccessInfo &LAI, 2022 const SCEVUnionPredicate &UnionPred) { 2023 2024 BasicBlock *LoopHeader = L->getHeader(); 2025 BasicBlock *Preheader = L->getLoopPreheader(); 2026 2027 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 2028 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 2029 // may be used by SCEVExpander. The blocks will be un-linked from their 2030 // predecessors and removed from LI & DT at the end of the function. 2031 if (!UnionPred.isAlwaysTrue()) { 2032 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 2033 nullptr, "vector.scevcheck"); 2034 2035 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 2036 &UnionPred, SCEVCheckBlock->getTerminator()); 2037 } 2038 2039 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2040 if (RtPtrChecking.Need) { 2041 auto *Pred = SCEVCheckBlock ? 
SCEVCheckBlock : Preheader; 2042 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2043 "vector.memcheck"); 2044 2045 MemRuntimeCheckCond = 2046 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 2047 RtPtrChecking.getChecks(), MemCheckExp); 2048 assert(MemRuntimeCheckCond && 2049 "no RT checks generated although RtPtrChecking " 2050 "claimed checks are required"); 2051 } 2052 2053 if (!MemCheckBlock && !SCEVCheckBlock) 2054 return; 2055 2056 // Unhook the temporary block with the checks, update various places 2057 // accordingly. 2058 if (SCEVCheckBlock) 2059 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2060 if (MemCheckBlock) 2061 MemCheckBlock->replaceAllUsesWith(Preheader); 2062 2063 if (SCEVCheckBlock) { 2064 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2065 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2066 Preheader->getTerminator()->eraseFromParent(); 2067 } 2068 if (MemCheckBlock) { 2069 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2070 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2071 Preheader->getTerminator()->eraseFromParent(); 2072 } 2073 2074 DT->changeImmediateDominator(LoopHeader, Preheader); 2075 if (MemCheckBlock) { 2076 DT->eraseNode(MemCheckBlock); 2077 LI->removeBlock(MemCheckBlock); 2078 } 2079 if (SCEVCheckBlock) { 2080 DT->eraseNode(SCEVCheckBlock); 2081 LI->removeBlock(SCEVCheckBlock); 2082 } 2083 } 2084 2085 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2086 /// unused. 2087 ~GeneratedRTChecks() { 2088 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2089 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2090 if (!SCEVCheckCond) 2091 SCEVCleaner.markResultUsed(); 2092 2093 if (!MemRuntimeCheckCond) 2094 MemCheckCleaner.markResultUsed(); 2095 2096 if (MemRuntimeCheckCond) { 2097 auto &SE = *MemCheckExp.getSE(); 2098 // Memory runtime check generation creates compares that use expanded 2099 // values. Remove them before running the SCEVExpanderCleaners. 2100 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2101 if (MemCheckExp.isInsertedInstruction(&I)) 2102 continue; 2103 SE.forgetValue(&I); 2104 I.eraseFromParent(); 2105 } 2106 } 2107 MemCheckCleaner.cleanup(); 2108 SCEVCleaner.cleanup(); 2109 2110 if (SCEVCheckCond) 2111 SCEVCheckBlock->eraseFromParent(); 2112 if (MemRuntimeCheckCond) 2113 MemCheckBlock->eraseFromParent(); 2114 } 2115 2116 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2117 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2118 /// depending on the generated condition. 2119 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2120 BasicBlock *LoopVectorPreHeader, 2121 BasicBlock *LoopExitBlock) { 2122 if (!SCEVCheckCond) 2123 return nullptr; 2124 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2125 if (C->isZero()) 2126 return nullptr; 2127 2128 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2129 2130 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2131 // Create new preheader for vector loop. 
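    // Illustrative sketch of the CFG produced by the rewiring below (names as
    // used in this function):
    //
    //              Pred
    //                |
    //         SCEVCheckBlock --- SCEVCheckCond ---> Bypass
    //                |
    //        LoopVectorPreHeader
    //                |
    //            vector loop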
2132 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2133 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2134 2135 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2136 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2137 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2138 SCEVCheckBlock); 2139 2140 DT->addNewBlock(SCEVCheckBlock, Pred); 2141 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2142 2143 ReplaceInstWithInst( 2144 SCEVCheckBlock->getTerminator(), 2145 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2146 // Mark the check as used, to prevent it from being removed during cleanup. 2147 SCEVCheckCond = nullptr; 2148 return SCEVCheckBlock; 2149 } 2150 2151 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2152 /// the branches to branch to the vector preheader or \p Bypass, depending on 2153 /// the generated condition. 2154 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2155 BasicBlock *LoopVectorPreHeader) { 2156 // Check if we generated code that checks in runtime if arrays overlap. 2157 if (!MemRuntimeCheckCond) 2158 return nullptr; 2159 2160 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2161 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2162 MemCheckBlock); 2163 2164 DT->addNewBlock(MemCheckBlock, Pred); 2165 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2166 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2167 2168 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2169 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2170 2171 ReplaceInstWithInst( 2172 MemCheckBlock->getTerminator(), 2173 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2174 MemCheckBlock->getTerminator()->setDebugLoc( 2175 Pred->getTerminator()->getDebugLoc()); 2176 2177 // Mark the check as used, to prevent it from being removed during cleanup. 2178 MemRuntimeCheckCond = nullptr; 2179 return MemCheckBlock; 2180 } 2181 }; 2182 2183 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2184 // vectorization. The loop needs to be annotated with #pragma omp simd 2185 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2186 // vector length information is not provided, vectorization is not considered 2187 // explicit. Interleave hints are not allowed either. These limitations will be 2188 // relaxed in the future. 2189 // Please, note that we are currently forced to abuse the pragma 'clang 2190 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2191 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2192 // provides *explicit vectorization hints* (LV can bypass legal checks and 2193 // assume that vectorization is legal). However, both hints are implemented 2194 // using the same metadata (llvm.loop.vectorize, processed by 2195 // LoopVectorizeHints). This will be fixed in the future when the native IR 2196 // representation for pragma 'omp simd' is introduced. 2197 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2198 OptimizationRemarkEmitter *ORE) { 2199 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2200 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2201 2202 // Only outer loops with an explicit vectorization hint are supported. 2203 // Unannotated outer loops are ignored. 
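  // For example (illustrative), an outer loop annotated with
  //   #pragma omp simd simdlen(4)
  // or
  //   #pragma clang loop vectorize(enable) vectorize_width(4)
  // is treated as explicitly vectorized here, since both forms end up as
  // llvm.loop.vectorize metadata read by LoopVectorizeHints.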
2204 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2205 return false; 2206 2207 Function *Fn = OuterLp->getHeader()->getParent(); 2208 if (!Hints.allowVectorization(Fn, OuterLp, 2209 true /*VectorizeOnlyWhenForced*/)) { 2210 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2211 return false; 2212 } 2213 2214 if (Hints.getInterleave() > 1) { 2215 // TODO: Interleave support is future work. 2216 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2217 "outer loops.\n"); 2218 Hints.emitRemarkWithHints(); 2219 return false; 2220 } 2221 2222 return true; 2223 } 2224 2225 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2226 OptimizationRemarkEmitter *ORE, 2227 SmallVectorImpl<Loop *> &V) { 2228 // Collect inner loops and outer loops without irreducible control flow. For 2229 // now, only collect outer loops that have explicit vectorization hints. If we 2230 // are stress testing the VPlan H-CFG construction, we collect the outermost 2231 // loop of every loop nest. 2232 if (L.isInnermost() || VPlanBuildStressTest || 2233 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2234 LoopBlocksRPO RPOT(&L); 2235 RPOT.perform(LI); 2236 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2237 V.push_back(&L); 2238 // TODO: Collect inner loops inside marked outer loops in case 2239 // vectorization fails for the outer loop. Do not invoke 2240 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2241 // already known to be reducible. We can use an inherited attribute for 2242 // that. 2243 return; 2244 } 2245 } 2246 for (Loop *InnerL : L) 2247 collectSupportedLoops(*InnerL, LI, ORE, V); 2248 } 2249 2250 namespace { 2251 2252 /// The LoopVectorize Pass. 2253 struct LoopVectorize : public FunctionPass { 2254 /// Pass identification, replacement for typeid 2255 static char ID; 2256 2257 LoopVectorizePass Impl; 2258 2259 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2260 bool VectorizeOnlyWhenForced = false) 2261 : FunctionPass(ID), 2262 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2263 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2264 } 2265 2266 bool runOnFunction(Function &F) override { 2267 if (skipFunction(F)) 2268 return false; 2269 2270 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2271 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2272 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2273 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2274 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2275 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2276 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2277 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2278 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2279 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2280 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2281 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2282 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2283 2284 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2285 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2286 2287 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2288 GetLAA, *ORE, PSI).MadeAnyChange; 2289 } 2290 2291 void getAnalysisUsage(AnalysisUsage &AU) const override { 2292 AU.addRequired<AssumptionCacheTracker>(); 2293 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2294 AU.addRequired<DominatorTreeWrapperPass>(); 2295 AU.addRequired<LoopInfoWrapperPass>(); 2296 AU.addRequired<ScalarEvolutionWrapperPass>(); 2297 AU.addRequired<TargetTransformInfoWrapperPass>(); 2298 AU.addRequired<AAResultsWrapperPass>(); 2299 AU.addRequired<LoopAccessLegacyAnalysis>(); 2300 AU.addRequired<DemandedBitsWrapperPass>(); 2301 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2302 AU.addRequired<InjectTLIMappingsLegacy>(); 2303 2304 // We currently do not preserve loopinfo/dominator analyses with outer loop 2305 // vectorization. Until this is addressed, mark these analyses as preserved 2306 // only for non-VPlan-native path. 2307 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2308 if (!EnableVPlanNativePath) { 2309 AU.addPreserved<LoopInfoWrapperPass>(); 2310 AU.addPreserved<DominatorTreeWrapperPass>(); 2311 } 2312 2313 AU.addPreserved<BasicAAWrapperPass>(); 2314 AU.addPreserved<GlobalsAAWrapperPass>(); 2315 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2316 } 2317 }; 2318 2319 } // end anonymous namespace 2320 2321 //===----------------------------------------------------------------------===// 2322 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2323 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2324 //===----------------------------------------------------------------------===// 2325 2326 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2327 // We need to place the broadcast of invariant variables outside the loop, 2328 // but only if it's proven safe to do so. Else, broadcast will be inside 2329 // vector loop body. 2330 Instruction *Instr = dyn_cast<Instruction>(V); 2331 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2332 (!Instr || 2333 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2334 // Place the code for broadcasting invariant variables in the new preheader. 2335 IRBuilder<>::InsertPointGuard Guard(Builder); 2336 if (SafeToHoist) 2337 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2338 2339 // Broadcast the scalar into all locations in the vector. 2340 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2341 2342 return Shuf; 2343 } 2344 2345 /// This function adds 2346 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2347 /// to each vector element of Val. The sequence starts at StartIndex. 2348 /// \p Opcode is relevant for FP induction variable. 
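/// For example (illustrative), for an integer induction with StartIdx = 0,
/// Step = 2 and VF = 4 the vector added to Val is <0, 2, 4, 6>; for FP
/// inductions the same sequence is combined using \p BinOp (FAdd/FSub) and
/// FMul.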
2349 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2350 Instruction::BinaryOps BinOp, ElementCount VF, 2351 IRBuilder<> &Builder) { 2352 assert(VF.isVector() && "only vector VFs are supported"); 2353 2354 // Create and check the types. 2355 auto *ValVTy = cast<VectorType>(Val->getType()); 2356 ElementCount VLen = ValVTy->getElementCount(); 2357 2358 Type *STy = Val->getType()->getScalarType(); 2359 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2360 "Induction Step must be an integer or FP"); 2361 assert(Step->getType() == STy && "Step has wrong type"); 2362 2363 SmallVector<Constant *, 8> Indices; 2364 2365 // Create a vector of consecutive numbers from zero to VF. 2366 VectorType *InitVecValVTy = ValVTy; 2367 Type *InitVecValSTy = STy; 2368 if (STy->isFloatingPointTy()) { 2369 InitVecValSTy = 2370 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2371 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2372 } 2373 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2374 2375 // Splat the StartIdx 2376 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2377 2378 if (STy->isIntegerTy()) { 2379 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2380 Step = Builder.CreateVectorSplat(VLen, Step); 2381 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2382 // FIXME: The newly created binary instructions should contain nsw/nuw 2383 // flags, which can be found from the original scalar operations. 2384 Step = Builder.CreateMul(InitVec, Step); 2385 return Builder.CreateAdd(Val, Step, "induction"); 2386 } 2387 2388 // Floating point induction. 2389 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2390 "Binary Opcode should be specified for FP induction"); 2391 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2392 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2393 2394 Step = Builder.CreateVectorSplat(VLen, Step); 2395 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2396 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2397 } 2398 2399 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2400 const InductionDescriptor &II, Value *Step, Value *Start, 2401 Instruction *EntryVal, VPValue *Def, VPTransformState &State) { 2402 IRBuilder<> &Builder = State.Builder; 2403 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2404 "Expected either an induction phi-node or a truncate of it!"); 2405 2406 // Construct the initial value of the vector IV in the vector loop preheader 2407 auto CurrIP = Builder.saveIP(); 2408 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2409 if (isa<TruncInst>(EntryVal)) { 2410 assert(Start->getType()->isIntegerTy() && 2411 "Truncation requires an integer type"); 2412 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2413 Step = Builder.CreateTrunc(Step, TruncType); 2414 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2415 } 2416 2417 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 2418 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 2419 Value *SteppedStart = getStepVector( 2420 SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder); 2421 2422 // We create vector phi nodes for both integer and floating-point induction 2423 // variables. Here, we determine the kind of arithmetic we will perform. 
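  // For example (illustrative), for an i32 induction with step S and a fixed
  // VF of 4, the vector phi starts at SplatStart + <0, S, 2*S, 3*S> and each
  // unrolled part below advances it by a splat of 4 * S.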
2424 Instruction::BinaryOps AddOp; 2425 Instruction::BinaryOps MulOp; 2426 if (Step->getType()->isIntegerTy()) { 2427 AddOp = Instruction::Add; 2428 MulOp = Instruction::Mul; 2429 } else { 2430 AddOp = II.getInductionOpcode(); 2431 MulOp = Instruction::FMul; 2432 } 2433 2434 // Multiply the vectorization factor by the step using integer or 2435 // floating-point arithmetic as appropriate. 2436 Type *StepType = Step->getType(); 2437 Value *RuntimeVF; 2438 if (Step->getType()->isFloatingPointTy()) 2439 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 2440 else 2441 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 2442 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2443 2444 // Create a vector splat to use in the induction update. 2445 // 2446 // FIXME: If the step is non-constant, we create the vector splat with 2447 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2448 // handle a constant vector splat. 2449 Value *SplatVF = isa<Constant>(Mul) 2450 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 2451 : Builder.CreateVectorSplat(State.VF, Mul); 2452 Builder.restoreIP(CurrIP); 2453 2454 // We may need to add the step a number of times, depending on the unroll 2455 // factor. The last of those goes into the PHI. 2456 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2457 &*LoopVectorBody->getFirstInsertionPt()); 2458 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2459 Instruction *LastInduction = VecInd; 2460 for (unsigned Part = 0; Part < UF; ++Part) { 2461 State.set(Def, LastInduction, Part); 2462 2463 if (isa<TruncInst>(EntryVal)) 2464 addMetadata(LastInduction, EntryVal); 2465 2466 LastInduction = cast<Instruction>( 2467 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2468 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2469 } 2470 2471 // Move the last step to the end of the latch block. This ensures consistent 2472 // placement of all induction updates. 2473 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2474 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2475 LastInduction->moveBefore(Br); 2476 LastInduction->setName("vec.ind.next"); 2477 2478 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2479 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2480 } 2481 2482 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2483 return Cost->isScalarAfterVectorization(I, VF) || 2484 Cost->isProfitableToScalarize(I, VF); 2485 } 2486 2487 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2488 if (shouldScalarizeInstruction(IV)) 2489 return true; 2490 auto isScalarInst = [&](User *U) -> bool { 2491 auto *I = cast<Instruction>(U); 2492 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2493 }; 2494 return llvm::any_of(IV->users(), isScalarInst); 2495 } 2496 2497 void InnerLoopVectorizer::widenIntOrFpInduction( 2498 PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State, 2499 Value *CanonicalIV) { 2500 Value *Start = Def->getStartValue()->getLiveInIRValue(); 2501 const InductionDescriptor &ID = Def->getInductionDescriptor(); 2502 TruncInst *Trunc = Def->getTruncInst(); 2503 IRBuilder<> &Builder = State.Builder; 2504 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2505 assert(!State.VF.isZero() && "VF must be non-zero"); 2506 2507 // The value from the original loop to which we are mapping the new induction 2508 // variable. 
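  // For example (illustrative), if the original loop uses the induction only
  // through 'trunc i64 %iv to i32', the recipe carries that truncate and
  // EntryVal below is the truncate rather than the phi itself.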
2509 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2510 2511 auto &DL = EntryVal->getModule()->getDataLayout(); 2512 2513 // Generate code for the induction step. Note that induction steps are 2514 // required to be loop-invariant 2515 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2516 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2517 "Induction step should be loop invariant"); 2518 if (PSE.getSE()->isSCEVable(IV->getType())) { 2519 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2520 return Exp.expandCodeFor(Step, Step->getType(), 2521 State.CFG.VectorPreHeader->getTerminator()); 2522 } 2523 return cast<SCEVUnknown>(Step)->getValue(); 2524 }; 2525 2526 // The scalar value to broadcast. This is derived from the canonical 2527 // induction variable. If a truncation type is given, truncate the canonical 2528 // induction variable and step. Otherwise, derive these values from the 2529 // induction descriptor. 2530 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2531 Value *ScalarIV = CanonicalIV; 2532 Type *NeededType = IV->getType(); 2533 if (!Def->isCanonical() || ScalarIV->getType() != NeededType) { 2534 ScalarIV = 2535 NeededType->isIntegerTy() 2536 ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType) 2537 : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType); 2538 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, 2539 State.CFG.PrevBB); 2540 ScalarIV->setName("offset.idx"); 2541 } 2542 if (Trunc) { 2543 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2544 assert(Step->getType()->isIntegerTy() && 2545 "Truncation requires an integer step"); 2546 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2547 Step = Builder.CreateTrunc(Step, TruncType); 2548 } 2549 return ScalarIV; 2550 }; 2551 2552 // Create the vector values from the scalar IV, in the absence of creating a 2553 // vector IV. 2554 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2555 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2556 for (unsigned Part = 0; Part < UF; ++Part) { 2557 Value *StartIdx; 2558 if (Step->getType()->isFloatingPointTy()) 2559 StartIdx = 2560 getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); 2561 else 2562 StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); 2563 2564 Value *EntryPart = 2565 getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(), 2566 State.VF, State.Builder); 2567 State.set(Def, EntryPart, Part); 2568 if (Trunc) 2569 addMetadata(EntryPart, Trunc); 2570 } 2571 }; 2572 2573 // Fast-math-flags propagate from the original induction instruction. 2574 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2575 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2576 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2577 2578 // Now do the actual transformations, and start with creating the step value. 
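  // Illustrative overview of the cases handled below (the code is
  // authoritative):
  //  * Scalar VF: emit one scalar induction value per unrolled part.
  //  * No scalar users: create a single widened vector phi.
  //  * Mixed users: create the vector phi and additionally build per-lane
  //    scalar steps.
  //  * Only scalar users: build scalar steps, plus a splat of the IV when the
  //    tail is folded and the splat feeds the predicate of masked accesses.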
2579 Value *Step = CreateStepValue(ID.getStep()); 2580 if (State.VF.isScalar()) { 2581 Value *ScalarIV = CreateScalarIV(Step); 2582 Type *ScalarTy = IntegerType::get(ScalarIV->getContext(), 2583 Step->getType()->getScalarSizeInBits()); 2584 2585 Instruction::BinaryOps IncOp = ID.getInductionOpcode(); 2586 if (IncOp == Instruction::BinaryOpsEnd) 2587 IncOp = Instruction::Add; 2588 for (unsigned Part = 0; Part < UF; ++Part) { 2589 Value *StartIdx = ConstantInt::get(ScalarTy, Part); 2590 Instruction::BinaryOps MulOp = Instruction::Mul; 2591 if (Step->getType()->isFloatingPointTy()) { 2592 StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType()); 2593 MulOp = Instruction::FMul; 2594 } 2595 2596 Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2597 Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction"); 2598 State.set(Def, EntryPart, Part); 2599 if (Trunc) { 2600 assert(!Step->getType()->isFloatingPointTy() && 2601 "fp inductions shouldn't be truncated"); 2602 addMetadata(EntryPart, Trunc); 2603 } 2604 } 2605 return; 2606 } 2607 2608 // Determine if we want a scalar version of the induction variable. This is 2609 // true if the induction variable itself is not widened, or if it has at 2610 // least one user in the loop that is not widened. 2611 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2612 if (!NeedsScalarIV) { 2613 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2614 return; 2615 } 2616 2617 // Try to create a new independent vector induction variable. If we can't 2618 // create the phi node, we will splat the scalar induction variable in each 2619 // loop iteration. 2620 if (!shouldScalarizeInstruction(EntryVal)) { 2621 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); 2622 Value *ScalarIV = CreateScalarIV(Step); 2623 // Create scalar steps that can be used by instructions we will later 2624 // scalarize. Note that the addition of the scalar steps will not increase 2625 // the number of instructions in the loop in the common case prior to 2626 // InstCombine. We will be trading one vector extract for each scalar step. 2627 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2628 return; 2629 } 2630 2631 // All IV users are scalar instructions, so only emit a scalar IV, not a 2632 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2633 // predicate used by the masked loads/stores. 2634 Value *ScalarIV = CreateScalarIV(Step); 2635 if (!Cost->isScalarEpilogueAllowed()) 2636 CreateSplatIV(ScalarIV, Step); 2637 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); 2638 } 2639 2640 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2641 Instruction *EntryVal, 2642 const InductionDescriptor &ID, 2643 VPValue *Def, 2644 VPTransformState &State) { 2645 IRBuilder<> &Builder = State.Builder; 2646 // We shouldn't have to build scalar steps if we aren't vectorizing. 2647 assert(State.VF.isVector() && "VF should be greater than one"); 2648 // Get the value type and ensure it and the step have the same integer type. 2649 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2650 assert(ScalarIVTy == Step->getType() && 2651 "Val and Step should have the same type"); 2652 2653 // We build scalar steps for both integer and floating-point induction 2654 // variables. Here, we determine the kind of arithmetic we will perform. 
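  // For example (illustrative), with a fixed VF of 4, UF = 2 and integer step
  // S, part P lane L receives ScalarIV + (P * 4 + L) * S; FP inductions build
  // the same expression with the descriptor's FAdd/FSub and FMul.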
2655 Instruction::BinaryOps AddOp; 2656 Instruction::BinaryOps MulOp; 2657 if (ScalarIVTy->isIntegerTy()) { 2658 AddOp = Instruction::Add; 2659 MulOp = Instruction::Mul; 2660 } else { 2661 AddOp = ID.getInductionOpcode(); 2662 MulOp = Instruction::FMul; 2663 } 2664 2665 // Determine the number of scalars we need to generate for each unroll 2666 // iteration. If EntryVal is uniform, we only need to generate the first 2667 // lane. Otherwise, we generate all VF values. 2668 bool IsUniform = 2669 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); 2670 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 2671 // Compute the scalar steps and save the results in State. 2672 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2673 ScalarIVTy->getScalarSizeInBits()); 2674 Type *VecIVTy = nullptr; 2675 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2676 if (!IsUniform && State.VF.isScalable()) { 2677 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2678 UnitStepVec = 2679 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2680 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2681 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2682 } 2683 2684 for (unsigned Part = 0; Part < State.UF; ++Part) { 2685 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2686 2687 if (!IsUniform && State.VF.isScalable()) { 2688 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2689 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2690 if (ScalarIVTy->isFloatingPointTy()) 2691 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2692 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2693 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2694 State.set(Def, Add, Part); 2695 // It's useful to record the lane values too for the known minimum number 2696 // of elements so we do those below. This improves the code quality when 2697 // trying to extract the first element, for example. 2698 } 2699 2700 if (ScalarIVTy->isFloatingPointTy()) 2701 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2702 2703 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2704 Value *StartIdx = Builder.CreateBinOp( 2705 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2706 // The step returned by `createStepForVF` is a runtime-evaluated value 2707 // when VF is scalable. Otherwise, it should be folded into a Constant. 2708 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2709 "Expected StartIdx to be folded to a constant when VF is not " 2710 "scalable"); 2711 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2712 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2713 State.set(Def, Add, VPIteration(Part, Lane)); 2714 } 2715 } 2716 } 2717 2718 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2719 const VPIteration &Instance, 2720 VPTransformState &State) { 2721 Value *ScalarInst = State.get(Def, Instance); 2722 Value *VectorValue = State.get(Def, Instance.Part); 2723 VectorValue = Builder.CreateInsertElement( 2724 VectorValue, ScalarInst, 2725 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2726 State.set(Def, VectorValue, Instance.Part); 2727 } 2728 2729 // Return whether we allow using masked interleave-groups (for dealing with 2730 // strided loads/stores that reside in predicated blocks, or for dealing 2731 // with gaps). 
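// For illustration (a hypothetical access pattern, not taken from the source):
// a group that loads A[3*i] and A[3*i+2] but never A[3*i+1] has a gap at
// member index 1; loading it as a single wide vector then requires a gap mask
// such as <1,0,1, 1,0,1, ...> so that the unused lanes are never accessed.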
2732 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2733 // If an override option has been passed in for interleaved accesses, use it. 2734 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2735 return EnableMaskedInterleavedMemAccesses; 2736 2737 return TTI.enableMaskedInterleavedAccessVectorization(); 2738 } 2739 2740 // Try to vectorize the interleave group that \p Instr belongs to. 2741 // 2742 // E.g. Translate following interleaved load group (factor = 3): 2743 // for (i = 0; i < N; i+=3) { 2744 // R = Pic[i]; // Member of index 0 2745 // G = Pic[i+1]; // Member of index 1 2746 // B = Pic[i+2]; // Member of index 2 2747 // ... // do something to R, G, B 2748 // } 2749 // To: 2750 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2751 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2752 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2753 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2754 // 2755 // Or translate following interleaved store group (factor = 3): 2756 // for (i = 0; i < N; i+=3) { 2757 // ... do something to R, G, B 2758 // Pic[i] = R; // Member of index 0 2759 // Pic[i+1] = G; // Member of index 1 2760 // Pic[i+2] = B; // Member of index 2 2761 // } 2762 // To: 2763 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2764 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2765 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2766 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2767 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2768 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2769 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2770 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2771 VPValue *BlockInMask) { 2772 Instruction *Instr = Group->getInsertPos(); 2773 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2774 2775 // Prepare for the vector type of the interleaved load/store. 2776 Type *ScalarTy = getLoadStoreType(Instr); 2777 unsigned InterleaveFactor = Group->getFactor(); 2778 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2779 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2780 2781 // Prepare for the new pointers. 2782 SmallVector<Value *, 2> AddrParts; 2783 unsigned Index = Group->getIndex(Instr); 2784 2785 // TODO: extend the masked interleaved-group support to reversed access. 2786 assert((!BlockInMask || !Group->isReverse()) && 2787 "Reversed masked interleave-group not supported."); 2788 2789 // If the group is reverse, adjust the index to refer to the last vector lane 2790 // instead of the first. We adjust the index from the first vector lane, 2791 // rather than directly getting the pointer for lane VF - 1, because the 2792 // pointer operand of the interleaved access is supposed to be uniform. For 2793 // uniform instructions, we're only required to generate a value for the 2794 // first vector lane in each unroll iteration. 2795 if (Group->isReverse()) 2796 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2797 2798 for (unsigned Part = 0; Part < UF; Part++) { 2799 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2800 setDebugLocFromInst(AddrPart); 2801 2802 // Notice current instruction could be any index. Need to adjust the address 2803 // to the member of index 0. 2804 // 2805 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2806 // b = A[i]; // Member of index 0 2807 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2808 // 2809 // E.g. A[i+1] = a; // Member of index 1 2810 // A[i] = b; // Member of index 0 2811 // A[i+2] = c; // Member of index 2 (Current instruction) 2812 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2813 2814 bool InBounds = false; 2815 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2816 InBounds = gep->isInBounds(); 2817 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2818 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2819 2820 // Cast to the vector pointer type. 2821 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2822 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2823 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2824 } 2825 2826 setDebugLocFromInst(Instr); 2827 Value *PoisonVec = PoisonValue::get(VecTy); 2828 2829 Value *MaskForGaps = nullptr; 2830 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2831 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2832 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2833 } 2834 2835 // Vectorize the interleaved load group. 2836 if (isa<LoadInst>(Instr)) { 2837 // For each unroll part, create a wide load for the group. 2838 SmallVector<Value *, 2> NewLoads; 2839 for (unsigned Part = 0; Part < UF; Part++) { 2840 Instruction *NewLoad; 2841 if (BlockInMask || MaskForGaps) { 2842 assert(useMaskedInterleavedAccesses(*TTI) && 2843 "masked interleaved groups are not allowed."); 2844 Value *GroupMask = MaskForGaps; 2845 if (BlockInMask) { 2846 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2847 Value *ShuffledMask = Builder.CreateShuffleVector( 2848 BlockInMaskPart, 2849 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2850 "interleaved.mask"); 2851 GroupMask = MaskForGaps 2852 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2853 MaskForGaps) 2854 : ShuffledMask; 2855 } 2856 NewLoad = 2857 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2858 GroupMask, PoisonVec, "wide.masked.vec"); 2859 } 2860 else 2861 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2862 Group->getAlign(), "wide.vec"); 2863 Group->addMetadata(NewLoad); 2864 NewLoads.push_back(NewLoad); 2865 } 2866 2867 // For each member in the group, shuffle out the appropriate data from the 2868 // wide loads. 2869 unsigned J = 0; 2870 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2871 Instruction *Member = Group->getMember(I); 2872 2873 // Skip the gaps in the group. 2874 if (!Member) 2875 continue; 2876 2877 auto StrideMask = 2878 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2879 for (unsigned Part = 0; Part < UF; Part++) { 2880 Value *StridedVec = Builder.CreateShuffleVector( 2881 NewLoads[Part], StrideMask, "strided.vec"); 2882 2883 // If this member has different type, cast the result type. 
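      // E.g. (an illustrative case, not from the source): if the group mixes
      // i32 and float members of the same width, one wide i32 load is emitted
      // and the float member's strided vector is bitcast from <VF x i32> to
      // <VF x float> via createBitOrPointerCast below.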
2884 if (Member->getType() != ScalarTy) { 2885 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2886 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2887 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2888 } 2889 2890 if (Group->isReverse()) 2891 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2892 2893 State.set(VPDefs[J], StridedVec, Part); 2894 } 2895 ++J; 2896 } 2897 return; 2898 } 2899 2900 // The sub vector type for current instruction. 2901 auto *SubVT = VectorType::get(ScalarTy, VF); 2902 2903 // Vectorize the interleaved store group. 2904 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2905 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2906 "masked interleaved groups are not allowed."); 2907 assert((!MaskForGaps || !VF.isScalable()) && 2908 "masking gaps for scalable vectors is not yet supported."); 2909 for (unsigned Part = 0; Part < UF; Part++) { 2910 // Collect the stored vector from each member. 2911 SmallVector<Value *, 4> StoredVecs; 2912 for (unsigned i = 0; i < InterleaveFactor; i++) { 2913 assert((Group->getMember(i) || MaskForGaps) && 2914 "Fail to get a member from an interleaved store group"); 2915 Instruction *Member = Group->getMember(i); 2916 2917 // Skip the gaps in the group. 2918 if (!Member) { 2919 Value *Undef = PoisonValue::get(SubVT); 2920 StoredVecs.push_back(Undef); 2921 continue; 2922 } 2923 2924 Value *StoredVec = State.get(StoredValues[i], Part); 2925 2926 if (Group->isReverse()) 2927 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2928 2929 // If this member has different type, cast it to a unified type. 2930 2931 if (StoredVec->getType() != SubVT) 2932 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2933 2934 StoredVecs.push_back(StoredVec); 2935 } 2936 2937 // Concatenate all vectors into a wide vector. 2938 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2939 2940 // Interleave the elements in the wide vector. 2941 Value *IVec = Builder.CreateShuffleVector( 2942 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2943 "interleaved.vec"); 2944 2945 Instruction *NewStoreInstr; 2946 if (BlockInMask || MaskForGaps) { 2947 Value *GroupMask = MaskForGaps; 2948 if (BlockInMask) { 2949 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2950 Value *ShuffledMask = Builder.CreateShuffleVector( 2951 BlockInMaskPart, 2952 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2953 "interleaved.mask"); 2954 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2955 ShuffledMask, MaskForGaps) 2956 : ShuffledMask; 2957 } 2958 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2959 Group->getAlign(), GroupMask); 2960 } else 2961 NewStoreInstr = 2962 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2963 2964 Group->addMetadata(NewStoreInstr); 2965 } 2966 } 2967 2968 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2969 VPReplicateRecipe *RepRecipe, 2970 const VPIteration &Instance, 2971 bool IfPredicateInstr, 2972 VPTransformState &State) { 2973 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2974 2975 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2976 // the first lane and part. 2977 if (isa<NoAliasScopeDeclInst>(Instr)) 2978 if (!Instance.isFirstIteration()) 2979 return; 2980 2981 setDebugLocFromInst(Instr); 2982 2983 // Does this instruction return a value ? 
2984 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2985 2986 Instruction *Cloned = Instr->clone(); 2987 if (!IsVoidRetTy) 2988 Cloned->setName(Instr->getName() + ".cloned"); 2989 2990 // If the scalarized instruction contributes to the address computation of a 2991 // widen masked load/store which was in a basic block that needed predication 2992 // and is not predicated after vectorization, we can't propagate 2993 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2994 // instruction could feed a poison value to the base address of the widen 2995 // load/store. 2996 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2997 Cloned->dropPoisonGeneratingFlags(); 2998 2999 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3000 Builder.GetInsertPoint()); 3001 // Replace the operands of the cloned instructions with their scalar 3002 // equivalents in the new loop. 3003 for (auto &I : enumerate(RepRecipe->operands())) { 3004 auto InputInstance = Instance; 3005 VPValue *Operand = I.value(); 3006 if (State.Plan->isUniformAfterVectorization(Operand)) 3007 InputInstance.Lane = VPLane::getFirstLane(); 3008 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 3009 } 3010 addNewMetadata(Cloned, Instr); 3011 3012 // Place the cloned scalar in the new loop. 3013 Builder.Insert(Cloned); 3014 3015 State.set(RepRecipe, Cloned, Instance); 3016 3017 // If we just cloned a new assumption, add it the assumption cache. 3018 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3019 AC->registerAssumption(II); 3020 3021 // End if-block. 3022 if (IfPredicateInstr) 3023 PredicatedInstructions.push_back(Cloned); 3024 } 3025 3026 void InnerLoopVectorizer::createHeaderBranch(Loop *L) { 3027 BasicBlock *Header = L->getHeader(); 3028 assert(!L->getLoopLatch() && "loop should not have a latch at this point"); 3029 3030 IRBuilder<> B(Header->getTerminator()); 3031 Instruction *OldInst = 3032 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 3033 setDebugLocFromInst(OldInst, &B); 3034 3035 // Connect the header to the exit and header blocks and replace the old 3036 // terminator. 3037 B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); 3038 3039 // Now we have two terminators. Remove the old one from the block. 3040 Header->getTerminator()->eraseFromParent(); 3041 } 3042 3043 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3044 if (TripCount) 3045 return TripCount; 3046 3047 assert(L && "Create Trip Count for null loop."); 3048 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3049 // Find the loop boundaries. 3050 ScalarEvolution *SE = PSE.getSE(); 3051 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3052 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3053 "Invalid loop count"); 3054 3055 Type *IdxTy = Legal->getWidestInductionType(); 3056 assert(IdxTy && "No type for induction"); 3057 3058 // The exit count might have the type of i64 while the phi is i32. This can 3059 // happen if we have an induction variable that is sign extended before the 3060 // compare. The only way that we get a backedge taken count is that the 3061 // induction variable was signed and as such will not overflow. In such a case 3062 // truncation is legal. 
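  // For instance (an illustrative case only): an i32 induction compared as
  // (i64)i < %n yields an i64 backedge-taken count, but because the i32 IV is
  // known not to wrap, the count fits in i32 and can safely be truncated to
  // IdxTy.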
3063 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3064 IdxTy->getPrimitiveSizeInBits()) 3065 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3066 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3067 3068 // Get the total trip count from the count by adding 1. 3069 const SCEV *ExitCount = SE->getAddExpr( 3070 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3071 3072 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3073 3074 // Expand the trip count and place the new instructions in the preheader. 3075 // Notice that the pre-header does not change, only the loop body. 3076 SCEVExpander Exp(*SE, DL, "induction"); 3077 3078 // Count holds the overall loop count (N). 3079 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3080 L->getLoopPreheader()->getTerminator()); 3081 3082 if (TripCount->getType()->isPointerTy()) 3083 TripCount = 3084 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3085 L->getLoopPreheader()->getTerminator()); 3086 3087 return TripCount; 3088 } 3089 3090 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3091 if (VectorTripCount) 3092 return VectorTripCount; 3093 3094 Value *TC = getOrCreateTripCount(L); 3095 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3096 3097 Type *Ty = TC->getType(); 3098 // This is where we can make the step a runtime constant. 3099 Value *Step = createStepForVF(Builder, Ty, VF, UF); 3100 3101 // If the tail is to be folded by masking, round the number of iterations N 3102 // up to a multiple of Step instead of rounding down. This is done by first 3103 // adding Step-1 and then rounding down. Note that it's ok if this addition 3104 // overflows: the vector induction variable will eventually wrap to zero given 3105 // that it starts at zero and its Step is a power of two; the loop will then 3106 // exit, with the last early-exit vector comparison also producing all-true. 3107 if (Cost->foldTailByMasking()) { 3108 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3109 "VF*UF must be a power of 2 when folding tail by masking"); 3110 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 3111 TC = Builder.CreateAdd( 3112 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 3113 } 3114 3115 // Now we need to generate the expression for the part of the loop that the 3116 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3117 // iterations are not required for correctness, or N - Step, otherwise. Step 3118 // is equal to the vectorization factor (number of SIMD elements) times the 3119 // unroll factor (number of SIMD instructions). 3120 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3121 3122 // There are cases where we *must* run at least one iteration in the remainder 3123 // loop. See the cost model for when this can happen. If the step evenly 3124 // divides the trip count, we set the remainder to be equal to the step. If 3125 // the step does not evenly divide the trip count, no adjustment is necessary 3126 // since there will already be scalar iterations. Note that the minimum 3127 // iterations check ensures that N >= Step. 
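  // A worked example (numbers are illustrative): with N = 20 and
  // Step = VF * UF = 8, R = 20 urem 8 = 4 and n.vec = 16, leaving 4 scalar
  // iterations. If a scalar epilogue is required and N = 16, R would be 0, so
  // it is bumped to Step below, giving n.vec = 8 and 8 remainder iterations.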
3128 if (Cost->requiresScalarEpilogue(VF)) { 3129 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3130 R = Builder.CreateSelect(IsZero, Step, R); 3131 } 3132 3133 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3134 3135 return VectorTripCount; 3136 } 3137 3138 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3139 const DataLayout &DL) { 3140 // Verify that V is a vector type with same number of elements as DstVTy. 3141 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3142 unsigned VF = DstFVTy->getNumElements(); 3143 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3144 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3145 Type *SrcElemTy = SrcVecTy->getElementType(); 3146 Type *DstElemTy = DstFVTy->getElementType(); 3147 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3148 "Vector elements must have same size"); 3149 3150 // Do a direct cast if element types are castable. 3151 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3152 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3153 } 3154 // V cannot be directly casted to desired vector type. 3155 // May happen when V is a floating point vector but DstVTy is a vector of 3156 // pointers or vice-versa. Handle this using a two-step bitcast using an 3157 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3158 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3159 "Only one type should be a pointer type"); 3160 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3161 "Only one type should be a floating point type"); 3162 Type *IntTy = 3163 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3164 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3165 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3166 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3167 } 3168 3169 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3170 BasicBlock *Bypass) { 3171 Value *Count = getOrCreateTripCount(L); 3172 // Reuse existing vector loop preheader for TC checks. 3173 // Note that new preheader block is generated for vector loop. 3174 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3175 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3176 3177 // Generate code to check if the loop's trip count is less than VF * UF, or 3178 // equal to it in case a scalar epilogue is required; this implies that the 3179 // vector trip count is zero. This check also covers the case where adding one 3180 // to the backedge-taken count overflowed leading to an incorrect trip count 3181 // of zero. In this case we will also jump to the scalar loop. 3182 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 3183 : ICmpInst::ICMP_ULT; 3184 3185 // If tail is to be folded, vector loop takes care of all iterations. 3186 Value *CheckMinIters = Builder.getFalse(); 3187 if (!Cost->foldTailByMasking()) { 3188 Value *Step = createStepForVF(Builder, Count->getType(), VF, UF); 3189 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3190 } 3191 // Create new preheader for vector loop. 
3192 LoopVectorPreHeader = 3193 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3194 "vector.ph"); 3195 3196 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3197 DT->getNode(Bypass)->getIDom()) && 3198 "TC check is expected to dominate Bypass"); 3199 3200 // Update dominator for Bypass & LoopExit (if needed). 3201 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3202 if (!Cost->requiresScalarEpilogue(VF)) 3203 // If there is an epilogue which must run, there's no edge from the 3204 // middle block to exit blocks and thus no need to update the immediate 3205 // dominator of the exit blocks. 3206 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3207 3208 ReplaceInstWithInst( 3209 TCCheckBlock->getTerminator(), 3210 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3211 LoopBypassBlocks.push_back(TCCheckBlock); 3212 } 3213 3214 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3215 3216 BasicBlock *const SCEVCheckBlock = 3217 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3218 if (!SCEVCheckBlock) 3219 return nullptr; 3220 3221 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3222 (OptForSizeBasedOnProfile && 3223 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3224 "Cannot SCEV check stride or overflow when optimizing for size"); 3225 3226 3227 // Update dominator only if this is first RT check. 3228 if (LoopBypassBlocks.empty()) { 3229 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3230 if (!Cost->requiresScalarEpilogue(VF)) 3231 // If there is an epilogue which must run, there's no edge from the 3232 // middle block to exit blocks and thus no need to update the immediate 3233 // dominator of the exit blocks. 3234 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3235 } 3236 3237 LoopBypassBlocks.push_back(SCEVCheckBlock); 3238 AddedSafetyChecks = true; 3239 return SCEVCheckBlock; 3240 } 3241 3242 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3243 BasicBlock *Bypass) { 3244 // VPlan-native path does not do any analysis for runtime checks currently. 3245 if (EnableVPlanNativePath) 3246 return nullptr; 3247 3248 BasicBlock *const MemCheckBlock = 3249 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3250 3251 // Check if we generated code that checks in runtime if arrays overlap. We put 3252 // the checks into a separate block to make the more common case of few 3253 // elements faster. 3254 if (!MemCheckBlock) 3255 return nullptr; 3256 3257 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3258 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3259 "Cannot emit memory checks when optimizing for size, unless forced " 3260 "to vectorize."); 3261 ORE->emit([&]() { 3262 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3263 L->getStartLoc(), L->getHeader()) 3264 << "Code-size may be reduced by not forcing " 3265 "vectorization, or by source-code modifications " 3266 "eliminating the need for runtime checks " 3267 "(e.g., adding 'restrict')."; 3268 }); 3269 } 3270 3271 LoopBypassBlocks.push_back(MemCheckBlock); 3272 3273 AddedSafetyChecks = true; 3274 3275 // We currently don't use LoopVersioning for the actual loop cloning but we 3276 // still use it to add the noalias metadata. 
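  // (Illustrative note.) Once the runtime checks prove, e.g., that two pointer
  // groups A and B do not overlap, the memory instructions in the vector loop
  // can be annotated with !alias.scope / !noalias metadata, letting later
  // passes treat the two groups as independent.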
3277 LVer = std::make_unique<LoopVersioning>( 3278 *Legal->getLAI(), 3279 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3280 DT, PSE.getSE()); 3281 LVer->prepareNoAliasMetadata(); 3282 return MemCheckBlock; 3283 } 3284 3285 Value *InnerLoopVectorizer::emitTransformedIndex( 3286 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3287 const InductionDescriptor &ID, BasicBlock *VectorHeader) const { 3288 3289 SCEVExpander Exp(*SE, DL, "induction"); 3290 auto Step = ID.getStep(); 3291 auto StartValue = ID.getStartValue(); 3292 assert(Index->getType()->getScalarType() == Step->getType() && 3293 "Index scalar type does not match StepValue type"); 3294 3295 // Note: the IR at this point is broken. We cannot use SE to create any new 3296 // SCEV and then expand it, hoping that SCEV's simplification will give us 3297 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3298 // lead to various SCEV crashes. So all we can do is to use builder and rely 3299 // on InstCombine for future simplifications. Here we handle some trivial 3300 // cases only. 3301 auto CreateAdd = [&B](Value *X, Value *Y) { 3302 assert(X->getType() == Y->getType() && "Types don't match!"); 3303 if (auto *CX = dyn_cast<ConstantInt>(X)) 3304 if (CX->isZero()) 3305 return Y; 3306 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3307 if (CY->isZero()) 3308 return X; 3309 return B.CreateAdd(X, Y); 3310 }; 3311 3312 // We allow X to be a vector type, in which case Y will potentially be 3313 // splatted into a vector with the same element count. 3314 auto CreateMul = [&B](Value *X, Value *Y) { 3315 assert(X->getType()->getScalarType() == Y->getType() && 3316 "Types don't match!"); 3317 if (auto *CX = dyn_cast<ConstantInt>(X)) 3318 if (CX->isOne()) 3319 return Y; 3320 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3321 if (CY->isOne()) 3322 return X; 3323 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3324 if (XVTy && !isa<VectorType>(Y->getType())) 3325 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3326 return B.CreateMul(X, Y); 3327 }; 3328 3329 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3330 // loop, choose the end of the vector loop header (=VectorHeader), because 3331 // the DomTree is not kept up-to-date for additional blocks generated in the 3332 // vector loop. By using the header as insertion point, we guarantee that the 3333 // expanded instructions dominate all their uses. 
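  // In shorthand, the transform handled by the switch further below is
  // (illustrative): Start + Index * Step for integer inductions,
  // gep ElementType, Start, Index * Step for pointer inductions, and
  // Start fadd/fsub (Index * Step) for floating-point inductions.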
3334 auto GetInsertPoint = [this, &B, VectorHeader]() { 3335 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3336 if (InsertBB != LoopVectorBody && 3337 LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) 3338 return VectorHeader->getTerminator(); 3339 return &*B.GetInsertPoint(); 3340 }; 3341 3342 switch (ID.getKind()) { 3343 case InductionDescriptor::IK_IntInduction: { 3344 assert(!isa<VectorType>(Index->getType()) && 3345 "Vector indices not supported for integer inductions yet"); 3346 assert(Index->getType() == StartValue->getType() && 3347 "Index type does not match StartValue type"); 3348 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3349 return B.CreateSub(StartValue, Index); 3350 auto *Offset = CreateMul( 3351 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3352 return CreateAdd(StartValue, Offset); 3353 } 3354 case InductionDescriptor::IK_PtrInduction: { 3355 assert(isa<SCEVConstant>(Step) && 3356 "Expected constant step for pointer induction"); 3357 return B.CreateGEP( 3358 ID.getElementType(), StartValue, 3359 CreateMul(Index, 3360 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3361 GetInsertPoint()))); 3362 } 3363 case InductionDescriptor::IK_FpInduction: { 3364 assert(!isa<VectorType>(Index->getType()) && 3365 "Vector indices not supported for FP inductions yet"); 3366 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3367 auto InductionBinOp = ID.getInductionBinOp(); 3368 assert(InductionBinOp && 3369 (InductionBinOp->getOpcode() == Instruction::FAdd || 3370 InductionBinOp->getOpcode() == Instruction::FSub) && 3371 "Original bin op should be defined for FP induction"); 3372 3373 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3374 Value *MulExp = B.CreateFMul(StepValue, Index); 3375 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3376 "induction"); 3377 } 3378 case InductionDescriptor::IK_NoInduction: 3379 return nullptr; 3380 } 3381 llvm_unreachable("invalid enum"); 3382 } 3383 3384 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3385 LoopScalarBody = OrigLoop->getHeader(); 3386 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3387 assert(LoopVectorPreHeader && "Invalid loop structure"); 3388 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3389 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3390 "multiple exit loop without required epilogue?"); 3391 3392 LoopMiddleBlock = 3393 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3394 LI, nullptr, Twine(Prefix) + "middle.block"); 3395 LoopScalarPreHeader = 3396 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3397 nullptr, Twine(Prefix) + "scalar.ph"); 3398 3399 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3400 3401 // Set up the middle block terminator. Two cases: 3402 // 1) If we know that we must execute the scalar epilogue, emit an 3403 // unconditional branch. 3404 // 2) Otherwise, we must have a single unique exit block (due to how we 3405 // implement the multiple exit case). In this case, set up a conditonal 3406 // branch from the middle block to the loop scalar preheader, and the 3407 // exit block. completeLoopSkeleton will update the condition to use an 3408 // iteration check, if required to decide whether to execute the remainder. 3409 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? 
3410 BranchInst::Create(LoopScalarPreHeader) : 3411 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3412 Builder.getTrue()); 3413 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3414 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3415 3416 // We intentionally don't let SplitBlock to update LoopInfo since 3417 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3418 // LoopVectorBody is explicitly added to the correct place few lines later. 3419 LoopVectorBody = 3420 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3421 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3422 3423 // Update dominator for loop exit. 3424 if (!Cost->requiresScalarEpilogue(VF)) 3425 // If there is an epilogue which must run, there's no edge from the 3426 // middle block to exit blocks and thus no need to update the immediate 3427 // dominator of the exit blocks. 3428 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3429 3430 // Create and register the new vector loop. 3431 Loop *Lp = LI->AllocateLoop(); 3432 Loop *ParentLoop = OrigLoop->getParentLoop(); 3433 3434 // Insert the new loop into the loop nest and register the new basic blocks 3435 // before calling any utilities such as SCEV that require valid LoopInfo. 3436 if (ParentLoop) { 3437 ParentLoop->addChildLoop(Lp); 3438 } else { 3439 LI->addTopLevelLoop(Lp); 3440 } 3441 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3442 return Lp; 3443 } 3444 3445 void InnerLoopVectorizer::createInductionResumeValues( 3446 Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) { 3447 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3448 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3449 "Inconsistent information about additional bypass."); 3450 3451 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3452 assert(VectorTripCount && L && "Expected valid arguments"); 3453 // We are going to resume the execution of the scalar loop. 3454 // Go over all of the induction variables that we found and fix the 3455 // PHIs that are left in the scalar version of the loop. 3456 // The starting values of PHI nodes depend on the counter of the last 3457 // iteration in the vectorized loop. 3458 // If we come from a bypass edge then we need to start from the original 3459 // start value. 3460 Instruction *OldInduction = Legal->getPrimaryInduction(); 3461 for (auto &InductionEntry : Legal->getInductionVars()) { 3462 PHINode *OrigPhi = InductionEntry.first; 3463 InductionDescriptor II = InductionEntry.second; 3464 3465 // Create phi nodes to merge from the backedge-taken check block. 3466 PHINode *BCResumeVal = 3467 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3468 LoopScalarPreHeader->getTerminator()); 3469 // Copy original phi DL over to the new one. 3470 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3471 Value *&EndValue = IVEndValues[OrigPhi]; 3472 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3473 if (OrigPhi == OldInduction) { 3474 // We know what the end value is. 3475 EndValue = VectorTripCount; 3476 } else { 3477 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3478 3479 // Fast-math-flags propagate from the original induction instruction. 
3480 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3481 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3482 3483 Type *StepType = II.getStep()->getType(); 3484 Instruction::CastOps CastOp = 3485 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3486 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3487 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3488 EndValue = 3489 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); 3490 EndValue->setName("ind.end"); 3491 3492 // Compute the end value for the additional bypass (if applicable). 3493 if (AdditionalBypass.first) { 3494 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3495 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3496 StepType, true); 3497 CRD = 3498 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3499 EndValueFromAdditionalBypass = 3500 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); 3501 EndValueFromAdditionalBypass->setName("ind.end"); 3502 } 3503 } 3504 // The new PHI merges the original incoming value, in case of a bypass, 3505 // or the value at the end of the vectorized loop. 3506 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3507 3508 // Fix the scalar body counter (PHI node). 3509 // The old induction's phi node in the scalar body needs the truncated 3510 // value. 3511 for (BasicBlock *BB : LoopBypassBlocks) 3512 BCResumeVal->addIncoming(II.getStartValue(), BB); 3513 3514 if (AdditionalBypass.first) 3515 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3516 EndValueFromAdditionalBypass); 3517 3518 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3519 } 3520 } 3521 3522 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3523 MDNode *OrigLoopID) { 3524 assert(L && "Expected valid loop."); 3525 3526 // The trip counts should be cached by now. 3527 Value *Count = getOrCreateTripCount(L); 3528 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3529 3530 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3531 3532 // Add a check in the middle block to see if we have completed 3533 // all of the iterations in the first vector loop. Three cases: 3534 // 1) If we require a scalar epilogue, there is no conditional branch as 3535 // we unconditionally branch to the scalar preheader. Do nothing. 3536 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3537 // Thus if tail is to be folded, we know we don't need to run the 3538 // remainder and we can use the previous value for the condition (true). 3539 // 3) Otherwise, construct a runtime check. 3540 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3541 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3542 Count, VectorTripCount, "cmp.n", 3543 LoopMiddleBlock->getTerminator()); 3544 3545 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3546 // of the corresponding compare because they may have ended up with 3547 // different line numbers and we want to avoid awkward line stepping while 3548 // debugging. Eg. if the compare has got a line number inside the loop. 3549 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3550 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3551 } 3552 3553 // Get ready to start creating new instructions into the vectorized body. 
3554 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3555 "Inconsistent vector loop preheader"); 3556 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3557 3558 #ifdef EXPENSIVE_CHECKS 3559 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3560 LI->verify(*DT); 3561 #endif 3562 3563 return LoopVectorPreHeader; 3564 } 3565 3566 std::pair<BasicBlock *, Value *> 3567 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3568 /* 3569 In this function we generate a new loop. The new loop will contain 3570 the vectorized instructions while the old loop will continue to run the 3571 scalar remainder. 3572 3573 [ ] <-- loop iteration number check. 3574 / | 3575 / v 3576 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3577 | / | 3578 | / v 3579 || [ ] <-- vector pre header. 3580 |/ | 3581 | v 3582 | [ ] \ 3583 | [ ]_| <-- vector loop. 3584 | | 3585 | v 3586 \ -[ ] <--- middle-block. 3587 \/ | 3588 /\ v 3589 | ->[ ] <--- new preheader. 3590 | | 3591 (opt) v <-- edge from middle to exit iff epilogue is not required. 3592 | [ ] \ 3593 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3594 \ | 3595 \ v 3596 >[ ] <-- exit block(s). 3597 ... 3598 */ 3599 3600 // Get the metadata of the original loop before it gets modified. 3601 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3602 3603 // Workaround! Compute the trip count of the original loop and cache it 3604 // before we start modifying the CFG. This code has a systemic problem 3605 // wherein it tries to run analysis over partially constructed IR; this is 3606 // wrong, and not simply for SCEV. The trip count of the original loop 3607 // simply happens to be prone to hitting this in practice. In theory, we 3608 // can hit the same issue for any SCEV, or ValueTracking query done during 3609 // mutation. See PR49900. 3610 getOrCreateTripCount(OrigLoop); 3611 3612 // Create an empty vector loop, and prepare basic blocks for the runtime 3613 // checks. 3614 Loop *Lp = createVectorLoopSkeleton(""); 3615 3616 // Now, compare the new count to zero. If it is zero skip the vector loop and 3617 // jump to the scalar loop. This check also covers the case where the 3618 // backedge-taken count is uint##_max: adding one to it will overflow leading 3619 // to an incorrect trip count of zero. In this (rare) case we will also jump 3620 // to the scalar loop. 3621 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3622 3623 // Generate the code to check any assumptions that we've made for SCEV 3624 // expressions. 3625 emitSCEVChecks(Lp, LoopScalarPreHeader); 3626 3627 // Generate the code that checks in runtime if arrays overlap. We put the 3628 // checks into a separate block to make the more common case of few elements 3629 // faster. 3630 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3631 3632 createHeaderBranch(Lp); 3633 3634 // Emit phis for the new starting index of the scalar loop. 3635 createInductionResumeValues(Lp); 3636 3637 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 3638 } 3639 3640 // Fix up external users of the induction variable. At this point, we are 3641 // in LCSSA form, with all external PHIs that use the IV having one input value, 3642 // coming from the remainder loop. We need those PHIs to also have a correct 3643 // value for the IV when arriving directly from the middle block. 
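// For example (illustrative): given
//   for (i = 0; i < n; ++i) { ... }
//   use(i);   // reads the LCSSA phi fed by the scalar loop's latch value
// the exit-block phi initially only has the value coming from the scalar
// loop; when control reaches the exit straight from the middle block, the
// phi must also carry EndValue (the IV value after the vector iterations).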
3644 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3645 const InductionDescriptor &II, 3646 Value *CountRoundDown, Value *EndValue, 3647 BasicBlock *MiddleBlock) { 3648 // There are two kinds of external IV usages - those that use the value 3649 // computed in the last iteration (the PHI) and those that use the penultimate 3650 // value (the value that feeds into the phi from the loop latch). 3651 // We allow both, but they, obviously, have different values. 3652 3653 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3654 3655 DenseMap<Value *, Value *> MissingVals; 3656 3657 // An external user of the last iteration's value should see the value that 3658 // the remainder loop uses to initialize its own IV. 3659 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3660 for (User *U : PostInc->users()) { 3661 Instruction *UI = cast<Instruction>(U); 3662 if (!OrigLoop->contains(UI)) { 3663 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3664 MissingVals[UI] = EndValue; 3665 } 3666 } 3667 3668 // An external user of the penultimate value need to see EndValue - Step. 3669 // The simplest way to get this is to recompute it from the constituent SCEVs, 3670 // that is Start + (Step * (CRD - 1)). 3671 for (User *U : OrigPhi->users()) { 3672 auto *UI = cast<Instruction>(U); 3673 if (!OrigLoop->contains(UI)) { 3674 const DataLayout &DL = 3675 OrigLoop->getHeader()->getModule()->getDataLayout(); 3676 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3677 3678 IRBuilder<> B(MiddleBlock->getTerminator()); 3679 3680 // Fast-math-flags propagate from the original induction instruction. 3681 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3682 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3683 3684 Value *CountMinusOne = B.CreateSub( 3685 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3686 Value *CMO = 3687 !II.getStep()->getType()->isIntegerTy() 3688 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3689 II.getStep()->getType()) 3690 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3691 CMO->setName("cast.cmo"); 3692 Value *Escape = 3693 emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); 3694 Escape->setName("ind.escape"); 3695 MissingVals[UI] = Escape; 3696 } 3697 } 3698 3699 for (auto &I : MissingVals) { 3700 PHINode *PHI = cast<PHINode>(I.first); 3701 // One corner case we have to handle is two IVs "chasing" each-other, 3702 // that is %IV2 = phi [...], [ %IV1, %latch ] 3703 // In this case, if IV1 has an external use, we need to avoid adding both 3704 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3705 // don't already have an incoming value for the middle block. 
3706 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3707 PHI->addIncoming(I.second, MiddleBlock); 3708 } 3709 } 3710 3711 namespace { 3712 3713 struct CSEDenseMapInfo { 3714 static bool canHandle(const Instruction *I) { 3715 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3716 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3717 } 3718 3719 static inline Instruction *getEmptyKey() { 3720 return DenseMapInfo<Instruction *>::getEmptyKey(); 3721 } 3722 3723 static inline Instruction *getTombstoneKey() { 3724 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3725 } 3726 3727 static unsigned getHashValue(const Instruction *I) { 3728 assert(canHandle(I) && "Unknown instruction!"); 3729 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3730 I->value_op_end())); 3731 } 3732 3733 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3734 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3735 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3736 return LHS == RHS; 3737 return LHS->isIdenticalTo(RHS); 3738 } 3739 }; 3740 3741 } // end anonymous namespace 3742 3743 ///Perform cse of induction variable instructions. 3744 static void cse(BasicBlock *BB) { 3745 // Perform simple cse. 3746 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3747 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3748 if (!CSEDenseMapInfo::canHandle(&In)) 3749 continue; 3750 3751 // Check if we can replace this instruction with any of the 3752 // visited instructions. 3753 if (Instruction *V = CSEMap.lookup(&In)) { 3754 In.replaceAllUsesWith(V); 3755 In.eraseFromParent(); 3756 continue; 3757 } 3758 3759 CSEMap[&In] = &In; 3760 } 3761 } 3762 3763 InstructionCost 3764 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3765 bool &NeedToScalarize) const { 3766 Function *F = CI->getCalledFunction(); 3767 Type *ScalarRetTy = CI->getType(); 3768 SmallVector<Type *, 4> Tys, ScalarTys; 3769 for (auto &ArgOp : CI->args()) 3770 ScalarTys.push_back(ArgOp->getType()); 3771 3772 // Estimate cost of scalarized vector call. The source operands are assumed 3773 // to be vectors, so we need to extract individual elements from there, 3774 // execute VF scalar calls, and then gather the result into the vector return 3775 // value. 3776 InstructionCost ScalarCallCost = 3777 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3778 if (VF.isScalar()) 3779 return ScalarCallCost; 3780 3781 // Compute corresponding vector type for return value and arguments. 3782 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3783 for (Type *ScalarTy : ScalarTys) 3784 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3785 3786 // Compute costs of unpacking argument values for the scalar calls and 3787 // packing the return values to a vector. 3788 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3789 3790 InstructionCost Cost = 3791 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3792 3793 // If we can't emit a vector call for this function, then the currently found 3794 // cost is the cost we need to return. 3795 NeedToScalarize = true; 3796 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3797 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3798 3799 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3800 return Cost; 3801 3802 // If the corresponding vector cost is cheaper, return its cost. 
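  // A hypothetical example of the comparison made below: with VF = 4, a scalar
  // call cost of 10 and a scalarization overhead of 8, the scalarized cost is
  // 4 * 10 + 8 = 48; if a vector variant of the callee costs 20, it wins and
  // NeedToScalarize is cleared.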
3803 InstructionCost VectorCallCost = 3804 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3805 if (VectorCallCost < Cost) { 3806 NeedToScalarize = false; 3807 Cost = VectorCallCost; 3808 } 3809 return Cost; 3810 } 3811 3812 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3813 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3814 return Elt; 3815 return VectorType::get(Elt, VF); 3816 } 3817 3818 InstructionCost 3819 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3820 ElementCount VF) const { 3821 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3822 assert(ID && "Expected intrinsic call!"); 3823 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3824 FastMathFlags FMF; 3825 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3826 FMF = FPMO->getFastMathFlags(); 3827 3828 SmallVector<const Value *> Arguments(CI->args()); 3829 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3830 SmallVector<Type *> ParamTys; 3831 std::transform(FTy->param_begin(), FTy->param_end(), 3832 std::back_inserter(ParamTys), 3833 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3834 3835 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3836 dyn_cast<IntrinsicInst>(CI)); 3837 return TTI.getIntrinsicInstrCost(CostAttrs, 3838 TargetTransformInfo::TCK_RecipThroughput); 3839 } 3840 3841 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3842 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3843 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3844 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3845 } 3846 3847 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3848 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3849 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3850 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3851 } 3852 3853 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3854 // For every instruction `I` in MinBWs, truncate the operands, create a 3855 // truncated version of `I` and reextend its result. InstCombine runs 3856 // later and will remove any ext/trunc pairs. 3857 SmallPtrSet<Value *, 4> Erased; 3858 for (const auto &KV : Cost->getMinimalBitwidths()) { 3859 // If the value wasn't vectorized, we must maintain the original scalar 3860 // type. The absence of the value from State indicates that it 3861 // wasn't vectorized. 3862 // FIXME: Should not rely on getVPValue at this point. 3863 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3864 if (!State.hasAnyVectorValue(Def)) 3865 continue; 3866 for (unsigned Part = 0; Part < UF; ++Part) { 3867 Value *I = State.get(Def, Part); 3868 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3869 continue; 3870 Type *OriginalTy = I->getType(); 3871 Type *ScalarTruncatedTy = 3872 IntegerType::get(OriginalTy->getContext(), KV.second); 3873 auto *TruncatedTy = VectorType::get( 3874 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3875 if (TruncatedTy == OriginalTy) 3876 continue; 3877 3878 IRBuilder<> B(cast<Instruction>(I)); 3879 auto ShrinkOperand = [&](Value *V) -> Value * { 3880 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3881 if (ZI->getSrcTy() == TruncatedTy) 3882 return ZI->getOperand(0); 3883 return B.CreateZExtOrTrunc(V, TruncatedTy); 3884 }; 3885 3886 // The actual instruction modification depends on the instruction type, 3887 // unfortunately. 
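      // E.g. (an illustrative case): if MinBWs records that an i32 add only
      // needs 8 bits, its <4 x i32> operands are shrunk to <4 x i8> below, the
      // add is recreated at the narrow type, and the result is zero-extended
      // back to <4 x i32> at the end of this loop body.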
3888 Value *NewI = nullptr; 3889 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3890 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3891 ShrinkOperand(BO->getOperand(1))); 3892 3893 // Any wrapping introduced by shrinking this operation shouldn't be 3894 // considered undefined behavior. So, we can't unconditionally copy 3895 // arithmetic wrapping flags to NewI. 3896 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3897 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3898 NewI = 3899 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3900 ShrinkOperand(CI->getOperand(1))); 3901 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3902 NewI = B.CreateSelect(SI->getCondition(), 3903 ShrinkOperand(SI->getTrueValue()), 3904 ShrinkOperand(SI->getFalseValue())); 3905 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3906 switch (CI->getOpcode()) { 3907 default: 3908 llvm_unreachable("Unhandled cast!"); 3909 case Instruction::Trunc: 3910 NewI = ShrinkOperand(CI->getOperand(0)); 3911 break; 3912 case Instruction::SExt: 3913 NewI = B.CreateSExtOrTrunc( 3914 CI->getOperand(0), 3915 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3916 break; 3917 case Instruction::ZExt: 3918 NewI = B.CreateZExtOrTrunc( 3919 CI->getOperand(0), 3920 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3921 break; 3922 } 3923 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3924 auto Elements0 = 3925 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3926 auto *O0 = B.CreateZExtOrTrunc( 3927 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3928 auto Elements1 = 3929 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3930 auto *O1 = B.CreateZExtOrTrunc( 3931 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3932 3933 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3934 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3935 // Don't do anything with the operands, just extend the result. 3936 continue; 3937 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3938 auto Elements = 3939 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3940 auto *O0 = B.CreateZExtOrTrunc( 3941 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3942 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3943 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3944 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3945 auto Elements = 3946 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3947 auto *O0 = B.CreateZExtOrTrunc( 3948 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3949 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3950 } else { 3951 // If we don't know what to do, be conservative and don't do anything. 3952 continue; 3953 } 3954 3955 // Lastly, extend the result. 3956 NewI->takeName(cast<Instruction>(I)); 3957 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3958 I->replaceAllUsesWith(Res); 3959 cast<Instruction>(I)->eraseFromParent(); 3960 Erased.insert(I); 3961 State.reset(Def, Res, Part); 3962 } 3963 } 3964 3965 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3966 for (const auto &KV : Cost->getMinimalBitwidths()) { 3967 // If the value wasn't vectorized, we must maintain the original scalar 3968 // type. The absence of the value from State indicates that it 3969 // wasn't vectorized. 3970 // FIXME: Should not rely on getVPValue at this point. 
3971 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3972 if (!State.hasAnyVectorValue(Def)) 3973 continue; 3974 for (unsigned Part = 0; Part < UF; ++Part) { 3975 Value *I = State.get(Def, Part); 3976 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3977 if (Inst && Inst->use_empty()) { 3978 Value *NewI = Inst->getOperand(0); 3979 Inst->eraseFromParent(); 3980 State.reset(Def, NewI, Part); 3981 } 3982 } 3983 } 3984 } 3985 3986 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3987 // Insert truncates and extends for any truncated instructions as hints to 3988 // InstCombine. 3989 if (VF.isVector()) 3990 truncateToMinimalBitwidths(State); 3991 3992 // Fix widened non-induction PHIs by setting up the PHI operands. 3993 if (OrigPHIsToFix.size()) { 3994 assert(EnableVPlanNativePath && 3995 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3996 fixNonInductionPHIs(State); 3997 } 3998 3999 // At this point every instruction in the original loop is widened to a 4000 // vector form. Now we need to fix the recurrences in the loop. These PHI 4001 // nodes are currently empty because we did not want to introduce cycles. 4002 // This is the second stage of vectorizing recurrences. 4003 fixCrossIterationPHIs(State); 4004 4005 // Forget the original basic block. 4006 PSE.getSE()->forgetLoop(OrigLoop); 4007 4008 // If we inserted an edge from the middle block to the unique exit block, 4009 // update uses outside the loop (phis) to account for the newly inserted 4010 // edge. 4011 if (!Cost->requiresScalarEpilogue(VF)) { 4012 // Fix-up external users of the induction variables. 4013 for (auto &Entry : Legal->getInductionVars()) 4014 fixupIVUsers(Entry.first, Entry.second, 4015 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4016 IVEndValues[Entry.first], LoopMiddleBlock); 4017 4018 fixLCSSAPHIs(State); 4019 } 4020 4021 for (Instruction *PI : PredicatedInstructions) 4022 sinkScalarOperands(&*PI); 4023 4024 // Remove redundant induction instructions. 4025 cse(LoopVectorBody); 4026 4027 // Set/update profile weights for the vector and remainder loops as original 4028 // loop iterations are now distributed among them. Note that original loop 4029 // represented by LoopScalarBody becomes remainder loop after vectorization. 4030 // 4031 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 4032 // end up getting slightly roughened result but that should be OK since 4033 // profile is not inherently precise anyway. Note also possible bypass of 4034 // vector code caused by legality checks is ignored, assigning all the weight 4035 // to the vector loop, optimistically. 4036 // 4037 // For scalable vectorization we can't know at compile time how many iterations 4038 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4039 // vscale of '1'. 4040 setProfileInfoAfterUnrolling( 4041 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4042 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4043 } 4044 4045 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4046 // In order to support recurrences we need to be able to vectorize Phi nodes. 4047 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4048 // stage #2: We now need to fix the recurrences by adding incoming edges to 4049 // the currently empty PHI nodes. 
At this point every instruction in the 4050 // original loop is widened to a vector form so we can use them to construct 4051 // the incoming edges. 4052 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4053 for (VPRecipeBase &R : Header->phis()) { 4054 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 4055 fixReduction(ReductionPhi, State); 4056 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 4057 fixFirstOrderRecurrence(FOR, State); 4058 } 4059 } 4060 4061 void InnerLoopVectorizer::fixFirstOrderRecurrence( 4062 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 4063 // This is the second phase of vectorizing first-order recurrences. An 4064 // overview of the transformation is described below. Suppose we have the 4065 // following loop. 4066 // 4067 // for (int i = 0; i < n; ++i) 4068 // b[i] = a[i] - a[i - 1]; 4069 // 4070 // There is a first-order recurrence on "a". For this loop, the shorthand 4071 // scalar IR looks like: 4072 // 4073 // scalar.ph: 4074 // s_init = a[-1] 4075 // br scalar.body 4076 // 4077 // scalar.body: 4078 // i = phi [0, scalar.ph], [i+1, scalar.body] 4079 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4080 // s2 = a[i] 4081 // b[i] = s2 - s1 4082 // br cond, scalar.body, ... 4083 // 4084 // In this example, s1 is a recurrence because it's value depends on the 4085 // previous iteration. In the first phase of vectorization, we created a 4086 // vector phi v1 for s1. We now complete the vectorization and produce the 4087 // shorthand vector IR shown below (for VF = 4, UF = 1). 4088 // 4089 // vector.ph: 4090 // v_init = vector(..., ..., ..., a[-1]) 4091 // br vector.body 4092 // 4093 // vector.body 4094 // i = phi [0, vector.ph], [i+4, vector.body] 4095 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4096 // v2 = a[i, i+1, i+2, i+3]; 4097 // v3 = vector(v1(3), v2(0, 1, 2)) 4098 // b[i, i+1, i+2, i+3] = v2 - v3 4099 // br cond, vector.body, middle.block 4100 // 4101 // middle.block: 4102 // x = v2(3) 4103 // br scalar.ph 4104 // 4105 // scalar.ph: 4106 // s_init = phi [x, middle.block], [a[-1], otherwise] 4107 // br scalar.body 4108 // 4109 // After execution completes the vector loop, we extract the next value of 4110 // the recurrence (x) to use as the initial value in the scalar loop. 4111 4112 // Extract the last vector element in the middle block. This will be the 4113 // initial value for the recurrence when jumping to the scalar loop. 4114 VPValue *PreviousDef = PhiR->getBackedgeValue(); 4115 Value *Incoming = State.get(PreviousDef, UF - 1); 4116 auto *ExtractForScalar = Incoming; 4117 auto *IdxTy = Builder.getInt32Ty(); 4118 if (VF.isVector()) { 4119 auto *One = ConstantInt::get(IdxTy, 1); 4120 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4121 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4122 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4123 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4124 "vector.recur.extract"); 4125 } 4126 // Extract the second last element in the middle block if the 4127 // Phi is used outside the loop. We need to extract the phi itself 4128 // and not the last element (the phi update in the current iteration). This 4129 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4130 // when the scalar loop is not run at all. 
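// For example (shorthand IR with VF = 4; a fixed width is assumed only for
// illustration), the middle block ends up with two extracts:
//
//   %vector.recur.extract         = extractelement <4 x i32> %v2, i32 3
//   %vector.recur.extract.for.phi = extractelement <4 x i32> %v2, i32 2
//
// The last lane becomes the recurrence's resume value in the scalar loop,
// while the second-to-last lane is the value of the phi itself for users
// outside the loop when the scalar loop is skipped.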
4131 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4132 if (VF.isVector()) {
4133 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4134 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4135 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4136 Incoming, Idx, "vector.recur.extract.for.phi");
4137 } else if (UF > 1)
4138 // When loop is unrolled without vectorizing, initialize
4139 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
4140 // of `Incoming`. This is analogous to the vectorized case above: extracting
4141 // the second last element when VF > 1.
4142 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4143
4144 // Fix the initial value of the original recurrence in the scalar loop.
4145 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4146 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4147 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4148 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4149 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4150 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4151 Start->addIncoming(Incoming, BB);
4152 }
4153
4154 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4155 Phi->setName("scalar.recur");
4156
4157 // Finally, fix users of the recurrence outside the loop. The users will need
4158 // either the last value of the scalar recurrence or the last value of the
4159 // vector recurrence we extracted in the middle block. Since the loop is in
4160 // LCSSA form, we just need to find all the phi nodes for the original scalar
4161 // recurrence in the exit block, and then add an edge for the middle block.
4162 // Note that LCSSA does not imply single entry when the original scalar loop
4163 // had multiple exiting edges (as we always run the last iteration in the
4164 // scalar epilogue); in that case, there is no edge from middle to exit and
4165 // thus no phis which need to be updated.
4166 if (!Cost->requiresScalarEpilogue(VF))
4167 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4168 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4169 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4170 }
4171
4172 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4173 VPTransformState &State) {
4174 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4175 // Get its reduction variable descriptor.
4176 assert(Legal->isReductionVariable(OrigPhi) &&
4177 "Unable to find the reduction variable");
4178 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4179
4180 RecurKind RK = RdxDesc.getRecurrenceKind();
4181 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4182 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4183 setDebugLocFromInst(ReductionStartValue);
4184
4185 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4186 // This is the vector-clone of the value that leaves the loop.
4187 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4188
4189 // Wrap flags are in general invalid after vectorization, clear them.
4190 clearReductionWrapFlags(RdxDesc, State);
4191
4192 // Before each round, move the insertion point right between
4193 // the PHIs and the values we are going to write.
4194 // This allows us to write both PHINodes and the extractelement
4195 // instructions.
4196 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4197 4198 setDebugLocFromInst(LoopExitInst); 4199 4200 Type *PhiTy = OrigPhi->getType(); 4201 // If tail is folded by masking, the vector value to leave the loop should be 4202 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4203 // instead of the former. For an inloop reduction the reduction will already 4204 // be predicated, and does not need to be handled here. 4205 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 4206 for (unsigned Part = 0; Part < UF; ++Part) { 4207 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4208 Value *Sel = nullptr; 4209 for (User *U : VecLoopExitInst->users()) { 4210 if (isa<SelectInst>(U)) { 4211 assert(!Sel && "Reduction exit feeding two selects"); 4212 Sel = U; 4213 } else 4214 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4215 } 4216 assert(Sel && "Reduction exit feeds no select"); 4217 State.reset(LoopExitInstDef, Sel, Part); 4218 4219 // If the target can create a predicated operator for the reduction at no 4220 // extra cost in the loop (for example a predicated vadd), it can be 4221 // cheaper for the select to remain in the loop than be sunk out of it, 4222 // and so use the select value for the phi instead of the old 4223 // LoopExitValue. 4224 if (PreferPredicatedReductionSelect || 4225 TTI->preferPredicatedReductionSelect( 4226 RdxDesc.getOpcode(), PhiTy, 4227 TargetTransformInfo::ReductionFlags())) { 4228 auto *VecRdxPhi = 4229 cast<PHINode>(State.get(PhiR, Part)); 4230 VecRdxPhi->setIncomingValueForBlock( 4231 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4232 } 4233 } 4234 } 4235 4236 // If the vector reduction can be performed in a smaller type, we truncate 4237 // then extend the loop exit value to enable InstCombine to evaluate the 4238 // entire expression in the smaller type. 4239 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4240 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4241 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4242 Builder.SetInsertPoint( 4243 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4244 VectorParts RdxParts(UF); 4245 for (unsigned Part = 0; Part < UF; ++Part) { 4246 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4247 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4248 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4249 : Builder.CreateZExt(Trunc, VecTy); 4250 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4251 if (U != Trunc) { 4252 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4253 RdxParts[Part] = Extnd; 4254 } 4255 } 4256 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4257 for (unsigned Part = 0; Part < UF; ++Part) { 4258 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4259 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4260 } 4261 } 4262 4263 // Reduce all of the unrolled parts into a single vector. 4264 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4265 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4266 4267 // The middle block terminator has already been assigned a DebugLoc here (the 4268 // OrigLoop's single latch terminator). 
We want the whole middle block to 4269 // appear to execute on this line because: (a) it is all compiler generated, 4270 // (b) these instructions are always executed after evaluating the latch 4271 // conditional branch, and (c) other passes may add new predecessors which 4272 // terminate on this line. This is the easiest way to ensure we don't 4273 // accidentally cause an extra step back into the loop while debugging. 4274 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4275 if (PhiR->isOrdered()) 4276 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4277 else { 4278 // Floating-point operations should have some FMF to enable the reduction. 4279 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4280 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4281 for (unsigned Part = 1; Part < UF; ++Part) { 4282 Value *RdxPart = State.get(LoopExitInstDef, Part); 4283 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4284 ReducedPartRdx = Builder.CreateBinOp( 4285 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4286 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4287 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4288 ReducedPartRdx, RdxPart); 4289 else 4290 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4291 } 4292 } 4293 4294 // Create the reduction after the loop. Note that inloop reductions create the 4295 // target reduction in the loop using a Reduction recipe. 4296 if (VF.isVector() && !PhiR->isInLoop()) { 4297 ReducedPartRdx = 4298 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4299 // If the reduction can be performed in a smaller type, we need to extend 4300 // the reduction to the wider type before we branch to the original loop. 4301 if (PhiTy != RdxDesc.getRecurrenceType()) 4302 ReducedPartRdx = RdxDesc.isSigned() 4303 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4304 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4305 } 4306 4307 PHINode *ResumePhi = 4308 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4309 4310 // Create a phi node that merges control-flow from the backedge-taken check 4311 // block and the middle block. 4312 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4313 LoopScalarPreHeader->getTerminator()); 4314 4315 // If we are fixing reductions in the epilogue loop then we should already 4316 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4317 // we carry over the incoming values correctly. 4318 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4319 if (Incoming == LoopMiddleBlock) 4320 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4321 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4322 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4323 Incoming); 4324 else 4325 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4326 } 4327 4328 // Set the resume value for this reduction 4329 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4330 4331 // Now, we need to fix the users of the reduction variable 4332 // inside and outside of the scalar remainder loop. 4333 4334 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4335 // in the exit blocks. See comment on analogous loop in 4336 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
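// For example (shorthand IR; names are illustrative), an LCSSA phi for the
// reduction in the exit block
//
//   exit:
//     %sum.lcssa = phi i32 [ %sum.next, %for.body ]
//
// gains a second incoming value from the middle block:
//
//   %sum.lcssa = phi i32 [ %sum.next, %for.body ], [ %rdx, %middle.block ]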
4337 if (!Cost->requiresScalarEpilogue(VF)) 4338 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4339 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) 4340 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4341 4342 // Fix the scalar loop reduction variable with the incoming reduction sum 4343 // from the vector body and from the backedge value. 4344 int IncomingEdgeBlockIdx = 4345 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4346 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4347 // Pick the other block. 4348 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4349 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4350 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4351 } 4352 4353 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc, 4354 VPTransformState &State) { 4355 RecurKind RK = RdxDesc.getRecurrenceKind(); 4356 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4357 return; 4358 4359 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4360 assert(LoopExitInstr && "null loop exit instruction"); 4361 SmallVector<Instruction *, 8> Worklist; 4362 SmallPtrSet<Instruction *, 8> Visited; 4363 Worklist.push_back(LoopExitInstr); 4364 Visited.insert(LoopExitInstr); 4365 4366 while (!Worklist.empty()) { 4367 Instruction *Cur = Worklist.pop_back_val(); 4368 if (isa<OverflowingBinaryOperator>(Cur)) 4369 for (unsigned Part = 0; Part < UF; ++Part) { 4370 // FIXME: Should not rely on getVPValue at this point. 4371 Value *V = State.get(State.Plan->getVPValue(Cur, true), Part); 4372 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4373 } 4374 4375 for (User *U : Cur->users()) { 4376 Instruction *UI = cast<Instruction>(U); 4377 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4378 Visited.insert(UI).second) 4379 Worklist.push_back(UI); 4380 } 4381 } 4382 } 4383 4384 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4385 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4386 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4387 // Some phis were already hand updated by the reduction and recurrence 4388 // code above, leave them alone. 4389 continue; 4390 4391 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4392 // Non-instruction incoming values will have only one value. 4393 4394 VPLane Lane = VPLane::getFirstLane(); 4395 if (isa<Instruction>(IncomingValue) && 4396 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4397 VF)) 4398 Lane = VPLane::getLastLaneForVF(VF); 4399 4400 // Can be a loop invariant incoming value or the last scalar value to be 4401 // extracted from the vectorized loop. 4402 // FIXME: Should not rely on getVPValue at this point. 4403 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4404 Value *lastIncomingValue = 4405 OrigLoop->isLoopInvariant(IncomingValue) 4406 ? IncomingValue 4407 : State.get(State.Plan->getVPValue(IncomingValue, true), 4408 VPIteration(UF - 1, Lane)); 4409 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4410 } 4411 } 4412 4413 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4414 // The basic block and loop containing the predicated instruction. 4415 auto *PredBB = PredInst->getParent(); 4416 auto *VectorLoop = LI->getLoopFor(PredBB); 4417 4418 // Initialize a worklist with the operands of the predicated instruction. 
4419 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4420 4421 // Holds instructions that we need to analyze again. An instruction may be 4422 // reanalyzed if we don't yet know if we can sink it or not. 4423 SmallVector<Instruction *, 8> InstsToReanalyze; 4424 4425 // Returns true if a given use occurs in the predicated block. Phi nodes use 4426 // their operands in their corresponding predecessor blocks. 4427 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4428 auto *I = cast<Instruction>(U.getUser()); 4429 BasicBlock *BB = I->getParent(); 4430 if (auto *Phi = dyn_cast<PHINode>(I)) 4431 BB = Phi->getIncomingBlock( 4432 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4433 return BB == PredBB; 4434 }; 4435 4436 // Iteratively sink the scalarized operands of the predicated instruction 4437 // into the block we created for it. When an instruction is sunk, it's 4438 // operands are then added to the worklist. The algorithm ends after one pass 4439 // through the worklist doesn't sink a single instruction. 4440 bool Changed; 4441 do { 4442 // Add the instructions that need to be reanalyzed to the worklist, and 4443 // reset the changed indicator. 4444 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4445 InstsToReanalyze.clear(); 4446 Changed = false; 4447 4448 while (!Worklist.empty()) { 4449 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4450 4451 // We can't sink an instruction if it is a phi node, is not in the loop, 4452 // or may have side effects. 4453 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4454 I->mayHaveSideEffects()) 4455 continue; 4456 4457 // If the instruction is already in PredBB, check if we can sink its 4458 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4459 // sinking the scalar instruction I, hence it appears in PredBB; but it 4460 // may have failed to sink I's operands (recursively), which we try 4461 // (again) here. 4462 if (I->getParent() == PredBB) { 4463 Worklist.insert(I->op_begin(), I->op_end()); 4464 continue; 4465 } 4466 4467 // It's legal to sink the instruction if all its uses occur in the 4468 // predicated block. Otherwise, there's nothing to do yet, and we may 4469 // need to reanalyze the instruction. 4470 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4471 InstsToReanalyze.push_back(I); 4472 continue; 4473 } 4474 4475 // Move the instruction to the beginning of the predicated block, and add 4476 // it's operands to the worklist. 4477 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4478 Worklist.insert(I->op_begin(), I->op_end()); 4479 4480 // The sinking may have enabled other instructions to be sunk, so we will 4481 // need to iterate. 4482 Changed = true; 4483 } 4484 } while (Changed); 4485 } 4486 4487 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4488 for (PHINode *OrigPhi : OrigPHIsToFix) { 4489 VPWidenPHIRecipe *VPPhi = 4490 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4491 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4492 // Make sure the builder has a valid insert point. 
4493 Builder.SetInsertPoint(NewPhi); 4494 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4495 VPValue *Inc = VPPhi->getIncomingValue(i); 4496 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4497 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4498 } 4499 } 4500 } 4501 4502 bool InnerLoopVectorizer::useOrderedReductions( 4503 const RecurrenceDescriptor &RdxDesc) { 4504 return Cost->useOrderedReductions(RdxDesc); 4505 } 4506 4507 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4508 VPWidenPHIRecipe *PhiR, 4509 VPTransformState &State) { 4510 PHINode *P = cast<PHINode>(PN); 4511 if (EnableVPlanNativePath) { 4512 // Currently we enter here in the VPlan-native path for non-induction 4513 // PHIs where all control flow is uniform. We simply widen these PHIs. 4514 // Create a vector phi with no operands - the vector phi operands will be 4515 // set at the end of vector code generation. 4516 Type *VecTy = (State.VF.isScalar()) 4517 ? PN->getType() 4518 : VectorType::get(PN->getType(), State.VF); 4519 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4520 State.set(PhiR, VecPhi, 0); 4521 OrigPHIsToFix.push_back(P); 4522 4523 return; 4524 } 4525 4526 assert(PN->getParent() == OrigLoop->getHeader() && 4527 "Non-header phis should have been handled elsewhere"); 4528 4529 // In order to support recurrences we need to be able to vectorize Phi nodes. 4530 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4531 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4532 // this value when we vectorize all of the instructions that use the PHI. 4533 4534 assert(!Legal->isReductionVariable(P) && 4535 "reductions should be handled elsewhere"); 4536 4537 setDebugLocFromInst(P); 4538 4539 // This PHINode must be an induction variable. 4540 // Make sure that we know about it. 4541 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4542 4543 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4544 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4545 4546 auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV(); 4547 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 4548 4549 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4550 // which can be found from the original scalar operations. 4551 switch (II.getKind()) { 4552 case InductionDescriptor::IK_NoInduction: 4553 llvm_unreachable("Unknown induction"); 4554 case InductionDescriptor::IK_IntInduction: 4555 case InductionDescriptor::IK_FpInduction: 4556 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4557 case InductionDescriptor::IK_PtrInduction: { 4558 // Handle the pointer induction variable case. 4559 assert(P->getType()->isPointerTy() && "Unexpected type."); 4560 4561 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4562 // This is the normalized GEP that starts counting at zero. 4563 Value *PtrInd = 4564 Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType()); 4565 // Determine the number of scalars we need to generate for each unroll 4566 // iteration. If the instruction is uniform, we only need to generate the 4567 // first lane. Otherwise, we generate all VF values. 4568 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4569 assert((IsUniform || !State.VF.isScalable()) && 4570 "Cannot scalarize a scalable VF"); 4571 unsigned Lanes = IsUniform ? 
1 : State.VF.getFixedValue(); 4572 4573 for (unsigned Part = 0; Part < UF; ++Part) { 4574 Value *PartStart = 4575 createStepForVF(Builder, PtrInd->getType(), VF, Part); 4576 4577 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4578 Value *Idx = Builder.CreateAdd( 4579 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4580 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4581 Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), 4582 DL, II, State.CFG.PrevBB); 4583 SclrGep->setName("next.gep"); 4584 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4585 } 4586 } 4587 return; 4588 } 4589 assert(isa<SCEVConstant>(II.getStep()) && 4590 "Induction step not a SCEV constant!"); 4591 Type *PhiType = II.getStep()->getType(); 4592 4593 // Build a pointer phi 4594 Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue(); 4595 Type *ScStValueType = ScalarStartValue->getType(); 4596 PHINode *NewPointerPhi = 4597 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 4598 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4599 4600 // A pointer induction, performed by using a gep 4601 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4602 Instruction *InductionLoc = LoopLatch->getTerminator(); 4603 const SCEV *ScalarStep = II.getStep(); 4604 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4605 Value *ScalarStepValue = 4606 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4607 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4608 Value *NumUnrolledElems = 4609 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4610 Value *InductionGEP = GetElementPtrInst::Create( 4611 II.getElementType(), NewPointerPhi, 4612 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4613 InductionLoc); 4614 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4615 4616 // Create UF many actual address geps that use the pointer 4617 // phi as base and a vectorized version of the step value 4618 // (<step*0, ..., step*N>) as offset. 4619 for (unsigned Part = 0; Part < State.UF; ++Part) { 4620 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4621 Value *StartOffsetScalar = 4622 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4623 Value *StartOffset = 4624 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4625 // Create a vector of consecutive numbers from zero to VF. 4626 StartOffset = 4627 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4628 4629 Value *GEP = Builder.CreateGEP( 4630 II.getElementType(), NewPointerPhi, 4631 Builder.CreateMul( 4632 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4633 "vector.gep")); 4634 State.set(PhiR, GEP, Part); 4635 } 4636 } 4637 } 4638 } 4639 4640 /// A helper function for checking whether an integer division-related 4641 /// instruction may divide by zero (in which case it must be predicated if 4642 /// executed conditionally in the scalar code). 4643 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4644 /// Non-zero divisors that are non compile-time constants will not be 4645 /// converted into multiplication, so we will still end up scalarizing 4646 /// the division, but can do so w/o predication. 
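/// For example (divisor values chosen only for illustration):
///
///   %q = udiv i32 %a, 7    ; constant non-zero divisor, no predication needed
///   %r = udiv i32 %a, %n   ; %n may be zero, so the division stays predicated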
4647 static bool mayDivideByZero(Instruction &I) { 4648 assert((I.getOpcode() == Instruction::UDiv || 4649 I.getOpcode() == Instruction::SDiv || 4650 I.getOpcode() == Instruction::URem || 4651 I.getOpcode() == Instruction::SRem) && 4652 "Unexpected instruction"); 4653 Value *Divisor = I.getOperand(1); 4654 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4655 return !CInt || CInt->isZero(); 4656 } 4657 4658 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4659 VPUser &ArgOperands, 4660 VPTransformState &State) { 4661 assert(!isa<DbgInfoIntrinsic>(I) && 4662 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4663 setDebugLocFromInst(&I); 4664 4665 Module *M = I.getParent()->getParent()->getParent(); 4666 auto *CI = cast<CallInst>(&I); 4667 4668 SmallVector<Type *, 4> Tys; 4669 for (Value *ArgOperand : CI->args()) 4670 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4671 4672 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4673 4674 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4675 // version of the instruction. 4676 // Is it beneficial to perform intrinsic call compared to lib call? 4677 bool NeedToScalarize = false; 4678 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4679 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4680 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4681 assert((UseVectorIntrinsic || !NeedToScalarize) && 4682 "Instruction should be scalarized elsewhere."); 4683 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4684 "Either the intrinsic cost or vector call cost must be valid"); 4685 4686 for (unsigned Part = 0; Part < UF; ++Part) { 4687 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4688 SmallVector<Value *, 4> Args; 4689 for (auto &I : enumerate(ArgOperands.operands())) { 4690 // Some intrinsics have a scalar argument - don't replace it with a 4691 // vector. 4692 Value *Arg; 4693 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4694 Arg = State.get(I.value(), Part); 4695 else { 4696 Arg = State.get(I.value(), VPIteration(0, 0)); 4697 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index())) 4698 TysForDecl.push_back(Arg->getType()); 4699 } 4700 Args.push_back(Arg); 4701 } 4702 4703 Function *VectorF; 4704 if (UseVectorIntrinsic) { 4705 // Use vector version of the intrinsic. 4706 if (VF.isVector()) 4707 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4708 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4709 assert(VectorF && "Can't retrieve vector intrinsic."); 4710 } else { 4711 // Use vector version of the function call. 4712 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4713 #ifndef NDEBUG 4714 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4715 "Can't create vector function."); 4716 #endif 4717 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4718 } 4719 SmallVector<OperandBundleDef, 1> OpBundles; 4720 CI->getOperandBundlesAsDefs(OpBundles); 4721 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4722 4723 if (isa<FPMathOperator>(V)) 4724 V->copyFastMathFlags(CI); 4725 4726 State.set(Def, V, Part); 4727 addMetadata(V, &I); 4728 } 4729 } 4730 4731 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4732 // We should not collect Scalars more than once per VF. 
Right now, this
4733 // function is called from collectUniformsAndScalars(), which already does
4734 // this check. Collecting Scalars for VF=1 does not make any sense.
4735 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4736 "This function should not be visited twice for the same VF");
4737
4738 SmallSetVector<Instruction *, 8> Worklist;
4739
4740 // These sets are used to seed the analysis with pointers used by memory
4741 // accesses that will remain scalar.
4742 SmallSetVector<Instruction *, 8> ScalarPtrs;
4743 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4744 auto *Latch = TheLoop->getLoopLatch();
4745
4746 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4747 // The pointer operands of loads and stores will be scalar as long as the
4748 // memory access is not a gather or scatter operation. The value operand of a
4749 // store will remain scalar if the store is scalarized.
4750 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4751 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4752 assert(WideningDecision != CM_Unknown &&
4753 "Widening decision should be ready at this moment");
4754 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4755 if (Ptr == Store->getValueOperand())
4756 return WideningDecision == CM_Scalarize;
4757 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4758 "Ptr is neither a value or pointer operand");
4759 return WideningDecision != CM_GatherScatter;
4760 };
4761
4762 // A helper that returns true if the given value is a bitcast or
4763 // getelementptr instruction contained in the loop.
4764 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4765 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4766 isa<GetElementPtrInst>(V)) &&
4767 !TheLoop->isLoopInvariant(V);
4768 };
4769
4770 // A helper that evaluates a memory access's use of a pointer. If the use will
4771 // be a scalar use and the pointer is only used by memory accesses, we place
4772 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4773 // PossibleNonScalarPtrs.
4774 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4775 // We only care about bitcast and getelementptr instructions contained in
4776 // the loop.
4777 if (!isLoopVaryingBitCastOrGEP(Ptr))
4778 return;
4779
4780 // If the pointer has already been identified as scalar (e.g., if it was
4781 // also identified as uniform), there's nothing to do.
4782 auto *I = cast<Instruction>(Ptr);
4783 if (Worklist.count(I))
4784 return;
4785
4786 // If the use of the pointer will be a scalar use, and all users of the
4787 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4788 // place the pointer in PossibleNonScalarPtrs.
4789 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4790 return isa<LoadInst>(U) || isa<StoreInst>(U);
4791 }))
4792 ScalarPtrs.insert(I);
4793 else
4794 PossibleNonScalarPtrs.insert(I);
4795 };
4796
4797 // We seed the scalars analysis with two classes of instructions: (1)
4798 // instructions marked uniform-after-vectorization and (2) bitcast,
4799 // getelementptr and (pointer) phi instructions used by memory accesses
4800 // requiring a scalar use.
4801 //
4802 // (1) Add to the worklist all instructions that have been identified as
4803 // uniform-after-vectorization.
4804 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4805 4806 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4807 // memory accesses requiring a scalar use. The pointer operands of loads and 4808 // stores will be scalar as long as the memory accesses is not a gather or 4809 // scatter operation. The value operand of a store will remain scalar if the 4810 // store is scalarized. 4811 for (auto *BB : TheLoop->blocks()) 4812 for (auto &I : *BB) { 4813 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4814 evaluatePtrUse(Load, Load->getPointerOperand()); 4815 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4816 evaluatePtrUse(Store, Store->getPointerOperand()); 4817 evaluatePtrUse(Store, Store->getValueOperand()); 4818 } 4819 } 4820 for (auto *I : ScalarPtrs) 4821 if (!PossibleNonScalarPtrs.count(I)) { 4822 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4823 Worklist.insert(I); 4824 } 4825 4826 // Insert the forced scalars. 4827 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4828 // induction variable when the PHI user is scalarized. 4829 auto ForcedScalar = ForcedScalars.find(VF); 4830 if (ForcedScalar != ForcedScalars.end()) 4831 for (auto *I : ForcedScalar->second) 4832 Worklist.insert(I); 4833 4834 // Expand the worklist by looking through any bitcasts and getelementptr 4835 // instructions we've already identified as scalar. This is similar to the 4836 // expansion step in collectLoopUniforms(); however, here we're only 4837 // expanding to include additional bitcasts and getelementptr instructions. 4838 unsigned Idx = 0; 4839 while (Idx != Worklist.size()) { 4840 Instruction *Dst = Worklist[Idx++]; 4841 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4842 continue; 4843 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4844 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4845 auto *J = cast<Instruction>(U); 4846 return !TheLoop->contains(J) || Worklist.count(J) || 4847 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4848 isScalarUse(J, Src)); 4849 })) { 4850 Worklist.insert(Src); 4851 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4852 } 4853 } 4854 4855 // An induction variable will remain scalar if all users of the induction 4856 // variable and induction variable update remain scalar. 4857 for (auto &Induction : Legal->getInductionVars()) { 4858 auto *Ind = Induction.first; 4859 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4860 4861 // If tail-folding is applied, the primary induction variable will be used 4862 // to feed a vector compare. 4863 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4864 continue; 4865 4866 // Returns true if \p Indvar is a pointer induction that is used directly by 4867 // load/store instruction \p I. 4868 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4869 Instruction *I) { 4870 return Induction.second.getKind() == 4871 InductionDescriptor::IK_PtrInduction && 4872 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4873 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4874 }; 4875 4876 // Determine if all users of the induction variable are scalar after 4877 // vectorization. 
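// For example (C-like shorthand used only to illustrate the check), in
//
//   for (i = 0; i < n; ++i)
//     a[i] += 1;
//
// the in-loop users of i are the address of a[i], the latch compare and the
// update i++. If the first two are already in the worklist, i and i++ form a
// closed cycle and both remain scalar.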
4878 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4879 auto *I = cast<Instruction>(U); 4880 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4881 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4882 }); 4883 if (!ScalarInd) 4884 continue; 4885 4886 // Determine if all users of the induction variable update instruction are 4887 // scalar after vectorization. 4888 auto ScalarIndUpdate = 4889 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4890 auto *I = cast<Instruction>(U); 4891 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4892 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4893 }); 4894 if (!ScalarIndUpdate) 4895 continue; 4896 4897 // The induction variable and its update instruction will remain scalar. 4898 Worklist.insert(Ind); 4899 Worklist.insert(IndUpdate); 4900 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4901 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4902 << "\n"); 4903 } 4904 4905 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4906 } 4907 4908 bool LoopVectorizationCostModel::isScalarWithPredication( 4909 Instruction *I, ElementCount VF) const { 4910 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4911 return false; 4912 switch(I->getOpcode()) { 4913 default: 4914 break; 4915 case Instruction::Load: 4916 case Instruction::Store: { 4917 if (!Legal->isMaskRequired(I)) 4918 return false; 4919 auto *Ptr = getLoadStorePointerOperand(I); 4920 auto *Ty = getLoadStoreType(I); 4921 Type *VTy = Ty; 4922 if (VF.isVector()) 4923 VTy = VectorType::get(Ty, VF); 4924 const Align Alignment = getLoadStoreAlignment(I); 4925 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4926 TTI.isLegalMaskedGather(VTy, Alignment)) 4927 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4928 TTI.isLegalMaskedScatter(VTy, Alignment)); 4929 } 4930 case Instruction::UDiv: 4931 case Instruction::SDiv: 4932 case Instruction::SRem: 4933 case Instruction::URem: 4934 return mayDivideByZero(*I); 4935 } 4936 return false; 4937 } 4938 4939 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4940 Instruction *I, ElementCount VF) { 4941 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4942 assert(getWideningDecision(I, VF) == CM_Unknown && 4943 "Decision should not be set yet."); 4944 auto *Group = getInterleavedAccessGroup(I); 4945 assert(Group && "Must have a group."); 4946 4947 // If the instruction's allocated size doesn't equal it's type size, it 4948 // requires padding and will be scalarized. 4949 auto &DL = I->getModule()->getDataLayout(); 4950 auto *ScalarTy = getLoadStoreType(I); 4951 if (hasIrregularType(ScalarTy, DL)) 4952 return false; 4953 4954 // Check if masking is required. 4955 // A Group may need masking for one of two reasons: it resides in a block that 4956 // needs predication, or it was decided to use masking to deal with gaps 4957 // (either a gap at the end of a load-access that may result in a speculative 4958 // load, or any gaps in a store-access). 
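// For example (the member layout is chosen only for illustration), a store
// group with factor 3 that has members only at indices 0 and 1,
//
//   A[3*i]     = x;   // member 0
//   A[3*i + 1] = y;   // member 1
//                     // gap at member 2
//
// must use a masked interleaved store so that the lanes of the missing
// member are not written.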
4959 bool PredicatedAccessRequiresMasking = 4960 blockNeedsPredicationForAnyReason(I->getParent()) && 4961 Legal->isMaskRequired(I); 4962 bool LoadAccessWithGapsRequiresEpilogMasking = 4963 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4964 !isScalarEpilogueAllowed(); 4965 bool StoreAccessWithGapsRequiresMasking = 4966 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4967 if (!PredicatedAccessRequiresMasking && 4968 !LoadAccessWithGapsRequiresEpilogMasking && 4969 !StoreAccessWithGapsRequiresMasking) 4970 return true; 4971 4972 // If masked interleaving is required, we expect that the user/target had 4973 // enabled it, because otherwise it either wouldn't have been created or 4974 // it should have been invalidated by the CostModel. 4975 assert(useMaskedInterleavedAccesses(TTI) && 4976 "Masked interleave-groups for predicated accesses are not enabled."); 4977 4978 if (Group->isReverse()) 4979 return false; 4980 4981 auto *Ty = getLoadStoreType(I); 4982 const Align Alignment = getLoadStoreAlignment(I); 4983 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4984 : TTI.isLegalMaskedStore(Ty, Alignment); 4985 } 4986 4987 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4988 Instruction *I, ElementCount VF) { 4989 // Get and ensure we have a valid memory instruction. 4990 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4991 4992 auto *Ptr = getLoadStorePointerOperand(I); 4993 auto *ScalarTy = getLoadStoreType(I); 4994 4995 // In order to be widened, the pointer should be consecutive, first of all. 4996 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4997 return false; 4998 4999 // If the instruction is a store located in a predicated block, it will be 5000 // scalarized. 5001 if (isScalarWithPredication(I, VF)) 5002 return false; 5003 5004 // If the instruction's allocated size doesn't equal it's type size, it 5005 // requires padding and will be scalarized. 5006 auto &DL = I->getModule()->getDataLayout(); 5007 if (hasIrregularType(ScalarTy, DL)) 5008 return false; 5009 5010 return true; 5011 } 5012 5013 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5014 // We should not collect Uniforms more than once per VF. Right now, 5015 // this function is called from collectUniformsAndScalars(), which 5016 // already does this check. Collecting Uniforms for VF=1 does not make any 5017 // sense. 5018 5019 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5020 "This function should not be visited twice for the same VF"); 5021 5022 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5023 // not analyze again. Uniforms.count(VF) will return 1. 5024 Uniforms[VF].clear(); 5025 5026 // We now know that the loop is vectorizable! 5027 // Collect instructions inside the loop that will remain uniform after 5028 // vectorization. 5029 5030 // Global values, params and instructions outside of current loop are out of 5031 // scope. 5032 auto isOutOfScope = [&](Value *V) -> bool { 5033 Instruction *I = dyn_cast<Instruction>(V); 5034 return (!I || !TheLoop->contains(I)); 5035 }; 5036 5037 // Worklist containing uniform instructions demanding lane 0. 5038 SetVector<Instruction *> Worklist; 5039 BasicBlock *Latch = TheLoop->getLoopLatch(); 5040 5041 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 5042 // that are scalar with predication must not be considered uniform after 5043 // vectorization, because that would create an erroneous replicating region 5044 // where only a single instance out of VF should be formed. 5045 // TODO: optimize such seldom cases if found important, see PR40816. 5046 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5047 if (isOutOfScope(I)) { 5048 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5049 << *I << "\n"); 5050 return; 5051 } 5052 if (isScalarWithPredication(I, VF)) { 5053 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5054 << *I << "\n"); 5055 return; 5056 } 5057 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5058 Worklist.insert(I); 5059 }; 5060 5061 // Start with the conditional branch. If the branch condition is an 5062 // instruction contained in the loop that is only used by the branch, it is 5063 // uniform. 5064 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5065 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5066 addToWorklistIfAllowed(Cmp); 5067 5068 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5069 InstWidening WideningDecision = getWideningDecision(I, VF); 5070 assert(WideningDecision != CM_Unknown && 5071 "Widening decision should be ready at this moment"); 5072 5073 // A uniform memory op is itself uniform. We exclude uniform stores 5074 // here as they demand the last lane, not the first one. 5075 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5076 assert(WideningDecision == CM_Scalarize); 5077 return true; 5078 } 5079 5080 return (WideningDecision == CM_Widen || 5081 WideningDecision == CM_Widen_Reverse || 5082 WideningDecision == CM_Interleave); 5083 }; 5084 5085 5086 // Returns true if Ptr is the pointer operand of a memory access instruction 5087 // I, and I is known to not require scalarization. 5088 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5089 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5090 }; 5091 5092 // Holds a list of values which are known to have at least one uniform use. 5093 // Note that there may be other uses which aren't uniform. A "uniform use" 5094 // here is something which only demands lane 0 of the unrolled iterations; 5095 // it does not imply that all lanes produce the same value (e.g. this is not 5096 // the usual meaning of uniform) 5097 SetVector<Value *> HasUniformUse; 5098 5099 // Scan the loop for instructions which are either a) known to have only 5100 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5101 for (auto *BB : TheLoop->blocks()) 5102 for (auto &I : *BB) { 5103 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 5104 switch (II->getIntrinsicID()) { 5105 case Intrinsic::sideeffect: 5106 case Intrinsic::experimental_noalias_scope_decl: 5107 case Intrinsic::assume: 5108 case Intrinsic::lifetime_start: 5109 case Intrinsic::lifetime_end: 5110 if (TheLoop->hasLoopInvariantOperands(&I)) 5111 addToWorklistIfAllowed(&I); 5112 break; 5113 default: 5114 break; 5115 } 5116 } 5117 5118 // ExtractValue instructions must be uniform, because the operands are 5119 // known to be loop-invariant. 
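// For example (illustrative), with %agg defined before the loop:
//
//   %lo = extractvalue { i32, i32 } %agg, 0
//
// every iteration (and every lane) reads the same field, so the extractvalue
// itself can be treated as uniform.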
5120 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 5121 assert(isOutOfScope(EVI->getAggregateOperand()) && 5122 "Expected aggregate value to be loop invariant"); 5123 addToWorklistIfAllowed(EVI); 5124 continue; 5125 } 5126 5127 // If there's no pointer operand, there's nothing to do. 5128 auto *Ptr = getLoadStorePointerOperand(&I); 5129 if (!Ptr) 5130 continue; 5131 5132 // A uniform memory op is itself uniform. We exclude uniform stores 5133 // here as they demand the last lane, not the first one. 5134 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5135 addToWorklistIfAllowed(&I); 5136 5137 if (isUniformDecision(&I, VF)) { 5138 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5139 HasUniformUse.insert(Ptr); 5140 } 5141 } 5142 5143 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5144 // demanding) users. Since loops are assumed to be in LCSSA form, this 5145 // disallows uses outside the loop as well. 5146 for (auto *V : HasUniformUse) { 5147 if (isOutOfScope(V)) 5148 continue; 5149 auto *I = cast<Instruction>(V); 5150 auto UsersAreMemAccesses = 5151 llvm::all_of(I->users(), [&](User *U) -> bool { 5152 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5153 }); 5154 if (UsersAreMemAccesses) 5155 addToWorklistIfAllowed(I); 5156 } 5157 5158 // Expand Worklist in topological order: whenever a new instruction 5159 // is added , its users should be already inside Worklist. It ensures 5160 // a uniform instruction will only be used by uniform instructions. 5161 unsigned idx = 0; 5162 while (idx != Worklist.size()) { 5163 Instruction *I = Worklist[idx++]; 5164 5165 for (auto OV : I->operand_values()) { 5166 // isOutOfScope operands cannot be uniform instructions. 5167 if (isOutOfScope(OV)) 5168 continue; 5169 // First order recurrence Phi's should typically be considered 5170 // non-uniform. 5171 auto *OP = dyn_cast<PHINode>(OV); 5172 if (OP && Legal->isFirstOrderRecurrence(OP)) 5173 continue; 5174 // If all the users of the operand are uniform, then add the 5175 // operand into the uniform worklist. 5176 auto *OI = cast<Instruction>(OV); 5177 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5178 auto *J = cast<Instruction>(U); 5179 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5180 })) 5181 addToWorklistIfAllowed(OI); 5182 } 5183 } 5184 5185 // For an instruction to be added into Worklist above, all its users inside 5186 // the loop should also be in Worklist. However, this condition cannot be 5187 // true for phi nodes that form a cyclic dependence. We must process phi 5188 // nodes separately. An induction variable will remain uniform if all users 5189 // of the induction variable and induction variable update remain uniform. 5190 // The code below handles both pointer and non-pointer induction variables. 5191 for (auto &Induction : Legal->getInductionVars()) { 5192 auto *Ind = Induction.first; 5193 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5194 5195 // Determine if all users of the induction variable are uniform after 5196 // vectorization. 5197 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5198 auto *I = cast<Instruction>(U); 5199 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5200 isVectorizedMemAccessUse(I, Ind); 5201 }); 5202 if (!UniformInd) 5203 continue; 5204 5205 // Determine if all users of the induction variable update instruction are 5206 // uniform after vectorization. 
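// For example (C-like shorthand used only to illustrate the check), in
//
//   for (i = 0; i < n; ++i)
//     a[i] = x;
//
// i feeds only the consecutive store address, the latch compare and i++,
// and i++ feeds only i, so both sides of the cycle stay uniform: a single
// scalar copy of the induction suffices to drive the vector loop.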
5207 auto UniformIndUpdate = 5208 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5209 auto *I = cast<Instruction>(U); 5210 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5211 isVectorizedMemAccessUse(I, IndUpdate); 5212 }); 5213 if (!UniformIndUpdate) 5214 continue; 5215 5216 // The induction variable and its update instruction will remain uniform. 5217 addToWorklistIfAllowed(Ind); 5218 addToWorklistIfAllowed(IndUpdate); 5219 } 5220 5221 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5222 } 5223 5224 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5225 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5226 5227 if (Legal->getRuntimePointerChecking()->Need) { 5228 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5229 "runtime pointer checks needed. Enable vectorization of this " 5230 "loop with '#pragma clang loop vectorize(enable)' when " 5231 "compiling with -Os/-Oz", 5232 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5233 return true; 5234 } 5235 5236 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5237 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5238 "runtime SCEV checks needed. Enable vectorization of this " 5239 "loop with '#pragma clang loop vectorize(enable)' when " 5240 "compiling with -Os/-Oz", 5241 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5242 return true; 5243 } 5244 5245 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5246 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5247 reportVectorizationFailure("Runtime stride check for small trip count", 5248 "runtime stride == 1 checks needed. Enable vectorization of " 5249 "this loop without such check by compiling with -Os/-Oz", 5250 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5251 return true; 5252 } 5253 5254 return false; 5255 } 5256 5257 ElementCount 5258 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5259 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 5260 return ElementCount::getScalable(0); 5261 5262 if (Hints->isScalableVectorizationDisabled()) { 5263 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5264 "ScalableVectorizationDisabled", ORE, TheLoop); 5265 return ElementCount::getScalable(0); 5266 } 5267 5268 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 5269 5270 auto MaxScalableVF = ElementCount::getScalable( 5271 std::numeric_limits<ElementCount::ScalarTy>::max()); 5272 5273 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5274 // FIXME: While for scalable vectors this is currently sufficient, this should 5275 // be replaced by a more detailed mechanism that filters out specific VFs, 5276 // instead of invalidating vectorization for a whole set of VFs based on the 5277 // MaxVF. 5278 5279 // Disable scalable vectorization if the loop contains unsupported reductions. 5280 if (!canVectorizeReductions(MaxScalableVF)) { 5281 reportVectorizationInfo( 5282 "Scalable vectorization not supported for the reduction " 5283 "operations found in this loop.", 5284 "ScalableVFUnfeasible", ORE, TheLoop); 5285 return ElementCount::getScalable(0); 5286 } 5287 5288 // Disable scalable vectorization if the loop contains any instructions 5289 // with element types not supported for scalable vectors. 
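// For example (illustrative): if the loop contains
//
//   %x = fadd x86_fp80 %a, %b
//
// and the target reports that x86_fp80 is not a legal element type for its
// scalable vectors, scalable vectorization is disabled and only fixed-width
// VFs remain candidates.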
5290 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 5291 return !Ty->isVoidTy() && 5292 !this->TTI.isElementTypeLegalForScalableVector(Ty); 5293 })) { 5294 reportVectorizationInfo("Scalable vectorization is not supported " 5295 "for all element types found in this loop.", 5296 "ScalableVFUnfeasible", ORE, TheLoop); 5297 return ElementCount::getScalable(0); 5298 } 5299 5300 if (Legal->isSafeForAnyVectorWidth()) 5301 return MaxScalableVF; 5302 5303 // Limit MaxScalableVF by the maximum safe dependence distance. 5304 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5305 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 5306 MaxVScale = 5307 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 5308 MaxScalableVF = ElementCount::getScalable( 5309 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5310 if (!MaxScalableVF) 5311 reportVectorizationInfo( 5312 "Max legal vector width too small, scalable vectorization " 5313 "unfeasible.", 5314 "ScalableVFUnfeasible", ORE, TheLoop); 5315 5316 return MaxScalableVF; 5317 } 5318 5319 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 5320 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 5321 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5322 unsigned SmallestType, WidestType; 5323 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5324 5325 // Get the maximum safe dependence distance in bits computed by LAA. 5326 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5327 // the memory accesses that is most restrictive (involved in the smallest 5328 // dependence distance). 5329 unsigned MaxSafeElements = 5330 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5331 5332 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5333 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5334 5335 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5336 << ".\n"); 5337 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5338 << ".\n"); 5339 5340 // First analyze the UserVF, fall back if the UserVF should be ignored. 5341 if (UserVF) { 5342 auto MaxSafeUserVF = 5343 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5344 5345 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5346 // If `VF=vscale x N` is safe, then so is `VF=N` 5347 if (UserVF.isScalable()) 5348 return FixedScalableVFPair( 5349 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5350 else 5351 return UserVF; 5352 } 5353 5354 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5355 5356 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5357 // is better to ignore the hint and let the compiler choose a suitable VF. 
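// For example (the numbers are chosen only for illustration): with a maximum
// safe fixed VF of 4 due to a dependence distance, a user request of
// vectorize_width(8) is clamped to VF = 4 and a remark is emitted, whereas a
// request of vectorize_width(8, scalable) is ignored entirely and the
// compiler picks its own VF.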
5358 if (!UserVF.isScalable()) { 5359 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5360 << " is unsafe, clamping to max safe VF=" 5361 << MaxSafeFixedVF << ".\n"); 5362 ORE->emit([&]() { 5363 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5364 TheLoop->getStartLoc(), 5365 TheLoop->getHeader()) 5366 << "User-specified vectorization factor " 5367 << ore::NV("UserVectorizationFactor", UserVF) 5368 << " is unsafe, clamping to maximum safe vectorization factor " 5369 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5370 }); 5371 return MaxSafeFixedVF; 5372 } 5373 5374 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5375 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5376 << " is ignored because scalable vectors are not " 5377 "available.\n"); 5378 ORE->emit([&]() { 5379 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5380 TheLoop->getStartLoc(), 5381 TheLoop->getHeader()) 5382 << "User-specified vectorization factor " 5383 << ore::NV("UserVectorizationFactor", UserVF) 5384 << " is ignored because the target does not support scalable " 5385 "vectors. The compiler will pick a more suitable value."; 5386 }); 5387 } else { 5388 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5389 << " is unsafe. Ignoring scalable UserVF.\n"); 5390 ORE->emit([&]() { 5391 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5392 TheLoop->getStartLoc(), 5393 TheLoop->getHeader()) 5394 << "User-specified vectorization factor " 5395 << ore::NV("UserVectorizationFactor", UserVF) 5396 << " is unsafe. Ignoring the hint to let the compiler pick a " 5397 "more suitable value."; 5398 }); 5399 } 5400 } 5401 5402 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5403 << " / " << WidestType << " bits.\n"); 5404 5405 FixedScalableVFPair Result(ElementCount::getFixed(1), 5406 ElementCount::getScalable(0)); 5407 if (auto MaxVF = 5408 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5409 MaxSafeFixedVF, FoldTailByMasking)) 5410 Result.FixedVF = MaxVF; 5411 5412 if (auto MaxVF = 5413 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5414 MaxSafeScalableVF, FoldTailByMasking)) 5415 if (MaxVF.isScalable()) { 5416 Result.ScalableVF = MaxVF; 5417 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5418 << "\n"); 5419 } 5420 5421 return Result; 5422 } 5423 5424 FixedScalableVFPair 5425 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5426 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5427 // TODO: It may by useful to do since it's still likely to be dynamically 5428 // uniform if the target can skip. 5429 reportVectorizationFailure( 5430 "Not inserting runtime ptr check for divergent target", 5431 "runtime pointer checks needed. 
Not enabled for divergent target",
5432         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5433     return FixedScalableVFPair::getNone();
5434   }
5435
5436   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5437   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5438   if (TC == 1) {
5439     reportVectorizationFailure("Single iteration (non) loop",
5440         "loop trip count is one, irrelevant for vectorization",
5441         "SingleIterationLoop", ORE, TheLoop);
5442     return FixedScalableVFPair::getNone();
5443   }
5444
5445   switch (ScalarEpilogueStatus) {
5446   case CM_ScalarEpilogueAllowed:
5447     return computeFeasibleMaxVF(TC, UserVF, false);
5448   case CM_ScalarEpilogueNotAllowedUsePredicate:
5449     LLVM_FALLTHROUGH;
5450   case CM_ScalarEpilogueNotNeededUsePredicate:
5451     LLVM_DEBUG(
5452         dbgs() << "LV: vector predicate hint/switch found.\n"
5453                << "LV: Not allowing scalar epilogue, creating predicated "
5454                << "vector loop.\n");
5455     break;
5456   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5457     // fallthrough as a special case of OptForSize
5458   case CM_ScalarEpilogueNotAllowedOptSize:
5459     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5460       LLVM_DEBUG(
5461           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5462     else
5463       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5464                         << "count.\n");
5465
5466     // Bail if runtime checks are required, which are not good when optimising
5467     // for size.
5468     if (runtimeChecksRequired())
5469       return FixedScalableVFPair::getNone();
5470
5471     break;
5472   }
5473
5474   // The only loops we can vectorize without a scalar epilogue are loops with
5475   // a bottom-test and a single exiting block. We'd have to handle the fact
5476   // that not every instruction executes on the last iteration. This will
5477   // require a lane mask which varies through the vector loop body. (TODO)
5478   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5479     // If there was a tail-folding hint/switch, but we can't fold the tail by
5480     // masking, fall back to a vectorization with a scalar epilogue.
5481     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5482       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5483                            "scalar epilogue instead.\n");
5484       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5485       return computeFeasibleMaxVF(TC, UserVF, false);
5486     }
5487     return FixedScalableVFPair::getNone();
5488   }
5489
5490   // Now try the tail folding.
5491
5492   // Invalidate interleave groups that require an epilogue if we can't mask
5493   // the interleave-group.
5494   if (!useMaskedInterleavedAccesses(TTI)) {
5495     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5496            "No decisions should have been taken at this point");
5497     // Note: There is no need to invalidate any cost modeling decisions here, as
5498     // none were taken so far.
5499     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5500   }
5501
5502   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5503   // Avoid tail folding if the trip count is known to be a multiple of any VF
5504   // we chose.
5505   // FIXME: The condition below pessimises the case for fixed-width vectors,
5506   // when scalable VFs are also candidates for vectorization.
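  // For example (illustrative values): with a known trip count of 64, a max
  // fixed VF of 8 and a user interleave count of 2, 64 % (8 * 2) == 0, so no
  // tail remains and tail folding is not needed for the fixed-width plan.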
5507 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5508 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5509 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5510 "MaxFixedVF must be a power of 2"); 5511 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5512 : MaxFixedVF.getFixedValue(); 5513 ScalarEvolution *SE = PSE.getSE(); 5514 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5515 const SCEV *ExitCount = SE->getAddExpr( 5516 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5517 const SCEV *Rem = SE->getURemExpr( 5518 SE->applyLoopGuards(ExitCount, TheLoop), 5519 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5520 if (Rem->isZero()) { 5521 // Accept MaxFixedVF if we do not have a tail. 5522 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5523 return MaxFactors; 5524 } 5525 } 5526 5527 // For scalable vectors don't use tail folding for low trip counts or 5528 // optimizing for code size. We only permit this if the user has explicitly 5529 // requested it. 5530 if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate && 5531 ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate && 5532 MaxFactors.ScalableVF.isVector()) 5533 MaxFactors.ScalableVF = ElementCount::getScalable(0); 5534 5535 // If we don't know the precise trip count, or if the trip count that we 5536 // found modulo the vectorization factor is not zero, try to fold the tail 5537 // by masking. 5538 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5539 if (Legal->prepareToFoldTailByMasking()) { 5540 FoldTailByMasking = true; 5541 return MaxFactors; 5542 } 5543 5544 // If there was a tail-folding hint/switch, but we can't fold the tail by 5545 // masking, fallback to a vectorization with a scalar epilogue. 5546 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5547 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5548 "scalar epilogue instead.\n"); 5549 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5550 return MaxFactors; 5551 } 5552 5553 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5554 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5555 return FixedScalableVFPair::getNone(); 5556 } 5557 5558 if (TC == 0) { 5559 reportVectorizationFailure( 5560 "Unable to calculate the loop count due to complex control flow", 5561 "unable to calculate the loop count due to complex control flow", 5562 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5563 return FixedScalableVFPair::getNone(); 5564 } 5565 5566 reportVectorizationFailure( 5567 "Cannot optimize for size and vectorize at the same time.", 5568 "cannot optimize for size and vectorize at the same time. " 5569 "Enable vectorization of this loop with '#pragma clang loop " 5570 "vectorize(enable)' when compiling with -Os/-Oz", 5571 "NoTailLoopWithOptForSize", ORE, TheLoop); 5572 return FixedScalableVFPair::getNone(); 5573 } 5574 5575 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5576 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5577 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5578 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5579 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5580 ComputeScalableMaxVF ? 
TargetTransformInfo::RGK_ScalableVector 5581 : TargetTransformInfo::RGK_FixedWidthVector); 5582 5583 // Convenience function to return the minimum of two ElementCounts. 5584 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5585 assert((LHS.isScalable() == RHS.isScalable()) && 5586 "Scalable flags must match"); 5587 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5588 }; 5589 5590 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5591 // Note that both WidestRegister and WidestType may not be a powers of 2. 5592 auto MaxVectorElementCount = ElementCount::get( 5593 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5594 ComputeScalableMaxVF); 5595 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5596 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5597 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5598 5599 if (!MaxVectorElementCount) { 5600 LLVM_DEBUG(dbgs() << "LV: The target has no " 5601 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5602 << " vector registers.\n"); 5603 return ElementCount::getFixed(1); 5604 } 5605 5606 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5607 if (ConstTripCount && 5608 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5609 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5610 // If loop trip count (TC) is known at compile time there is no point in 5611 // choosing VF greater than TC (as done in the loop below). Select maximum 5612 // power of two which doesn't exceed TC. 5613 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5614 // when the TC is less than or equal to the known number of lanes. 5615 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5616 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5617 "exceeding the constant trip count: " 5618 << ClampedConstTripCount << "\n"); 5619 return ElementCount::getFixed(ClampedConstTripCount); 5620 } 5621 5622 ElementCount MaxVF = MaxVectorElementCount; 5623 if (TTI.shouldMaximizeVectorBandwidth() || 5624 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5625 auto MaxVectorElementCountMaxBW = ElementCount::get( 5626 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5627 ComputeScalableMaxVF); 5628 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5629 5630 // Collect all viable vectorization factors larger than the default MaxVF 5631 // (i.e. MaxVectorElementCount). 5632 SmallVector<ElementCount, 8> VFs; 5633 for (ElementCount VS = MaxVectorElementCount * 2; 5634 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5635 VFs.push_back(VS); 5636 5637 // For each VF calculate its register usage. 5638 auto RUs = calculateRegisterUsage(VFs); 5639 5640 // Select the largest VF which doesn't require more registers than existing 5641 // ones. 
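    // For example (illustrative values): if the candidate VFs are 8 and 16 but
    // VF=16 would keep more values live than some register class provides, the
    // scan below settles on VF=8.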
5642 for (int i = RUs.size() - 1; i >= 0; --i) { 5643 bool Selected = true; 5644 for (auto &pair : RUs[i].MaxLocalUsers) { 5645 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5646 if (pair.second > TargetNumRegisters) 5647 Selected = false; 5648 } 5649 if (Selected) { 5650 MaxVF = VFs[i]; 5651 break; 5652 } 5653 } 5654 if (ElementCount MinVF = 5655 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5656 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5657 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5658 << ") with target's minimum: " << MinVF << '\n'); 5659 MaxVF = MinVF; 5660 } 5661 } 5662 } 5663 return MaxVF; 5664 } 5665 5666 bool LoopVectorizationCostModel::isMoreProfitable( 5667 const VectorizationFactor &A, const VectorizationFactor &B) const { 5668 InstructionCost CostA = A.Cost; 5669 InstructionCost CostB = B.Cost; 5670 5671 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5672 5673 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5674 MaxTripCount) { 5675 // If we are folding the tail and the trip count is a known (possibly small) 5676 // constant, the trip count will be rounded up to an integer number of 5677 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5678 // which we compare directly. When not folding the tail, the total cost will 5679 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5680 // approximated with the per-lane cost below instead of using the tripcount 5681 // as here. 5682 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5683 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5684 return RTCostA < RTCostB; 5685 } 5686 5687 // Improve estimate for the vector width if it is scalable. 5688 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5689 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5690 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) { 5691 if (A.Width.isScalable()) 5692 EstimatedWidthA *= VScale.getValue(); 5693 if (B.Width.isScalable()) 5694 EstimatedWidthB *= VScale.getValue(); 5695 } 5696 5697 // Assume vscale may be larger than 1 (or the value being tuned for), 5698 // so that scalable vectorization is slightly favorable over fixed-width 5699 // vectorization. 5700 if (A.Width.isScalable() && !B.Width.isScalable()) 5701 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5702 5703 // To avoid the need for FP division: 5704 // (CostA / A.Width) < (CostB / B.Width) 5705 // <=> (CostA * B.Width) < (CostB * A.Width) 5706 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5707 } 5708 5709 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5710 const ElementCountSet &VFCandidates) { 5711 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5712 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5713 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5714 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5715 "Expected Scalar VF to be a candidate"); 5716 5717 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5718 VectorizationFactor ChosenFactor = ScalarCost; 5719 5720 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5721 if (ForceVectorization && VFCandidates.size() > 1) { 5722 // Ignore scalar width, because the user explicitly wants vectorization. 
5723 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5724 // evaluation. 5725 ChosenFactor.Cost = InstructionCost::getMax(); 5726 } 5727 5728 SmallVector<InstructionVFPair> InvalidCosts; 5729 for (const auto &i : VFCandidates) { 5730 // The cost for scalar VF=1 is already calculated, so ignore it. 5731 if (i.isScalar()) 5732 continue; 5733 5734 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5735 VectorizationFactor Candidate(i, C.first); 5736 5737 #ifndef NDEBUG 5738 unsigned AssumedMinimumVscale = 1; 5739 if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) 5740 AssumedMinimumVscale = VScale.getValue(); 5741 unsigned Width = 5742 Candidate.Width.isScalable() 5743 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5744 : Candidate.Width.getFixedValue(); 5745 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5746 << " costs: " << (Candidate.Cost / Width)); 5747 if (i.isScalable()) 5748 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5749 << AssumedMinimumVscale << ")"); 5750 LLVM_DEBUG(dbgs() << ".\n"); 5751 #endif 5752 5753 if (!C.second && !ForceVectorization) { 5754 LLVM_DEBUG( 5755 dbgs() << "LV: Not considering vector loop of width " << i 5756 << " because it will not generate any vector instructions.\n"); 5757 continue; 5758 } 5759 5760 // If profitable add it to ProfitableVF list. 5761 if (isMoreProfitable(Candidate, ScalarCost)) 5762 ProfitableVFs.push_back(Candidate); 5763 5764 if (isMoreProfitable(Candidate, ChosenFactor)) 5765 ChosenFactor = Candidate; 5766 } 5767 5768 // Emit a report of VFs with invalid costs in the loop. 5769 if (!InvalidCosts.empty()) { 5770 // Group the remarks per instruction, keeping the instruction order from 5771 // InvalidCosts. 5772 std::map<Instruction *, unsigned> Numbering; 5773 unsigned I = 0; 5774 for (auto &Pair : InvalidCosts) 5775 if (!Numbering.count(Pair.first)) 5776 Numbering[Pair.first] = I++; 5777 5778 // Sort the list, first on instruction(number) then on VF. 5779 llvm::sort(InvalidCosts, 5780 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5781 if (Numbering[A.first] != Numbering[B.first]) 5782 return Numbering[A.first] < Numbering[B.first]; 5783 ElementCountComparator ECC; 5784 return ECC(A.second, B.second); 5785 }); 5786 5787 // For a list of ordered instruction-vf pairs: 5788 // [(load, vf1), (load, vf2), (store, vf1)] 5789 // Group the instructions together to emit separate remarks for: 5790 // load (vf1, vf2) 5791 // store (vf1) 5792 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5793 auto Subset = ArrayRef<InstructionVFPair>(); 5794 do { 5795 if (Subset.empty()) 5796 Subset = Tail.take_front(1); 5797 5798 Instruction *I = Subset.front().first; 5799 5800 // If the next instruction is different, or if there are no other pairs, 5801 // emit a remark for the collated subset. e.g. 5802 // [(load, vf1), (load, vf2))] 5803 // to emit: 5804 // remark: invalid costs for 'load' at VF=(vf, vf2) 5805 if (Subset == Tail || Tail[Subset.size()].first != I) { 5806 std::string OutString; 5807 raw_string_ostream OS(OutString); 5808 assert(!Subset.empty() && "Unexpected empty range"); 5809 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5810 for (auto &Pair : Subset) 5811 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5812 << Pair.second; 5813 OS << "):"; 5814 if (auto *CI = dyn_cast<CallInst>(I)) 5815 OS << " call to " << CI->getCalledFunction()->getName(); 5816 else 5817 OS << " " << I->getOpcodeName(); 5818 OS.flush(); 5819 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5820 Tail = Tail.drop_front(Subset.size()); 5821 Subset = {}; 5822 } else 5823 // Grow the subset by one element 5824 Subset = Tail.take_front(Subset.size() + 1); 5825 } while (!Tail.empty()); 5826 } 5827 5828 if (!EnableCondStoresVectorization && NumPredStores) { 5829 reportVectorizationFailure("There are conditional stores.", 5830 "store that is conditionally executed prevents vectorization", 5831 "ConditionalStore", ORE, TheLoop); 5832 ChosenFactor = ScalarCost; 5833 } 5834 5835 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5836 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5837 << "LV: Vectorization seems to be not beneficial, " 5838 << "but was forced by a user.\n"); 5839 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5840 return ChosenFactor; 5841 } 5842 5843 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5844 const Loop &L, ElementCount VF) const { 5845 // Cross iteration phis such as reductions need special handling and are 5846 // currently unsupported. 5847 if (any_of(L.getHeader()->phis(), 5848 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5849 return false; 5850 5851 // Phis with uses outside of the loop require special handling and are 5852 // currently unsupported. 5853 for (auto &Entry : Legal->getInductionVars()) { 5854 // Look for uses of the value of the induction at the last iteration. 5855 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5856 for (User *U : PostInc->users()) 5857 if (!L.contains(cast<Instruction>(U))) 5858 return false; 5859 // Look for uses of penultimate value of the induction. 5860 for (User *U : Entry.first->users()) 5861 if (!L.contains(cast<Instruction>(U))) 5862 return false; 5863 } 5864 5865 // Induction variables that are widened require special handling that is 5866 // currently not supported. 5867 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5868 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5869 this->isProfitableToScalarize(Entry.first, VF)); 5870 })) 5871 return false; 5872 5873 // Epilogue vectorization code has not been auditted to ensure it handles 5874 // non-latch exits properly. It may be fine, but it needs auditted and 5875 // tested. 5876 if (L.getExitingBlock() != L.getLoopLatch()) 5877 return false; 5878 5879 return true; 5880 } 5881 5882 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5883 const ElementCount VF) const { 5884 // FIXME: We need a much better cost-model to take different parameters such 5885 // as register pressure, code size increase and cost of extra branches into 5886 // account. For now we apply a very crude heuristic and only consider loops 5887 // with vectorization factors larger than a certain value. 5888 // We also consider epilogue vectorization unprofitable for targets that don't 5889 // consider interleaving beneficial (eg. MVE). 
5890 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5891 return false; 5892 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5893 return true; 5894 return false; 5895 } 5896 5897 VectorizationFactor 5898 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5899 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5900 VectorizationFactor Result = VectorizationFactor::Disabled(); 5901 if (!EnableEpilogueVectorization) { 5902 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5903 return Result; 5904 } 5905 5906 if (!isScalarEpilogueAllowed()) { 5907 LLVM_DEBUG( 5908 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5909 "allowed.\n";); 5910 return Result; 5911 } 5912 5913 // Not really a cost consideration, but check for unsupported cases here to 5914 // simplify the logic. 5915 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5916 LLVM_DEBUG( 5917 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5918 "not a supported candidate.\n";); 5919 return Result; 5920 } 5921 5922 if (EpilogueVectorizationForceVF > 1) { 5923 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5924 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5925 if (LVP.hasPlanWithVF(ForcedEC)) 5926 return {ForcedEC, 0}; 5927 else { 5928 LLVM_DEBUG( 5929 dbgs() 5930 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5931 return Result; 5932 } 5933 } 5934 5935 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5936 TheLoop->getHeader()->getParent()->hasMinSize()) { 5937 LLVM_DEBUG( 5938 dbgs() 5939 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5940 return Result; 5941 } 5942 5943 auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5944 if (MainLoopVF.isScalable()) 5945 LLVM_DEBUG( 5946 dbgs() << "LEV: Epilogue vectorization using scalable vectors not " 5947 "yet supported. Converting to fixed-width (VF=" 5948 << FixedMainLoopVF << ") instead\n"); 5949 5950 if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) { 5951 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5952 "this loop\n"); 5953 return Result; 5954 } 5955 5956 for (auto &NextVF : ProfitableVFs) 5957 if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) && 5958 (Result.Width.getFixedValue() == 1 || 5959 isMoreProfitable(NextVF, Result)) && 5960 LVP.hasPlanWithVF(NextVF.Width)) 5961 Result = NextVF; 5962 5963 if (Result != VectorizationFactor::Disabled()) 5964 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5965 << Result.Width.getFixedValue() << "\n";); 5966 return Result; 5967 } 5968 5969 std::pair<unsigned, unsigned> 5970 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5971 unsigned MinWidth = -1U; 5972 unsigned MaxWidth = 8; 5973 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5974 // For in-loop reductions, no element types are added to ElementTypesInLoop 5975 // if there are no loads/stores in the loop. In this case, check through the 5976 // reduction variables to determine the maximum width. 5977 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5978 // Reset MaxWidth so that we can find the smallest type used by recurrences 5979 // in the loop. 
5980 MaxWidth = -1U; 5981 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5982 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5983 // When finding the min width used by the recurrence we need to account 5984 // for casts on the input operands of the recurrence. 5985 MaxWidth = std::min<unsigned>( 5986 MaxWidth, std::min<unsigned>( 5987 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5988 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5989 } 5990 } else { 5991 for (Type *T : ElementTypesInLoop) { 5992 MinWidth = std::min<unsigned>( 5993 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5994 MaxWidth = std::max<unsigned>( 5995 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5996 } 5997 } 5998 return {MinWidth, MaxWidth}; 5999 } 6000 6001 void LoopVectorizationCostModel::collectElementTypesForWidening() { 6002 ElementTypesInLoop.clear(); 6003 // For each block. 6004 for (BasicBlock *BB : TheLoop->blocks()) { 6005 // For each instruction in the loop. 6006 for (Instruction &I : BB->instructionsWithoutDebug()) { 6007 Type *T = I.getType(); 6008 6009 // Skip ignored values. 6010 if (ValuesToIgnore.count(&I)) 6011 continue; 6012 6013 // Only examine Loads, Stores and PHINodes. 6014 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6015 continue; 6016 6017 // Examine PHI nodes that are reduction variables. Update the type to 6018 // account for the recurrence type. 6019 if (auto *PN = dyn_cast<PHINode>(&I)) { 6020 if (!Legal->isReductionVariable(PN)) 6021 continue; 6022 const RecurrenceDescriptor &RdxDesc = 6023 Legal->getReductionVars().find(PN)->second; 6024 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6025 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6026 RdxDesc.getRecurrenceType(), 6027 TargetTransformInfo::ReductionFlags())) 6028 continue; 6029 T = RdxDesc.getRecurrenceType(); 6030 } 6031 6032 // Examine the stored values. 6033 if (auto *ST = dyn_cast<StoreInst>(&I)) 6034 T = ST->getValueOperand()->getType(); 6035 6036 assert(T->isSized() && 6037 "Expected the load/store/recurrence type to be sized"); 6038 6039 ElementTypesInLoop.insert(T); 6040 } 6041 } 6042 } 6043 6044 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6045 unsigned LoopCost) { 6046 // -- The interleave heuristics -- 6047 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6048 // There are many micro-architectural considerations that we can't predict 6049 // at this level. For example, frontend pressure (on decode or fetch) due to 6050 // code size, or the number and capabilities of the execution ports. 6051 // 6052 // We use the following heuristics to select the interleave count: 6053 // 1. If the code has reductions, then we interleave to break the cross 6054 // iteration dependency. 6055 // 2. If the loop is really small, then we interleave to reduce the loop 6056 // overhead. 6057 // 3. We don't interleave if we think that we will spill registers to memory 6058 // due to the increased register pressure. 6059 6060 if (!isScalarEpilogueAllowed()) 6061 return 1; 6062 6063 // We used the distance for the interleave count. 6064 if (Legal->getMaxSafeDepDistBytes() != -1U) 6065 return 1; 6066 6067 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6068 const bool HasReductions = !Legal->getReductionVars().empty(); 6069 // Do not interleave loops with a relatively small known or estimated trip 6070 // count. 
But we will interleave when InterleaveSmallLoopScalarReduction is 6071 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6072 // because with the above conditions interleaving can expose ILP and break 6073 // cross iteration dependences for reductions. 6074 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6075 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6076 return 1; 6077 6078 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6079 // We divide by these constants so assume that we have at least one 6080 // instruction that uses at least one register. 6081 for (auto& pair : R.MaxLocalUsers) { 6082 pair.second = std::max(pair.second, 1U); 6083 } 6084 6085 // We calculate the interleave count using the following formula. 6086 // Subtract the number of loop invariants from the number of available 6087 // registers. These registers are used by all of the interleaved instances. 6088 // Next, divide the remaining registers by the number of registers that is 6089 // required by the loop, in order to estimate how many parallel instances 6090 // fit without causing spills. All of this is rounded down if necessary to be 6091 // a power of two. We want power of two interleave count to simplify any 6092 // addressing operations or alignment considerations. 6093 // We also want power of two interleave counts to ensure that the induction 6094 // variable of the vector loop wraps to zero, when tail is folded by masking; 6095 // this currently happens when OptForSize, in which case IC is set to 1 above. 6096 unsigned IC = UINT_MAX; 6097 6098 for (auto& pair : R.MaxLocalUsers) { 6099 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6100 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6101 << " registers of " 6102 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6103 if (VF.isScalar()) { 6104 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6105 TargetNumRegisters = ForceTargetNumScalarRegs; 6106 } else { 6107 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6108 TargetNumRegisters = ForceTargetNumVectorRegs; 6109 } 6110 unsigned MaxLocalUsers = pair.second; 6111 unsigned LoopInvariantRegs = 0; 6112 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6113 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6114 6115 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6116 // Don't count the induction variable as interleaved. 6117 if (EnableIndVarRegisterHeur) { 6118 TmpIC = 6119 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6120 std::max(1U, (MaxLocalUsers - 1))); 6121 } 6122 6123 IC = std::min(IC, TmpIC); 6124 } 6125 6126 // Clamp the interleave ranges to reasonable counts. 6127 unsigned MaxInterleaveCount = 6128 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6129 6130 // Check if the user has overridden the max. 6131 if (VF.isScalar()) { 6132 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6133 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6134 } else { 6135 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6136 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6137 } 6138 6139 // If trip count is known or estimated compile time constant, limit the 6140 // interleave count to be less than the trip count divided by VF, provided it 6141 // is at least 1. 6142 // 6143 // For scalable vectors we can't know if interleaving is beneficial. 
It may 6144 // not be beneficial for small loops if none of the lanes in the second vector 6145 // iterations is enabled. However, for larger loops, there is likely to be a 6146 // similar benefit as for fixed-width vectors. For now, we choose to leave 6147 // the InterleaveCount as if vscale is '1', although if some information about 6148 // the vector is known (e.g. min vector size), we can make a better decision. 6149 if (BestKnownTC) { 6150 MaxInterleaveCount = 6151 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6152 // Make sure MaxInterleaveCount is greater than 0. 6153 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6154 } 6155 6156 assert(MaxInterleaveCount > 0 && 6157 "Maximum interleave count must be greater than 0"); 6158 6159 // Clamp the calculated IC to be between the 1 and the max interleave count 6160 // that the target and trip count allows. 6161 if (IC > MaxInterleaveCount) 6162 IC = MaxInterleaveCount; 6163 else 6164 // Make sure IC is greater than 0. 6165 IC = std::max(1u, IC); 6166 6167 assert(IC > 0 && "Interleave count must be greater than 0."); 6168 6169 // If we did not calculate the cost for VF (because the user selected the VF) 6170 // then we calculate the cost of VF here. 6171 if (LoopCost == 0) { 6172 InstructionCost C = expectedCost(VF).first; 6173 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 6174 LoopCost = *C.getValue(); 6175 } 6176 6177 assert(LoopCost && "Non-zero loop cost expected"); 6178 6179 // Interleave if we vectorized this loop and there is a reduction that could 6180 // benefit from interleaving. 6181 if (VF.isVector() && HasReductions) { 6182 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6183 return IC; 6184 } 6185 6186 // Note that if we've already vectorized the loop we will have done the 6187 // runtime check and so interleaving won't require further checks. 6188 bool InterleavingRequiresRuntimePointerCheck = 6189 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6190 6191 // We want to interleave small loops in order to reduce the loop overhead and 6192 // potentially expose ILP opportunities. 6193 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6194 << "LV: IC is " << IC << '\n' 6195 << "LV: VF is " << VF << '\n'); 6196 const bool AggressivelyInterleaveReductions = 6197 TTI.enableAggressiveInterleaving(HasReductions); 6198 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6199 // We assume that the cost overhead is 1 and we use the cost model 6200 // to estimate the cost of the loop and interleave until the cost of the 6201 // loop overhead is about 5% of the cost of the loop. 6202 unsigned SmallIC = 6203 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6204 6205 // Interleave until store/load ports (estimated by max interleave count) are 6206 // saturated. 6207 unsigned NumStores = Legal->getNumStores(); 6208 unsigned NumLoads = Legal->getNumLoads(); 6209 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6210 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6211 6212 // There is little point in interleaving for reductions containing selects 6213 // and compares when VF=1 since it may just create more overhead than it's 6214 // worth for loops with small trip counts. This is because we still have to 6215 // do the final reduction after the loop. 
6216 bool HasSelectCmpReductions = 6217 HasReductions && 6218 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6219 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6220 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 6221 RdxDesc.getRecurrenceKind()); 6222 }); 6223 if (HasSelectCmpReductions) { 6224 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 6225 return 1; 6226 } 6227 6228 // If we have a scalar reduction (vector reductions are already dealt with 6229 // by this point), we can increase the critical path length if the loop 6230 // we're interleaving is inside another loop. For tree-wise reductions 6231 // set the limit to 2, and for ordered reductions it's best to disable 6232 // interleaving entirely. 6233 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6234 bool HasOrderedReductions = 6235 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6236 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6237 return RdxDesc.isOrdered(); 6238 }); 6239 if (HasOrderedReductions) { 6240 LLVM_DEBUG( 6241 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6242 return 1; 6243 } 6244 6245 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6246 SmallIC = std::min(SmallIC, F); 6247 StoresIC = std::min(StoresIC, F); 6248 LoadsIC = std::min(LoadsIC, F); 6249 } 6250 6251 if (EnableLoadStoreRuntimeInterleave && 6252 std::max(StoresIC, LoadsIC) > SmallIC) { 6253 LLVM_DEBUG( 6254 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6255 return std::max(StoresIC, LoadsIC); 6256 } 6257 6258 // If there are scalar reductions and TTI has enabled aggressive 6259 // interleaving for reductions, we will interleave to expose ILP. 6260 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6261 AggressivelyInterleaveReductions) { 6262 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6263 // Interleave no less than SmallIC but not as aggressive as the normal IC 6264 // to satisfy the rare situation when resources are too limited. 6265 return std::max(IC / 2, SmallIC); 6266 } else { 6267 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6268 return SmallIC; 6269 } 6270 } 6271 6272 // Interleave if this is a large loop (small loops are already dealt with by 6273 // this point) that could benefit from interleaving. 6274 if (AggressivelyInterleaveReductions) { 6275 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6276 return IC; 6277 } 6278 6279 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6280 return 1; 6281 } 6282 6283 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6284 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6285 // This function calculates the register usage by measuring the highest number 6286 // of values that are alive at a single location. Obviously, this is a very 6287 // rough estimation. We scan the loop in a topological order in order and 6288 // assign a number to each instruction. We use RPO to ensure that defs are 6289 // met before their users. We assume that each instruction that has in-loop 6290 // users starts an interval. We record every time that an in-loop value is 6291 // used, so we have a list of the first and last occurrences of each 6292 // instruction. Next, we transpose this data structure into a multi map that 6293 // holds the list of intervals that *end* at a specific location. This multi 6294 // map allows us to perform a linear search. 
We scan the instructions linearly 6295 // and record each time that a new interval starts, by placing it in a set. 6296 // If we find this value in the multi-map then we remove it from the set. 6297 // The max register usage is the maximum size of the set. 6298 // We also search for instructions that are defined outside the loop, but are 6299 // used inside the loop. We need this number separately from the max-interval 6300 // usage number because when we unroll, loop-invariant values do not take 6301 // more register. 6302 LoopBlocksDFS DFS(TheLoop); 6303 DFS.perform(LI); 6304 6305 RegisterUsage RU; 6306 6307 // Each 'key' in the map opens a new interval. The values 6308 // of the map are the index of the 'last seen' usage of the 6309 // instruction that is the key. 6310 using IntervalMap = DenseMap<Instruction *, unsigned>; 6311 6312 // Maps instruction to its index. 6313 SmallVector<Instruction *, 64> IdxToInstr; 6314 // Marks the end of each interval. 6315 IntervalMap EndPoint; 6316 // Saves the list of instruction indices that are used in the loop. 6317 SmallPtrSet<Instruction *, 8> Ends; 6318 // Saves the list of values that are used in the loop but are 6319 // defined outside the loop, such as arguments and constants. 6320 SmallPtrSet<Value *, 8> LoopInvariants; 6321 6322 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6323 for (Instruction &I : BB->instructionsWithoutDebug()) { 6324 IdxToInstr.push_back(&I); 6325 6326 // Save the end location of each USE. 6327 for (Value *U : I.operands()) { 6328 auto *Instr = dyn_cast<Instruction>(U); 6329 6330 // Ignore non-instruction values such as arguments, constants, etc. 6331 if (!Instr) 6332 continue; 6333 6334 // If this instruction is outside the loop then record it and continue. 6335 if (!TheLoop->contains(Instr)) { 6336 LoopInvariants.insert(Instr); 6337 continue; 6338 } 6339 6340 // Overwrite previous end points. 6341 EndPoint[Instr] = IdxToInstr.size(); 6342 Ends.insert(Instr); 6343 } 6344 } 6345 } 6346 6347 // Saves the list of intervals that end with the index in 'key'. 6348 using InstrList = SmallVector<Instruction *, 2>; 6349 DenseMap<unsigned, InstrList> TransposeEnds; 6350 6351 // Transpose the EndPoints to a list of values that end at each index. 6352 for (auto &Interval : EndPoint) 6353 TransposeEnds[Interval.second].push_back(Interval.first); 6354 6355 SmallPtrSet<Instruction *, 8> OpenIntervals; 6356 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6357 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6358 6359 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6360 6361 // A lambda that gets the register usage for the given type and VF. 6362 const auto &TTICapture = TTI; 6363 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6364 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6365 return 0; 6366 InstructionCost::CostType RegUsage = 6367 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6368 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() && 6369 "Nonsensical values for register usage."); 6370 return RegUsage; 6371 }; 6372 6373 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6374 Instruction *I = IdxToInstr[i]; 6375 6376 // Remove all of the instructions that end at this location. 6377 InstrList &List = TransposeEnds[i]; 6378 for (Instruction *ToRemove : List) 6379 OpenIntervals.erase(ToRemove); 6380 6381 // Ignore instructions that are never used within the loop. 
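      // That is, I has no users inside the loop, so in this model it never
      // starts a live interval and does not add to the register pressure
      // estimate.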
6382 if (!Ends.count(I)) 6383 continue; 6384 6385 // Skip ignored values. 6386 if (ValuesToIgnore.count(I)) 6387 continue; 6388 6389 // For each VF find the maximum usage of registers. 6390 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6391 // Count the number of live intervals. 6392 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6393 6394 if (VFs[j].isScalar()) { 6395 for (auto Inst : OpenIntervals) { 6396 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6397 if (RegUsage.find(ClassID) == RegUsage.end()) 6398 RegUsage[ClassID] = 1; 6399 else 6400 RegUsage[ClassID] += 1; 6401 } 6402 } else { 6403 collectUniformsAndScalars(VFs[j]); 6404 for (auto Inst : OpenIntervals) { 6405 // Skip ignored values for VF > 1. 6406 if (VecValuesToIgnore.count(Inst)) 6407 continue; 6408 if (isScalarAfterVectorization(Inst, VFs[j])) { 6409 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6410 if (RegUsage.find(ClassID) == RegUsage.end()) 6411 RegUsage[ClassID] = 1; 6412 else 6413 RegUsage[ClassID] += 1; 6414 } else { 6415 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6416 if (RegUsage.find(ClassID) == RegUsage.end()) 6417 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6418 else 6419 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6420 } 6421 } 6422 } 6423 6424 for (auto& pair : RegUsage) { 6425 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6426 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6427 else 6428 MaxUsages[j][pair.first] = pair.second; 6429 } 6430 } 6431 6432 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6433 << OpenIntervals.size() << '\n'); 6434 6435 // Add the current instruction to the list of open intervals. 6436 OpenIntervals.insert(I); 6437 } 6438 6439 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6440 SmallMapVector<unsigned, unsigned, 4> Invariant; 6441 6442 for (auto Inst : LoopInvariants) { 6443 unsigned Usage = 6444 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6445 unsigned ClassID = 6446 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6447 if (Invariant.find(ClassID) == Invariant.end()) 6448 Invariant[ClassID] = Usage; 6449 else 6450 Invariant[ClassID] += Usage; 6451 } 6452 6453 LLVM_DEBUG({ 6454 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6455 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6456 << " item\n"; 6457 for (const auto &pair : MaxUsages[i]) { 6458 dbgs() << "LV(REG): RegisterClass: " 6459 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6460 << " registers\n"; 6461 } 6462 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6463 << " item\n"; 6464 for (const auto &pair : Invariant) { 6465 dbgs() << "LV(REG): RegisterClass: " 6466 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6467 << " registers\n"; 6468 } 6469 }); 6470 6471 RU.LoopInvariantRegs = Invariant; 6472 RU.MaxLocalUsers = MaxUsages[i]; 6473 RUs[i] = RU; 6474 } 6475 6476 return RUs; 6477 } 6478 6479 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6480 ElementCount VF) { 6481 // TODO: Cost model for emulated masked load/store is completely 6482 // broken. This hack guides the cost model to use an artificially 6483 // high enough value to practically disable vectorization with such 6484 // operations, except where previously deployed legality hack allowed 6485 // using very low cost values. 
This is to avoid regressions coming simply
6486   // from moving "masked load/store" check from legality to cost model.
6487   // Masked Load/Gather emulation was previously never allowed.
6488   // Limited number of Masked Store/Scatter emulation was allowed.
6489   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6490   return isa<LoadInst>(I) ||
6491          (isa<StoreInst>(I) &&
6492           NumPredStores > NumberOfStoresToPredicate);
6493 }
6494
6495 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6496   // If we aren't vectorizing the loop, or if we've already collected the
6497   // instructions to scalarize, there's nothing to do. Collection may already
6498   // have occurred if we have a user-selected VF and are now computing the
6499   // expected cost for interleaving.
6500   if (VF.isScalar() || VF.isZero() ||
6501       InstsToScalarize.find(VF) != InstsToScalarize.end())
6502     return;
6503
6504   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6505   // not profitable to scalarize any instructions, the presence of VF in the
6506   // map will indicate that we've analyzed it already.
6507   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6508
6509   // Find all the instructions that are scalar with predication in the loop and
6510   // determine if it would be better to not if-convert the blocks they are in.
6511   // If so, we also record the instructions to scalarize.
6512   for (BasicBlock *BB : TheLoop->blocks()) {
6513     if (!blockNeedsPredicationForAnyReason(BB))
6514       continue;
6515     for (Instruction &I : *BB)
6516       if (isScalarWithPredication(&I, VF)) {
6517         ScalarCostsTy ScalarCosts;
6518         // Do not apply discount if scalable, because that would lead to
6519         // invalid scalarization costs.
6520         // Do not apply discount logic if hacked cost is needed
6521         // for emulated masked memrefs.
6522         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6523             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6524           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6525         // Remember that BB will remain after vectorization.
6526         PredicatedBBsAfterVectorization.insert(BB);
6527       }
6528   }
6529 }
6530
6531 int LoopVectorizationCostModel::computePredInstDiscount(
6532     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6533   assert(!isUniformAfterVectorization(PredInst, VF) &&
6534          "Instruction marked uniform-after-vectorization will be predicated");
6535
6536   // Initialize the discount to zero, meaning that the scalar version and the
6537   // vector version cost the same.
6538   InstructionCost Discount = 0;
6539
6540   // Holds instructions to analyze. The instructions we visit are mapped in
6541   // ScalarCosts. Those instructions are the ones that would be scalarized if
6542   // we find that the scalar version costs less.
6543   SmallVector<Instruction *, 8> Worklist;
6544
6545   // Returns true if the given instruction can be scalarized.
6546   auto canBeScalarized = [&](Instruction *I) -> bool {
6547     // We only attempt to scalarize instructions forming a single-use chain
6548     // from the original predicated block that would otherwise be vectorized.
6549     // Although not strictly necessary, we give up on instructions we know will
6550     // already be scalar to avoid traversing chains that are unlikely to be
6551     // beneficial.
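    // For example (illustrative): an address computation feeding a predicated
    // store through a single-use chain in the same block is a candidate; a
    // value with users in other blocks, or one already known to be scalar
    // after vectorization, is not.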
6552 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6553 isScalarAfterVectorization(I, VF)) 6554 return false; 6555 6556 // If the instruction is scalar with predication, it will be analyzed 6557 // separately. We ignore it within the context of PredInst. 6558 if (isScalarWithPredication(I, VF)) 6559 return false; 6560 6561 // If any of the instruction's operands are uniform after vectorization, 6562 // the instruction cannot be scalarized. This prevents, for example, a 6563 // masked load from being scalarized. 6564 // 6565 // We assume we will only emit a value for lane zero of an instruction 6566 // marked uniform after vectorization, rather than VF identical values. 6567 // Thus, if we scalarize an instruction that uses a uniform, we would 6568 // create uses of values corresponding to the lanes we aren't emitting code 6569 // for. This behavior can be changed by allowing getScalarValue to clone 6570 // the lane zero values for uniforms rather than asserting. 6571 for (Use &U : I->operands()) 6572 if (auto *J = dyn_cast<Instruction>(U.get())) 6573 if (isUniformAfterVectorization(J, VF)) 6574 return false; 6575 6576 // Otherwise, we can scalarize the instruction. 6577 return true; 6578 }; 6579 6580 // Compute the expected cost discount from scalarizing the entire expression 6581 // feeding the predicated instruction. We currently only consider expressions 6582 // that are single-use instruction chains. 6583 Worklist.push_back(PredInst); 6584 while (!Worklist.empty()) { 6585 Instruction *I = Worklist.pop_back_val(); 6586 6587 // If we've already analyzed the instruction, there's nothing to do. 6588 if (ScalarCosts.find(I) != ScalarCosts.end()) 6589 continue; 6590 6591 // Compute the cost of the vector instruction. Note that this cost already 6592 // includes the scalarization overhead of the predicated instruction. 6593 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6594 6595 // Compute the cost of the scalarized instruction. This cost is the cost of 6596 // the instruction as if it wasn't if-converted and instead remained in the 6597 // predicated block. We will scale this cost by block probability after 6598 // computing the scalarization overhead. 6599 InstructionCost ScalarCost = 6600 VF.getFixedValue() * 6601 getInstructionCost(I, ElementCount::getFixed(1)).first; 6602 6603 // Compute the scalarization overhead of needed insertelement instructions 6604 // and phi nodes. 6605 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6606 ScalarCost += TTI.getScalarizationOverhead( 6607 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6608 APInt::getAllOnes(VF.getFixedValue()), true, false); 6609 ScalarCost += 6610 VF.getFixedValue() * 6611 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6612 } 6613 6614 // Compute the scalarization overhead of needed extractelement 6615 // instructions. For each of the instruction's operands, if the operand can 6616 // be scalarized, add it to the worklist; otherwise, account for the 6617 // overhead. 
6618 for (Use &U : I->operands()) 6619 if (auto *J = dyn_cast<Instruction>(U.get())) { 6620 assert(VectorType::isValidElementType(J->getType()) && 6621 "Instruction has non-scalar type"); 6622 if (canBeScalarized(J)) 6623 Worklist.push_back(J); 6624 else if (needsExtract(J, VF)) { 6625 ScalarCost += TTI.getScalarizationOverhead( 6626 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6627 APInt::getAllOnes(VF.getFixedValue()), false, true); 6628 } 6629 } 6630 6631 // Scale the total scalar cost by block probability. 6632 ScalarCost /= getReciprocalPredBlockProb(); 6633 6634 // Compute the discount. A non-negative discount means the vector version 6635 // of the instruction costs more, and scalarizing would be beneficial. 6636 Discount += VectorCost - ScalarCost; 6637 ScalarCosts[I] = ScalarCost; 6638 } 6639 6640 return *Discount.getValue(); 6641 } 6642 6643 LoopVectorizationCostModel::VectorizationCostTy 6644 LoopVectorizationCostModel::expectedCost( 6645 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6646 VectorizationCostTy Cost; 6647 6648 // For each block. 6649 for (BasicBlock *BB : TheLoop->blocks()) { 6650 VectorizationCostTy BlockCost; 6651 6652 // For each instruction in the old loop. 6653 for (Instruction &I : BB->instructionsWithoutDebug()) { 6654 // Skip ignored values. 6655 if (ValuesToIgnore.count(&I) || 6656 (VF.isVector() && VecValuesToIgnore.count(&I))) 6657 continue; 6658 6659 VectorizationCostTy C = getInstructionCost(&I, VF); 6660 6661 // Check if we should override the cost. 6662 if (C.first.isValid() && 6663 ForceTargetInstructionCost.getNumOccurrences() > 0) 6664 C.first = InstructionCost(ForceTargetInstructionCost); 6665 6666 // Keep a list of instructions with invalid costs. 6667 if (Invalid && !C.first.isValid()) 6668 Invalid->emplace_back(&I, VF); 6669 6670 BlockCost.first += C.first; 6671 BlockCost.second |= C.second; 6672 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6673 << " for VF " << VF << " For instruction: " << I 6674 << '\n'); 6675 } 6676 6677 // If we are vectorizing a predicated block, it will have been 6678 // if-converted. This means that the block's instructions (aside from 6679 // stores and instructions that may divide by zero) will now be 6680 // unconditionally executed. For the scalar case, we may not always execute 6681 // the predicated block, if it is an if-else block. Thus, scale the block's 6682 // cost by the probability of executing it. blockNeedsPredication from 6683 // Legal is used so as to not include all blocks in tail folded loops. 6684 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6685 BlockCost.first /= getReciprocalPredBlockProb(); 6686 6687 Cost.first += BlockCost.first; 6688 Cost.second |= BlockCost.second; 6689 } 6690 6691 return Cost; 6692 } 6693 6694 /// Gets Address Access SCEV after verifying that the access pattern 6695 /// is loop invariant except the induction variable dependence. 6696 /// 6697 /// This SCEV can be sent to the Target in order to estimate the address 6698 /// calculation cost. 6699 static const SCEV *getAddressAccessSCEV( 6700 Value *Ptr, 6701 LoopVectorizationLegality *Legal, 6702 PredicatedScalarEvolution &PSE, 6703 const Loop *TheLoop) { 6704 6705 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6706 if (!Gep) 6707 return nullptr; 6708 6709 // We are looking for a gep with all loop invariant indices except for one 6710 // which should be an induction variable. 
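  // For example (illustrative): a GEP computing &A[inv][iv], i.e. one with a
  // loop-invariant index plus a single induction-variable index, is accepted;
  // a GEP with a loop-varying index that is not an induction variable makes
  // this return nullptr.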
6711 auto SE = PSE.getSE(); 6712 unsigned NumOperands = Gep->getNumOperands(); 6713 for (unsigned i = 1; i < NumOperands; ++i) { 6714 Value *Opd = Gep->getOperand(i); 6715 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6716 !Legal->isInductionVariable(Opd)) 6717 return nullptr; 6718 } 6719 6720 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6721 return PSE.getSCEV(Ptr); 6722 } 6723 6724 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6725 return Legal->hasStride(I->getOperand(0)) || 6726 Legal->hasStride(I->getOperand(1)); 6727 } 6728 6729 InstructionCost 6730 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6731 ElementCount VF) { 6732 assert(VF.isVector() && 6733 "Scalarization cost of instruction implies vectorization."); 6734 if (VF.isScalable()) 6735 return InstructionCost::getInvalid(); 6736 6737 Type *ValTy = getLoadStoreType(I); 6738 auto SE = PSE.getSE(); 6739 6740 unsigned AS = getLoadStoreAddressSpace(I); 6741 Value *Ptr = getLoadStorePointerOperand(I); 6742 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6743 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6744 // that it is being called from this specific place. 6745 6746 // Figure out whether the access is strided and get the stride value 6747 // if it's known in compile time 6748 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6749 6750 // Get the cost of the scalar memory instruction and address computation. 6751 InstructionCost Cost = 6752 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6753 6754 // Don't pass *I here, since it is scalar but will actually be part of a 6755 // vectorized loop where the user of it is a vectorized instruction. 6756 const Align Alignment = getLoadStoreAlignment(I); 6757 Cost += VF.getKnownMinValue() * 6758 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6759 AS, TTI::TCK_RecipThroughput); 6760 6761 // Get the overhead of the extractelement and insertelement instructions 6762 // we might create due to scalarization. 6763 Cost += getScalarizationOverhead(I, VF); 6764 6765 // If we have a predicated load/store, it will need extra i1 extracts and 6766 // conditional branches, but may not be executed for each vector lane. Scale 6767 // the cost by the probability of executing the predicated block. 6768 if (isPredicatedInst(I, VF)) { 6769 Cost /= getReciprocalPredBlockProb(); 6770 6771 // Add the cost of an i1 extract and a branch 6772 auto *Vec_i1Ty = 6773 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6774 Cost += TTI.getScalarizationOverhead( 6775 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6776 /*Insert=*/false, /*Extract=*/true); 6777 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6778 6779 if (useEmulatedMaskMemRefHack(I, VF)) 6780 // Artificially setting to a high enough value to practically disable 6781 // vectorization with such operations. 
6782 Cost = 3000000; 6783 } 6784 6785 return Cost; 6786 } 6787 6788 InstructionCost 6789 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6790 ElementCount VF) { 6791 Type *ValTy = getLoadStoreType(I); 6792 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6793 Value *Ptr = getLoadStorePointerOperand(I); 6794 unsigned AS = getLoadStoreAddressSpace(I); 6795 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6796 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6797 6798 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6799 "Stride should be 1 or -1 for consecutive memory access"); 6800 const Align Alignment = getLoadStoreAlignment(I); 6801 InstructionCost Cost = 0; 6802 if (Legal->isMaskRequired(I)) 6803 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6804 CostKind); 6805 else 6806 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6807 CostKind, I); 6808 6809 bool Reverse = ConsecutiveStride < 0; 6810 if (Reverse) 6811 Cost += 6812 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6813 return Cost; 6814 } 6815 6816 InstructionCost 6817 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6818 ElementCount VF) { 6819 assert(Legal->isUniformMemOp(*I)); 6820 6821 Type *ValTy = getLoadStoreType(I); 6822 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6823 const Align Alignment = getLoadStoreAlignment(I); 6824 unsigned AS = getLoadStoreAddressSpace(I); 6825 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6826 if (isa<LoadInst>(I)) { 6827 return TTI.getAddressComputationCost(ValTy) + 6828 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6829 CostKind) + 6830 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6831 } 6832 StoreInst *SI = cast<StoreInst>(I); 6833 6834 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6835 return TTI.getAddressComputationCost(ValTy) + 6836 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6837 CostKind) + 6838 (isLoopInvariantStoreValue 6839 ? 0 6840 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6841 VF.getKnownMinValue() - 1)); 6842 } 6843 6844 InstructionCost 6845 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6846 ElementCount VF) { 6847 Type *ValTy = getLoadStoreType(I); 6848 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6849 const Align Alignment = getLoadStoreAlignment(I); 6850 const Value *Ptr = getLoadStorePointerOperand(I); 6851 6852 return TTI.getAddressComputationCost(VectorTy) + 6853 TTI.getGatherScatterOpCost( 6854 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6855 TargetTransformInfo::TCK_RecipThroughput, I); 6856 } 6857 6858 InstructionCost 6859 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6860 ElementCount VF) { 6861 // TODO: Once we have support for interleaving with scalable vectors 6862 // we can calculate the cost properly here. 
6863 if (VF.isScalable()) 6864 return InstructionCost::getInvalid(); 6865 6866 Type *ValTy = getLoadStoreType(I); 6867 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6868 unsigned AS = getLoadStoreAddressSpace(I); 6869 6870 auto Group = getInterleavedAccessGroup(I); 6871 assert(Group && "Fail to get an interleaved access group."); 6872 6873 unsigned InterleaveFactor = Group->getFactor(); 6874 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6875 6876 // Holds the indices of existing members in the interleaved group. 6877 SmallVector<unsigned, 4> Indices; 6878 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6879 if (Group->getMember(IF)) 6880 Indices.push_back(IF); 6881 6882 // Calculate the cost of the whole interleaved group. 6883 bool UseMaskForGaps = 6884 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6885 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6886 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6887 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6888 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6889 6890 if (Group->isReverse()) { 6891 // TODO: Add support for reversed masked interleaved access. 6892 assert(!Legal->isMaskRequired(I) && 6893 "Reverse masked interleaved access not supported."); 6894 Cost += 6895 Group->getNumMembers() * 6896 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6897 } 6898 return Cost; 6899 } 6900 6901 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( 6902 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6903 using namespace llvm::PatternMatch; 6904 // Early exit for no inloop reductions 6905 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6906 return None; 6907 auto *VectorTy = cast<VectorType>(Ty); 6908 6909 // We are looking for a pattern of, and finding the minimal acceptable cost: 6910 // reduce(mul(ext(A), ext(B))) or 6911 // reduce(mul(A, B)) or 6912 // reduce(ext(A)) or 6913 // reduce(A). 6914 // The basic idea is that we walk down the tree to do that, finding the root 6915 // reduction instruction in InLoopReductionImmediateChains. From there we find 6916 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6917 // of the components. If the reduction cost is lower then we return it for the 6918 // reduction instruction and 0 for the other instructions in the pattern. If 6919 // it is not we return an invalid cost specifying the orignal cost method 6920 // should be used. 6921 Instruction *RetI = I; 6922 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6923 if (!RetI->hasOneUser()) 6924 return None; 6925 RetI = RetI->user_back(); 6926 } 6927 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6928 RetI->user_back()->getOpcode() == Instruction::Add) { 6929 if (!RetI->hasOneUser()) 6930 return None; 6931 RetI = RetI->user_back(); 6932 } 6933 6934 // Test if the found instruction is a reduction, and if not return an invalid 6935 // cost specifying the parent to use the original cost modelling. 6936 if (!InLoopReductionImmediateChains.count(RetI)) 6937 return None; 6938 6939 // Find the reduction this chain is a part of and calculate the basic cost of 6940 // the reduction on its own. 
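  // InLoopReductionImmediateChains maps each link of a reduction chain to the
  // previous link (see collectInLoopReductions), so repeatedly looking RetI
  // up in the map walks back to the reduction PHI that anchors the chain.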
6941 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6942 Instruction *ReductionPhi = LastChain; 6943 while (!isa<PHINode>(ReductionPhi)) 6944 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6945 6946 const RecurrenceDescriptor &RdxDesc = 6947 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6948 6949 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6950 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6951 6952 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6953 // normal fmul instruction to the cost of the fadd reduction. 6954 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6955 BaseCost += 6956 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6957 6958 // If we're using ordered reductions then we can just return the base cost 6959 // here, since getArithmeticReductionCost calculates the full ordered 6960 // reduction cost when FP reassociation is not allowed. 6961 if (useOrderedReductions(RdxDesc)) 6962 return BaseCost; 6963 6964 // Get the operand that was not the reduction chain and match it to one of the 6965 // patterns, returning the better cost if it is found. 6966 Instruction *RedOp = RetI->getOperand(1) == LastChain 6967 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6968 : dyn_cast<Instruction>(RetI->getOperand(1)); 6969 6970 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6971 6972 Instruction *Op0, *Op1; 6973 if (RedOp && 6974 match(RedOp, 6975 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6976 match(Op0, m_ZExtOrSExt(m_Value())) && 6977 Op0->getOpcode() == Op1->getOpcode() && 6978 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6979 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6980 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6981 6982 // Matched reduce(ext(mul(ext(A), ext(B))) 6983 // Note that the extend opcodes need to all match, or if A==B they will have 6984 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6985 // which is equally fine. 6986 bool IsUnsigned = isa<ZExtInst>(Op0); 6987 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6988 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6989 6990 InstructionCost ExtCost = 6991 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6992 TTI::CastContextHint::None, CostKind, Op0); 6993 InstructionCost MulCost = 6994 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6995 InstructionCost Ext2Cost = 6996 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6997 TTI::CastContextHint::None, CostKind, RedOp); 6998 6999 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7000 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7001 CostKind); 7002 7003 if (RedCost.isValid() && 7004 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 7005 return I == RetI ? 
RedCost : 0; 7006 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 7007 !TheLoop->isLoopInvariant(RedOp)) { 7008 // Matched reduce(ext(A)) 7009 bool IsUnsigned = isa<ZExtInst>(RedOp); 7010 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7011 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7012 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7013 CostKind); 7014 7015 InstructionCost ExtCost = 7016 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7017 TTI::CastContextHint::None, CostKind, RedOp); 7018 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7019 return I == RetI ? RedCost : 0; 7020 } else if (RedOp && 7021 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 7022 if (match(Op0, m_ZExtOrSExt(m_Value())) && 7023 Op0->getOpcode() == Op1->getOpcode() && 7024 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7025 bool IsUnsigned = isa<ZExtInst>(Op0); 7026 Type *Op0Ty = Op0->getOperand(0)->getType(); 7027 Type *Op1Ty = Op1->getOperand(0)->getType(); 7028 Type *LargestOpTy = 7029 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 7030 : Op0Ty; 7031 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 7032 7033 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 7034 // different sizes. We take the largest type as the ext to reduce, and add 7035 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 7036 InstructionCost ExtCost0 = TTI.getCastInstrCost( 7037 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 7038 TTI::CastContextHint::None, CostKind, Op0); 7039 InstructionCost ExtCost1 = TTI.getCastInstrCost( 7040 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 7041 TTI::CastContextHint::None, CostKind, Op1); 7042 InstructionCost MulCost = 7043 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7044 7045 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7046 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7047 CostKind); 7048 InstructionCost ExtraExtCost = 0; 7049 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 7050 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 7051 ExtraExtCost = TTI.getCastInstrCost( 7052 ExtraExtOp->getOpcode(), ExtType, 7053 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 7054 TTI::CastContextHint::None, CostKind, ExtraExtOp); 7055 } 7056 7057 if (RedCost.isValid() && 7058 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 7059 return I == RetI ? RedCost : 0; 7060 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 7061 // Matched reduce(mul()) 7062 InstructionCost MulCost = 7063 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7064 7065 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7066 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7067 CostKind); 7068 7069 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7070 return I == RetI ? RedCost : 0; 7071 } 7072 } 7073 7074 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 7075 } 7076 7077 InstructionCost 7078 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7079 ElementCount VF) { 7080 // Calculate scalar cost only. Vectorization cost should be ready at this 7081 // moment. 
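  // For vector VFs the cost has already been computed by
  // setCostBasedWideningDecision() and is simply looked up via
  // getWideningCost() below.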
7082 if (VF.isScalar()) { 7083 Type *ValTy = getLoadStoreType(I); 7084 const Align Alignment = getLoadStoreAlignment(I); 7085 unsigned AS = getLoadStoreAddressSpace(I); 7086 7087 return TTI.getAddressComputationCost(ValTy) + 7088 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7089 TTI::TCK_RecipThroughput, I); 7090 } 7091 return getWideningCost(I, VF); 7092 } 7093 7094 LoopVectorizationCostModel::VectorizationCostTy 7095 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7096 ElementCount VF) { 7097 // If we know that this instruction will remain uniform, check the cost of 7098 // the scalar version. 7099 if (isUniformAfterVectorization(I, VF)) 7100 VF = ElementCount::getFixed(1); 7101 7102 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7103 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7104 7105 // Forced scalars do not have any scalarization overhead. 7106 auto ForcedScalar = ForcedScalars.find(VF); 7107 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7108 auto InstSet = ForcedScalar->second; 7109 if (InstSet.count(I)) 7110 return VectorizationCostTy( 7111 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7112 VF.getKnownMinValue()), 7113 false); 7114 } 7115 7116 Type *VectorTy; 7117 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7118 7119 bool TypeNotScalarized = false; 7120 if (VF.isVector() && VectorTy->isVectorTy()) { 7121 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 7122 if (NumParts) 7123 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 7124 else 7125 C = InstructionCost::getInvalid(); 7126 } 7127 return VectorizationCostTy(C, TypeNotScalarized); 7128 } 7129 7130 InstructionCost 7131 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7132 ElementCount VF) const { 7133 7134 // There is no mechanism yet to create a scalable scalarization loop, 7135 // so this is currently Invalid. 7136 if (VF.isScalable()) 7137 return InstructionCost::getInvalid(); 7138 7139 if (VF.isScalar()) 7140 return 0; 7141 7142 InstructionCost Cost = 0; 7143 Type *RetTy = ToVectorTy(I->getType(), VF); 7144 if (!RetTy->isVoidTy() && 7145 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7146 Cost += TTI.getScalarizationOverhead( 7147 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 7148 false); 7149 7150 // Some targets keep addresses scalar. 7151 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7152 return Cost; 7153 7154 // Some targets support efficient element stores. 7155 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7156 return Cost; 7157 7158 // Collect operands to consider. 7159 CallInst *CI = dyn_cast<CallInst>(I); 7160 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 7161 7162 // Skip operands that do not require extraction/scalarization and do not incur 7163 // any overhead. 7164 SmallVector<Type *> Tys; 7165 for (auto *V : filterExtractingOperands(Ops, VF)) 7166 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7167 return Cost + TTI.getOperandsScalarizationOverhead( 7168 filterExtractingOperands(Ops, VF), Tys); 7169 } 7170 7171 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7172 if (VF.isScalar()) 7173 return; 7174 NumPredStores = 0; 7175 for (BasicBlock *BB : TheLoop->blocks()) { 7176 // For each instruction in the old loop. 
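    // Only memory instructions are of interest here; anything without a
    // load/store pointer operand is skipped below.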
7177 for (Instruction &I : *BB) { 7178 Value *Ptr = getLoadStorePointerOperand(&I); 7179 if (!Ptr) 7180 continue; 7181 7182 // TODO: We should generate better code and update the cost model for 7183 // predicated uniform stores. Today they are treated as any other 7184 // predicated store (see added test cases in 7185 // invariant-store-vectorization.ll). 7186 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 7187 NumPredStores++; 7188 7189 if (Legal->isUniformMemOp(I)) { 7190 // TODO: Avoid replicating loads and stores instead of 7191 // relying on instcombine to remove them. 7192 // Load: Scalar load + broadcast 7193 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7194 InstructionCost Cost; 7195 if (isa<StoreInst>(&I) && VF.isScalable() && 7196 isLegalGatherOrScatter(&I, VF)) { 7197 Cost = getGatherScatterCost(&I, VF); 7198 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 7199 } else { 7200 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 7201 "Cannot yet scalarize uniform stores"); 7202 Cost = getUniformMemOpCost(&I, VF); 7203 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7204 } 7205 continue; 7206 } 7207 7208 // We assume that widening is the best solution when possible. 7209 if (memoryInstructionCanBeWidened(&I, VF)) { 7210 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7211 int ConsecutiveStride = Legal->isConsecutivePtr( 7212 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7213 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7214 "Expected consecutive stride."); 7215 InstWidening Decision = 7216 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7217 setWideningDecision(&I, VF, Decision, Cost); 7218 continue; 7219 } 7220 7221 // Choose between Interleaving, Gather/Scatter or Scalarization. 7222 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7223 unsigned NumAccesses = 1; 7224 if (isAccessInterleaved(&I)) { 7225 auto Group = getInterleavedAccessGroup(&I); 7226 assert(Group && "Fail to get an interleaved access group."); 7227 7228 // Make one decision for the whole group. 7229 if (getWideningDecision(&I, VF) != CM_Unknown) 7230 continue; 7231 7232 NumAccesses = Group->getNumMembers(); 7233 if (interleavedAccessCanBeWidened(&I, VF)) 7234 InterleaveCost = getInterleaveGroupCost(&I, VF); 7235 } 7236 7237 InstructionCost GatherScatterCost = 7238 isLegalGatherOrScatter(&I, VF) 7239 ? getGatherScatterCost(&I, VF) * NumAccesses 7240 : InstructionCost::getInvalid(); 7241 7242 InstructionCost ScalarizationCost = 7243 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7244 7245 // Choose better solution for the current VF, 7246 // write down this decision and use it during vectorization. 7247 InstructionCost Cost; 7248 InstWidening Decision; 7249 if (InterleaveCost <= GatherScatterCost && 7250 InterleaveCost < ScalarizationCost) { 7251 Decision = CM_Interleave; 7252 Cost = InterleaveCost; 7253 } else if (GatherScatterCost < ScalarizationCost) { 7254 Decision = CM_GatherScatter; 7255 Cost = GatherScatterCost; 7256 } else { 7257 Decision = CM_Scalarize; 7258 Cost = ScalarizationCost; 7259 } 7260 // If the instructions belongs to an interleave group, the whole group 7261 // receives the same decision. The whole group receives the cost, but 7262 // the cost will actually be assigned to one instruction. 
7263 if (auto Group = getInterleavedAccessGroup(&I)) 7264 setWideningDecision(Group, VF, Decision, Cost); 7265 else 7266 setWideningDecision(&I, VF, Decision, Cost); 7267 } 7268 } 7269 7270 // Make sure that any load of address and any other address computation 7271 // remains scalar unless there is gather/scatter support. This avoids 7272 // inevitable extracts into address registers, and also has the benefit of 7273 // activating LSR more, since that pass can't optimize vectorized 7274 // addresses. 7275 if (TTI.prefersVectorizedAddressing()) 7276 return; 7277 7278 // Start with all scalar pointer uses. 7279 SmallPtrSet<Instruction *, 8> AddrDefs; 7280 for (BasicBlock *BB : TheLoop->blocks()) 7281 for (Instruction &I : *BB) { 7282 Instruction *PtrDef = 7283 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7284 if (PtrDef && TheLoop->contains(PtrDef) && 7285 getWideningDecision(&I, VF) != CM_GatherScatter) 7286 AddrDefs.insert(PtrDef); 7287 } 7288 7289 // Add all instructions used to generate the addresses. 7290 SmallVector<Instruction *, 4> Worklist; 7291 append_range(Worklist, AddrDefs); 7292 while (!Worklist.empty()) { 7293 Instruction *I = Worklist.pop_back_val(); 7294 for (auto &Op : I->operands()) 7295 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7296 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7297 AddrDefs.insert(InstOp).second) 7298 Worklist.push_back(InstOp); 7299 } 7300 7301 for (auto *I : AddrDefs) { 7302 if (isa<LoadInst>(I)) { 7303 // Setting the desired widening decision should ideally be handled in 7304 // by cost functions, but since this involves the task of finding out 7305 // if the loaded register is involved in an address computation, it is 7306 // instead changed here when we know this is the case. 7307 InstWidening Decision = getWideningDecision(I, VF); 7308 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7309 // Scalarize a widened load of address. 7310 setWideningDecision( 7311 I, VF, CM_Scalarize, 7312 (VF.getKnownMinValue() * 7313 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7314 else if (auto Group = getInterleavedAccessGroup(I)) { 7315 // Scalarize an interleave group of address loads. 7316 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7317 if (Instruction *Member = Group->getMember(I)) 7318 setWideningDecision( 7319 Member, VF, CM_Scalarize, 7320 (VF.getKnownMinValue() * 7321 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7322 } 7323 } 7324 } else 7325 // Make sure I gets scalarized and a cost estimate without 7326 // scalarization overhead. 
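      // Instructions recorded in ForcedScalars are later costed in
      // getInstructionCost() as VF copies of the scalar instruction, with no
      // insert/extract overhead added.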
7327 ForcedScalars[VF].insert(I); 7328 } 7329 } 7330 7331 InstructionCost 7332 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7333 Type *&VectorTy) { 7334 Type *RetTy = I->getType(); 7335 if (canTruncateToMinimalBitwidth(I, VF)) 7336 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7337 auto SE = PSE.getSE(); 7338 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7339 7340 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7341 ElementCount VF) -> bool { 7342 if (VF.isScalar()) 7343 return true; 7344 7345 auto Scalarized = InstsToScalarize.find(VF); 7346 assert(Scalarized != InstsToScalarize.end() && 7347 "VF not yet analyzed for scalarization profitability"); 7348 return !Scalarized->second.count(I) && 7349 llvm::all_of(I->users(), [&](User *U) { 7350 auto *UI = cast<Instruction>(U); 7351 return !Scalarized->second.count(UI); 7352 }); 7353 }; 7354 (void) hasSingleCopyAfterVectorization; 7355 7356 if (isScalarAfterVectorization(I, VF)) { 7357 // With the exception of GEPs and PHIs, after scalarization there should 7358 // only be one copy of the instruction generated in the loop. This is 7359 // because the VF is either 1, or any instructions that need scalarizing 7360 // have already been dealt with by the the time we get here. As a result, 7361 // it means we don't have to multiply the instruction cost by VF. 7362 assert(I->getOpcode() == Instruction::GetElementPtr || 7363 I->getOpcode() == Instruction::PHI || 7364 (I->getOpcode() == Instruction::BitCast && 7365 I->getType()->isPointerTy()) || 7366 hasSingleCopyAfterVectorization(I, VF)); 7367 VectorTy = RetTy; 7368 } else 7369 VectorTy = ToVectorTy(RetTy, VF); 7370 7371 // TODO: We need to estimate the cost of intrinsic calls. 7372 switch (I->getOpcode()) { 7373 case Instruction::GetElementPtr: 7374 // We mark this instruction as zero-cost because the cost of GEPs in 7375 // vectorized code depends on whether the corresponding memory instruction 7376 // is scalarized or not. Therefore, we handle GEPs with the memory 7377 // instruction cost. 7378 return 0; 7379 case Instruction::Br: { 7380 // In cases of scalarized and predicated instructions, there will be VF 7381 // predicated blocks in the vectorized loop. Each branch around these 7382 // blocks requires also an extract of its vector compare i1 element. 7383 bool ScalarPredicatedBB = false; 7384 BranchInst *BI = cast<BranchInst>(I); 7385 if (VF.isVector() && BI->isConditional() && 7386 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7387 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7388 ScalarPredicatedBB = true; 7389 7390 if (ScalarPredicatedBB) { 7391 // Not possible to scalarize scalable vector with predicated instructions. 7392 if (VF.isScalable()) 7393 return InstructionCost::getInvalid(); 7394 // Return cost for branches around scalarized and predicated blocks. 7395 auto *Vec_i1Ty = 7396 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7397 return ( 7398 TTI.getScalarizationOverhead( 7399 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7400 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7401 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7402 // The back-edge branch will remain, as will all scalar branches. 7403 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7404 else 7405 // This branch will be eliminated by if-conversion. 
7406 return 0; 7407 // Note: We currently assume zero cost for an unconditional branch inside 7408 // a predicated block since it will become a fall-through, although we 7409 // may decide in the future to call TTI for all branches. 7410 } 7411 case Instruction::PHI: { 7412 auto *Phi = cast<PHINode>(I); 7413 7414 // First-order recurrences are replaced by vector shuffles inside the loop. 7415 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7416 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7417 return TTI.getShuffleCost( 7418 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7419 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7420 7421 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7422 // converted into select instructions. We require N - 1 selects per phi 7423 // node, where N is the number of incoming values. 7424 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7425 return (Phi->getNumIncomingValues() - 1) * 7426 TTI.getCmpSelInstrCost( 7427 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7428 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7429 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7430 7431 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7432 } 7433 case Instruction::UDiv: 7434 case Instruction::SDiv: 7435 case Instruction::URem: 7436 case Instruction::SRem: 7437 // If we have a predicated instruction, it may not be executed for each 7438 // vector lane. Get the scalarization cost and scale this amount by the 7439 // probability of executing the predicated block. If the instruction is not 7440 // predicated, we fall through to the next case. 7441 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7442 InstructionCost Cost = 0; 7443 7444 // These instructions have a non-void type, so account for the phi nodes 7445 // that we will create. This cost is likely to be zero. The phi node 7446 // cost, if any, should be scaled by the block probability because it 7447 // models a copy at the end of each predicated block. 7448 Cost += VF.getKnownMinValue() * 7449 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7450 7451 // The cost of the non-predicated instruction. 7452 Cost += VF.getKnownMinValue() * 7453 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7454 7455 // The cost of insertelement and extractelement instructions needed for 7456 // scalarization. 7457 Cost += getScalarizationOverhead(I, VF); 7458 7459 // Scale the cost by the probability of executing the predicated blocks. 7460 // This assumes the predicated block for each vector lane is equally 7461 // likely. 7462 return Cost / getReciprocalPredBlockProb(); 7463 } 7464 LLVM_FALLTHROUGH; 7465 case Instruction::Add: 7466 case Instruction::FAdd: 7467 case Instruction::Sub: 7468 case Instruction::FSub: 7469 case Instruction::Mul: 7470 case Instruction::FMul: 7471 case Instruction::FDiv: 7472 case Instruction::FRem: 7473 case Instruction::Shl: 7474 case Instruction::LShr: 7475 case Instruction::AShr: 7476 case Instruction::And: 7477 case Instruction::Or: 7478 case Instruction::Xor: { 7479 // Since we will replace the stride by 1 the multiplication should go away. 
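    // (This relies on the stride versioning done for symbolic strides: in the
    // loop version that gets vectorized the stride is known to be one, so
    // 'mul %x, %stride' folds away and can be costed as free.)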
7480 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7481 return 0; 7482 7483 // Detect reduction patterns 7484 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7485 return *RedCost; 7486 7487 // Certain instructions can be cheaper to vectorize if they have a constant 7488 // second vector operand. One example of this are shifts on x86. 7489 Value *Op2 = I->getOperand(1); 7490 TargetTransformInfo::OperandValueProperties Op2VP; 7491 TargetTransformInfo::OperandValueKind Op2VK = 7492 TTI.getOperandInfo(Op2, Op2VP); 7493 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7494 Op2VK = TargetTransformInfo::OK_UniformValue; 7495 7496 SmallVector<const Value *, 4> Operands(I->operand_values()); 7497 return TTI.getArithmeticInstrCost( 7498 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7499 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7500 } 7501 case Instruction::FNeg: { 7502 return TTI.getArithmeticInstrCost( 7503 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7504 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7505 TargetTransformInfo::OP_None, I->getOperand(0), I); 7506 } 7507 case Instruction::Select: { 7508 SelectInst *SI = cast<SelectInst>(I); 7509 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7510 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7511 7512 const Value *Op0, *Op1; 7513 using namespace llvm::PatternMatch; 7514 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7515 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7516 // select x, y, false --> x & y 7517 // select x, true, y --> x | y 7518 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7519 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7520 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7521 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7522 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7523 Op1->getType()->getScalarSizeInBits() == 1); 7524 7525 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7526 return TTI.getArithmeticInstrCost( 7527 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7528 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7529 } 7530 7531 Type *CondTy = SI->getCondition()->getType(); 7532 if (!ScalarCond) 7533 CondTy = VectorType::get(CondTy, VF); 7534 7535 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7536 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7537 Pred = Cmp->getPredicate(); 7538 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7539 CostKind, I); 7540 } 7541 case Instruction::ICmp: 7542 case Instruction::FCmp: { 7543 Type *ValTy = I->getOperand(0)->getType(); 7544 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7545 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7546 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7547 VectorTy = ToVectorTy(ValTy, VF); 7548 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7549 cast<CmpInst>(I)->getPredicate(), CostKind, 7550 I); 7551 } 7552 case Instruction::Store: 7553 case Instruction::Load: { 7554 ElementCount Width = VF; 7555 if (Width.isVector()) { 7556 InstWidening Decision = getWideningDecision(I, Width); 7557 assert(Decision != CM_Unknown && 7558 "CM decision should be taken at this point"); 7559 if (Decision == CM_Scalarize) 7560 Width = ElementCount::getFixed(1); 7561 } 7562 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7563 return getMemoryInstructionCost(I, VF); 7564 } 7565 case Instruction::BitCast: 7566 if (I->getType()->isPointerTy()) 7567 return 0; 7568 LLVM_FALLTHROUGH; 7569 case Instruction::ZExt: 7570 case Instruction::SExt: 7571 case Instruction::FPToUI: 7572 case Instruction::FPToSI: 7573 case Instruction::FPExt: 7574 case Instruction::PtrToInt: 7575 case Instruction::IntToPtr: 7576 case Instruction::SIToFP: 7577 case Instruction::UIToFP: 7578 case Instruction::Trunc: 7579 case Instruction::FPTrunc: { 7580 // Computes the CastContextHint from a Load/Store instruction. 7581 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7582 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7583 "Expected a load or a store!"); 7584 7585 if (VF.isScalar() || !TheLoop->contains(I)) 7586 return TTI::CastContextHint::Normal; 7587 7588 switch (getWideningDecision(I, VF)) { 7589 case LoopVectorizationCostModel::CM_GatherScatter: 7590 return TTI::CastContextHint::GatherScatter; 7591 case LoopVectorizationCostModel::CM_Interleave: 7592 return TTI::CastContextHint::Interleave; 7593 case LoopVectorizationCostModel::CM_Scalarize: 7594 case LoopVectorizationCostModel::CM_Widen: 7595 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7596 : TTI::CastContextHint::Normal; 7597 case LoopVectorizationCostModel::CM_Widen_Reverse: 7598 return TTI::CastContextHint::Reversed; 7599 case LoopVectorizationCostModel::CM_Unknown: 7600 llvm_unreachable("Instr did not go through cost modelling?"); 7601 } 7602 7603 llvm_unreachable("Unhandled case!"); 7604 }; 7605 7606 unsigned Opcode = I->getOpcode(); 7607 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7608 // For Trunc, the context is the only user, which must be a StoreInst. 7609 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7610 if (I->hasOneUse()) 7611 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7612 CCH = ComputeCCH(Store); 7613 } 7614 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7615 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7616 Opcode == Instruction::FPExt) { 7617 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7618 CCH = ComputeCCH(Load); 7619 } 7620 7621 // We optimize the truncation of induction variables having constant 7622 // integer steps. The cost of these truncations is the same as the scalar 7623 // operation. 7624 if (isOptimizableIVTruncate(I, VF)) { 7625 auto *Trunc = cast<TruncInst>(I); 7626 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7627 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7628 } 7629 7630 // Detect reduction patterns 7631 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7632 return *RedCost; 7633 7634 Type *SrcScalarTy = I->getOperand(0)->getType(); 7635 Type *SrcVecTy = 7636 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7637 if (canTruncateToMinimalBitwidth(I, VF)) { 7638 // This cast is going to be shrunk. This may remove the cast or it might 7639 // turn it into slightly different cast. For example, if MinBW == 16, 7640 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7641 // 7642 // Calculate the modified src and dest types. 7643 Type *MinVecTy = VectorTy; 7644 if (Opcode == Instruction::Trunc) { 7645 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7646 VectorTy = 7647 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7648 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7649 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7650 VectorTy = 7651 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7652 } 7653 } 7654 7655 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7656 } 7657 case Instruction::Call: { 7658 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7659 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7660 return *RedCost; 7661 bool NeedToScalarize; 7662 CallInst *CI = cast<CallInst>(I); 7663 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7664 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7665 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7666 return std::min(CallCost, IntrinsicCost); 7667 } 7668 return CallCost; 7669 } 7670 case Instruction::ExtractValue: 7671 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7672 case Instruction::Alloca: 7673 // We cannot easily widen alloca to a scalable alloca, as 7674 // the result would need to be a vector of pointers. 7675 if (VF.isScalable()) 7676 return InstructionCost::getInvalid(); 7677 LLVM_FALLTHROUGH; 7678 default: 7679 // This opcode is unknown. Assume that it is the same as 'mul'. 7680 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7681 } // end of switch. 
7682 } 7683 7684 char LoopVectorize::ID = 0; 7685 7686 static const char lv_name[] = "Loop Vectorization"; 7687 7688 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7689 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7690 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7691 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7692 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7693 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7694 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7695 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7696 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7697 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7698 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7699 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7700 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7701 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7702 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7703 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7704 7705 namespace llvm { 7706 7707 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7708 7709 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7710 bool VectorizeOnlyWhenForced) { 7711 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7712 } 7713 7714 } // end namespace llvm 7715 7716 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7717 // Check if the pointer operand of a load or store instruction is 7718 // consecutive. 7719 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7720 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7721 return false; 7722 } 7723 7724 void LoopVectorizationCostModel::collectValuesToIgnore() { 7725 // Ignore ephemeral values. 7726 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7727 7728 // Ignore type-promoting instructions we identified during reduction 7729 // detection. 7730 for (auto &Reduction : Legal->getReductionVars()) { 7731 const RecurrenceDescriptor &RedDes = Reduction.second; 7732 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7733 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7734 } 7735 // Ignore type-casting instructions we identified during induction 7736 // detection. 7737 for (auto &Induction : Legal->getInductionVars()) { 7738 const InductionDescriptor &IndDes = Induction.second; 7739 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7740 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7741 } 7742 } 7743 7744 void LoopVectorizationCostModel::collectInLoopReductions() { 7745 for (auto &Reduction : Legal->getReductionVars()) { 7746 PHINode *Phi = Reduction.first; 7747 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7748 7749 // We don't collect reductions that are type promoted (yet). 7750 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7751 continue; 7752 7753 // If the target would prefer this reduction to happen "in-loop", then we 7754 // want to record it as such. 7755 unsigned Opcode = RdxDesc.getOpcode(); 7756 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7757 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7758 TargetTransformInfo::ReductionFlags())) 7759 continue; 7760 7761 // Check that we can correctly put the reductions into the loop, by 7762 // finding the chain of operations that leads from the phi to the loop 7763 // exit value. 
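    // getReductionOpChain() returns an empty vector when the chain cannot be
    // followed, in which case the reduction stays an out-of-loop reduction.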
7764 SmallVector<Instruction *, 4> ReductionOperations = 7765 RdxDesc.getReductionOpChain(Phi, TheLoop); 7766 bool InLoop = !ReductionOperations.empty(); 7767 if (InLoop) { 7768 InLoopReductionChains[Phi] = ReductionOperations; 7769 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7770 Instruction *LastChain = Phi; 7771 for (auto *I : ReductionOperations) { 7772 InLoopReductionImmediateChains[I] = LastChain; 7773 LastChain = I; 7774 } 7775 } 7776 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7777 << " reduction for phi: " << *Phi << "\n"); 7778 } 7779 } 7780 7781 // TODO: we could return a pair of values that specify the max VF and 7782 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7783 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7784 // doesn't have a cost model that can choose which plan to execute if 7785 // more than one is generated. 7786 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7787 LoopVectorizationCostModel &CM) { 7788 unsigned WidestType; 7789 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7790 return WidestVectorRegBits / WidestType; 7791 } 7792 7793 VectorizationFactor 7794 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7795 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7796 ElementCount VF = UserVF; 7797 // Outer loop handling: They may require CFG and instruction level 7798 // transformations before even evaluating whether vectorization is profitable. 7799 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7800 // the vectorization pipeline. 7801 if (!OrigLoop->isInnermost()) { 7802 // If the user doesn't provide a vectorization factor, determine a 7803 // reasonable one. 7804 if (UserVF.isZero()) { 7805 VF = ElementCount::getFixed(determineVPlanVF( 7806 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7807 .getFixedSize(), 7808 CM)); 7809 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7810 7811 // Make sure we have a VF > 1 for stress testing. 7812 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7813 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7814 << "overriding computed VF.\n"); 7815 VF = ElementCount::getFixed(4); 7816 } 7817 } 7818 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7819 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7820 "VF needs to be a power of two"); 7821 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7822 << "VF " << VF << " to build VPlans.\n"); 7823 buildVPlans(VF, VF); 7824 7825 // For VPlan build stress testing, we bail out after VPlan construction. 7826 if (VPlanBuildStressTest) 7827 return VectorizationFactor::Disabled(); 7828 7829 return {VF, 0 /*Cost*/}; 7830 } 7831 7832 LLVM_DEBUG( 7833 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7834 "VPlan-native path.\n"); 7835 return VectorizationFactor::Disabled(); 7836 } 7837 7838 Optional<VectorizationFactor> 7839 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7840 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7841 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7842 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7843 return None; 7844 7845 // Invalidate interleave groups if all blocks of loop will be predicated. 
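  // Checking the loop header suffices here: the header only needs predication
  // when the whole loop is predicated by fold-tail masking.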
7846 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7847 !useMaskedInterleavedAccesses(*TTI)) { 7848 LLVM_DEBUG( 7849 dbgs() 7850 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7851 "which requires masked-interleaved support.\n"); 7852 if (CM.InterleaveInfo.invalidateGroups()) 7853 // Invalidating interleave groups also requires invalidating all decisions 7854 // based on them, which includes widening decisions and uniform and scalar 7855 // values. 7856 CM.invalidateCostModelingDecisions(); 7857 } 7858 7859 ElementCount MaxUserVF = 7860 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7861 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7862 if (!UserVF.isZero() && UserVFIsLegal) { 7863 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7864 "VF needs to be a power of two"); 7865 // Collect the instructions (and their associated costs) that will be more 7866 // profitable to scalarize. 7867 if (CM.selectUserVectorizationFactor(UserVF)) { 7868 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7869 CM.collectInLoopReductions(); 7870 buildVPlansWithVPRecipes(UserVF, UserVF); 7871 LLVM_DEBUG(printPlans(dbgs())); 7872 return {{UserVF, 0}}; 7873 } else 7874 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7875 "InvalidCost", ORE, OrigLoop); 7876 } 7877 7878 // Populate the set of Vectorization Factor Candidates. 7879 ElementCountSet VFCandidates; 7880 for (auto VF = ElementCount::getFixed(1); 7881 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7882 VFCandidates.insert(VF); 7883 for (auto VF = ElementCount::getScalable(1); 7884 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7885 VFCandidates.insert(VF); 7886 7887 for (const auto &VF : VFCandidates) { 7888 // Collect Uniform and Scalar instructions after vectorization with VF. 7889 CM.collectUniformsAndScalars(VF); 7890 7891 // Collect the instructions (and their associated costs) that will be more 7892 // profitable to scalarize. 7893 if (VF.isVector()) 7894 CM.collectInstsToScalarize(VF); 7895 } 7896 7897 CM.collectInLoopReductions(); 7898 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7899 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7900 7901 LLVM_DEBUG(printPlans(dbgs())); 7902 if (!MaxFactors.hasVector()) 7903 return VectorizationFactor::Disabled(); 7904 7905 // Select the optimal vectorization factor. 7906 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 7907 7908 // Check if it is profitable to vectorize with runtime checks. 
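  // Two limits apply below: exceeding the pragma threshold always disables
  // vectorization, while exceeding the default runtime-check threshold only
  // does so when the hints do not allow reordering of memory operations.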
7909 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7910 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7911 bool PragmaThresholdReached = 7912 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7913 bool ThresholdReached = 7914 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7915 if ((ThresholdReached && !Hints.allowReordering()) || 7916 PragmaThresholdReached) { 7917 ORE->emit([&]() { 7918 return OptimizationRemarkAnalysisAliasing( 7919 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7920 OrigLoop->getHeader()) 7921 << "loop not vectorized: cannot prove it is safe to reorder " 7922 "memory operations"; 7923 }); 7924 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7925 Hints.emitRemarkWithHints(); 7926 return VectorizationFactor::Disabled(); 7927 } 7928 } 7929 return SelectedVF; 7930 } 7931 7932 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7933 assert(count_if(VPlans, 7934 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7935 1 && 7936 "Best VF has not a single VPlan."); 7937 7938 for (const VPlanPtr &Plan : VPlans) { 7939 if (Plan->hasVF(VF)) 7940 return *Plan.get(); 7941 } 7942 llvm_unreachable("No plan found!"); 7943 } 7944 7945 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7946 SmallVector<Metadata *, 4> MDs; 7947 // Reserve first location for self reference to the LoopID metadata node. 7948 MDs.push_back(nullptr); 7949 bool IsUnrollMetadata = false; 7950 MDNode *LoopID = L->getLoopID(); 7951 if (LoopID) { 7952 // First find existing loop unrolling disable metadata. 7953 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7954 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7955 if (MD) { 7956 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7957 IsUnrollMetadata = 7958 S && S->getString().startswith("llvm.loop.unroll.disable"); 7959 } 7960 MDs.push_back(LoopID->getOperand(i)); 7961 } 7962 } 7963 7964 if (!IsUnrollMetadata) { 7965 // Add runtime unroll disable metadata. 7966 LLVMContext &Context = L->getHeader()->getContext(); 7967 SmallVector<Metadata *, 1> DisableOperands; 7968 DisableOperands.push_back( 7969 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7970 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7971 MDs.push_back(DisableNode); 7972 MDNode *NewLoopID = MDNode::get(Context, MDs); 7973 // Set operand 0 to refer to the loop id itself. 7974 NewLoopID->replaceOperandWith(0, NewLoopID); 7975 L->setLoopID(NewLoopID); 7976 } 7977 } 7978 7979 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7980 VPlan &BestVPlan, 7981 InnerLoopVectorizer &ILV, 7982 DominatorTree *DT) { 7983 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7984 << '\n'); 7985 7986 // Perform the actual loop transformation. 7987 7988 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7989 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7990 Value *CanonicalIVStartValue; 7991 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7992 ILV.createVectorizedLoopSkeleton(); 7993 ILV.collectPoisonGeneratingRecipes(State); 7994 7995 ILV.printDebugTracesAtStart(); 7996 7997 //===------------------------------------------------===// 7998 // 7999 // Notice: any optimization or new instruction that go 8000 // into the code below should also be implemented in 8001 // the cost-model. 
8002 // 8003 //===------------------------------------------------===// 8004 8005 // 2. Copy and widen instructions from the old loop into the new loop. 8006 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 8007 ILV.getOrCreateVectorTripCount(nullptr), 8008 CanonicalIVStartValue, State); 8009 BestVPlan.execute(&State); 8010 8011 // Keep all loop hints from the original loop on the vector loop (we'll 8012 // replace the vectorizer-specific hints below). 8013 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8014 8015 Optional<MDNode *> VectorizedLoopID = 8016 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 8017 LLVMLoopVectorizeFollowupVectorized}); 8018 8019 Loop *L = LI->getLoopFor(State.CFG.PrevBB); 8020 if (VectorizedLoopID.hasValue()) 8021 L->setLoopID(VectorizedLoopID.getValue()); 8022 else { 8023 // Keep all loop hints from the original loop on the vector loop (we'll 8024 // replace the vectorizer-specific hints below). 8025 if (MDNode *LID = OrigLoop->getLoopID()) 8026 L->setLoopID(LID); 8027 8028 LoopVectorizeHints Hints(L, true, *ORE); 8029 Hints.setAlreadyVectorized(); 8030 } 8031 // Disable runtime unrolling when vectorizing the epilogue loop. 8032 if (CanonicalIVStartValue) 8033 AddRuntimeUnrollDisableMetaData(L); 8034 8035 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8036 // predication, updating analyses. 8037 ILV.fixVectorizedLoop(State); 8038 8039 ILV.printDebugTracesAtEnd(); 8040 } 8041 8042 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8043 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8044 for (const auto &Plan : VPlans) 8045 if (PrintVPlansInDotFormat) 8046 Plan->printDOT(O); 8047 else 8048 Plan->print(O); 8049 } 8050 #endif 8051 8052 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8053 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8054 8055 // We create new control-flow for the vectorized loop, so the original exit 8056 // conditions will be dead after vectorization if it's only used by the 8057 // terminator 8058 SmallVector<BasicBlock*> ExitingBlocks; 8059 OrigLoop->getExitingBlocks(ExitingBlocks); 8060 for (auto *BB : ExitingBlocks) { 8061 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8062 if (!Cmp || !Cmp->hasOneUse()) 8063 continue; 8064 8065 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8066 if (!DeadInstructions.insert(Cmp).second) 8067 continue; 8068 8069 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8070 // TODO: can recurse through operands in general 8071 for (Value *Op : Cmp->operands()) { 8072 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8073 DeadInstructions.insert(cast<Instruction>(Op)); 8074 } 8075 } 8076 8077 // We create new "steps" for induction variable updates to which the original 8078 // induction variables map. An original update instruction will be dead if 8079 // all its users except the induction variable are dead. 8080 auto *Latch = OrigLoop->getLoopLatch(); 8081 for (auto &Induction : Legal->getInductionVars()) { 8082 PHINode *Ind = Induction.first; 8083 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8084 8085 // If the tail is to be folded by masking, the primary induction variable, 8086 // if exists, isn't dead: it will be used for masking. Don't kill it. 
8087 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8088 continue; 8089 8090 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8091 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8092 })) 8093 DeadInstructions.insert(IndUpdate); 8094 } 8095 } 8096 8097 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8098 8099 //===--------------------------------------------------------------------===// 8100 // EpilogueVectorizerMainLoop 8101 //===--------------------------------------------------------------------===// 8102 8103 /// This function is partially responsible for generating the control flow 8104 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8105 std::pair<BasicBlock *, Value *> 8106 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8107 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8108 Loop *Lp = createVectorLoopSkeleton(""); 8109 8110 // Generate the code to check the minimum iteration count of the vector 8111 // epilogue (see below). 8112 EPI.EpilogueIterationCountCheck = 8113 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8114 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8115 8116 // Generate the code to check any assumptions that we've made for SCEV 8117 // expressions. 8118 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8119 8120 // Generate the code that checks at runtime if arrays overlap. We put the 8121 // checks into a separate block to make the more common case of few elements 8122 // faster. 8123 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8124 8125 // Generate the iteration count check for the main loop, *after* the check 8126 // for the epilogue loop, so that the path-length is shorter for the case 8127 // that goes directly through the vector epilogue. The longer-path length for 8128 // the main loop is compensated for, by the gain from vectorizing the larger 8129 // trip count. Note: the branch will get updated later on when we vectorize 8130 // the epilogue. 8131 EPI.MainLoopIterationCountCheck = 8132 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8133 8134 // Generate the induction variable. 8135 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8136 EPI.VectorTripCount = CountRoundDown; 8137 createHeaderBranch(Lp); 8138 8139 // Skip induction resume value creation here because they will be created in 8140 // the second pass. If we created them here, they wouldn't be used anyway, 8141 // because the vplan in the second pass still contains the inductions from the 8142 // original loop. 
8143 8144 return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; 8145 } 8146 8147 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8148 LLVM_DEBUG({ 8149 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8150 << "Main Loop VF:" << EPI.MainLoopVF 8151 << ", Main Loop UF:" << EPI.MainLoopUF 8152 << ", Epilogue Loop VF:" << EPI.EpilogueVF 8153 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8154 }); 8155 } 8156 8157 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8158 DEBUG_WITH_TYPE(VerboseDebug, { 8159 dbgs() << "intermediate fn:\n" 8160 << *OrigLoop->getHeader()->getParent() << "\n"; 8161 }); 8162 } 8163 8164 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8165 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8166 assert(L && "Expected valid Loop."); 8167 assert(Bypass && "Expected valid bypass basic block."); 8168 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 8169 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8170 Value *Count = getOrCreateTripCount(L); 8171 // Reuse existing vector loop preheader for TC checks. 8172 // Note that new preheader block is generated for vector loop. 8173 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8174 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8175 8176 // Generate code to check if the loop's trip count is less than VF * UF of the 8177 // main vector loop. 8178 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 8179 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8180 8181 Value *CheckMinIters = Builder.CreateICmp( 8182 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 8183 "min.iters.check"); 8184 8185 if (!ForEpilogue) 8186 TCCheckBlock->setName("vector.main.loop.iter.check"); 8187 8188 // Create new preheader for vector loop. 8189 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8190 DT, LI, nullptr, "vector.ph"); 8191 8192 if (ForEpilogue) { 8193 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8194 DT->getNode(Bypass)->getIDom()) && 8195 "TC check is expected to dominate Bypass"); 8196 8197 // Update dominator for Bypass & LoopExit. 8198 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8199 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8200 // For loops with multiple exits, there's no edge from the middle block 8201 // to exit blocks (as the epilogue must run) and thus no need to update 8202 // the immediate dominator of the exit blocks. 8203 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8204 8205 LoopBypassBlocks.push_back(TCCheckBlock); 8206 8207 // Save the trip count so we don't have to regenerate it in the 8208 // vec.epilog.iter.check. This is safe to do because the trip count 8209 // generated here dominates the vector epilog iter check. 8210 EPI.TripCount = Count; 8211 } 8212 8213 ReplaceInstWithInst( 8214 TCCheckBlock->getTerminator(), 8215 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8216 8217 return TCCheckBlock; 8218 } 8219 8220 //===--------------------------------------------------------------------===// 8221 // EpilogueVectorizerEpilogueLoop 8222 //===--------------------------------------------------------------------===// 8223 8224 /// This function is partially responsible for generating the control flow 8225 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
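/// It creates the skeleton of the epilogue vector loop: the remaining
/// iteration count check, the 'vec.epilog.ph' preheader, and the resume PHI
/// that carries the main vector loop's trip count into the epilogue.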
8226 std::pair<BasicBlock *, Value *> 8227 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8228 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8229 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8230 8231 // Now, compare the remaining count and if there aren't enough iterations to 8232 // execute the vectorized epilogue skip to the scalar part. 8233 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8234 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8235 LoopVectorPreHeader = 8236 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8237 LI, nullptr, "vec.epilog.ph"); 8238 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8239 VecEpilogueIterationCountCheck); 8240 8241 // Adjust the control flow taking the state info from the main loop 8242 // vectorization into account. 8243 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8244 "expected this to be saved from the previous pass."); 8245 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8246 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8247 8248 DT->changeImmediateDominator(LoopVectorPreHeader, 8249 EPI.MainLoopIterationCountCheck); 8250 8251 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8252 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8253 8254 if (EPI.SCEVSafetyCheck) 8255 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8256 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8257 if (EPI.MemSafetyCheck) 8258 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8259 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8260 8261 DT->changeImmediateDominator( 8262 VecEpilogueIterationCountCheck, 8263 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8264 8265 DT->changeImmediateDominator(LoopScalarPreHeader, 8266 EPI.EpilogueIterationCountCheck); 8267 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 8268 // If there is an epilogue which must run, there's no edge from the 8269 // middle block to exit blocks and thus no need to update the immediate 8270 // dominator of the exit blocks. 8271 DT->changeImmediateDominator(LoopExitBlock, 8272 EPI.EpilogueIterationCountCheck); 8273 8274 // Keep track of bypass blocks, as they feed start values to the induction 8275 // phis in the scalar loop preheader. 8276 if (EPI.SCEVSafetyCheck) 8277 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8278 if (EPI.MemSafetyCheck) 8279 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8280 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8281 8282 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 8283 // merge control-flow from the latch block and the middle block. Update the 8284 // incoming values here and move the Phi into the preheader. 
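  // Concretely, the loop below (1) rewrites each phi's incoming block that
  // referred to the old single predecessor so that it refers to
  // vec.epilog.iter.check itself, (2) drops the incoming values contributed by
  // the first-pass bypass blocks (the epilogue iteration-count check and the
  // optional SCEV / memory-check blocks), and (3) moves the phi into
  // vec.epilog.ph so it can be used while vectorizing the epilogue.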
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  createHeaderBranch(Lp);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
                                   EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8343 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8344 8345 Value *CheckMinIters = 8346 Builder.CreateICmp(P, Count, 8347 createStepForVF(Builder, Count->getType(), 8348 EPI.EpilogueVF, EPI.EpilogueUF), 8349 "min.epilog.iters.check"); 8350 8351 ReplaceInstWithInst( 8352 Insert->getTerminator(), 8353 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8354 8355 LoopBypassBlocks.push_back(Insert); 8356 return Insert; 8357 } 8358 8359 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8360 LLVM_DEBUG({ 8361 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8362 << "Epilogue Loop VF:" << EPI.EpilogueVF 8363 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8364 }); 8365 } 8366 8367 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8368 DEBUG_WITH_TYPE(VerboseDebug, { 8369 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8370 }); 8371 } 8372 8373 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8374 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8375 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8376 bool PredicateAtRangeStart = Predicate(Range.Start); 8377 8378 for (ElementCount TmpVF = Range.Start * 2; 8379 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8380 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8381 Range.End = TmpVF; 8382 break; 8383 } 8384 8385 return PredicateAtRangeStart; 8386 } 8387 8388 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8389 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8390 /// of VF's starting at a given VF and extending it as much as possible. Each 8391 /// vectorization decision can potentially shorten this sub-range during 8392 /// buildVPlan(). 8393 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8394 ElementCount MaxVF) { 8395 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8396 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8397 VFRange SubRange = {VF, MaxVFPlusOne}; 8398 VPlans.push_back(buildVPlan(SubRange)); 8399 VF = SubRange.End; 8400 } 8401 } 8402 8403 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8404 VPlanPtr &Plan) { 8405 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8406 8407 // Look for cached value. 8408 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8409 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8410 if (ECEntryIt != EdgeMaskCache.end()) 8411 return ECEntryIt->second; 8412 8413 VPValue *SrcMask = createBlockInMask(Src, Plan); 8414 8415 // The terminator has to be a branch inst! 8416 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8417 assert(BI && "Unexpected terminator found"); 8418 8419 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8420 return EdgeMaskCache[Edge] = SrcMask; 8421 8422 // If source is an exiting block, we know the exit edge is dynamically dead 8423 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8424 // adding uses of an otherwise potentially dead instruction. 
8425 if (OrigLoop->isLoopExiting(Src)) 8426 return EdgeMaskCache[Edge] = SrcMask; 8427 8428 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8429 assert(EdgeMask && "No Edge Mask found for condition"); 8430 8431 if (BI->getSuccessor(0) != Dst) 8432 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8433 8434 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8435 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8436 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8437 // The select version does not introduce new UB if SrcMask is false and 8438 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8439 VPValue *False = Plan->getOrAddVPValue( 8440 ConstantInt::getFalse(BI->getCondition()->getType())); 8441 EdgeMask = 8442 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8443 } 8444 8445 return EdgeMaskCache[Edge] = EdgeMask; 8446 } 8447 8448 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8449 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8450 8451 // Look for cached value. 8452 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8453 if (BCEntryIt != BlockMaskCache.end()) 8454 return BCEntryIt->second; 8455 8456 // All-one mask is modelled as no-mask following the convention for masked 8457 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8458 VPValue *BlockMask = nullptr; 8459 8460 if (OrigLoop->getHeader() == BB) { 8461 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8462 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8463 8464 // Introduce the early-exit compare IV <= BTC to form header block mask. 8465 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8466 // constructing the desired canonical IV in the header block as its first 8467 // non-phi instructions. 8468 assert(CM.foldTailByMasking() && "must fold the tail"); 8469 VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); 8470 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8471 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8472 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8473 8474 VPBuilder::InsertPointGuard Guard(Builder); 8475 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8476 if (CM.TTI.emitGetActiveLaneMask()) { 8477 VPValue *TC = Plan->getOrCreateTripCount(); 8478 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8479 } else { 8480 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8481 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8482 } 8483 return BlockMaskCache[BB] = BlockMask; 8484 } 8485 8486 // This is the block mask. We OR all incoming edges. 8487 for (auto *Predecessor : predecessors(BB)) { 8488 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8489 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8490 return BlockMaskCache[BB] = EdgeMask; 8491 8492 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8493 BlockMask = EdgeMask; 8494 continue; 8495 } 8496 8497 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8498 } 8499 8500 return BlockMaskCache[BB] = BlockMask; 8501 } 8502 8503 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8504 ArrayRef<VPValue *> Operands, 8505 VFRange &Range, 8506 VPlanPtr &Plan) { 8507 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8508 "Must be called with either a load or store"); 8509 8510 auto willWiden = [&](ElementCount VF) -> bool { 8511 if (VF.isScalar()) 8512 return false; 8513 LoopVectorizationCostModel::InstWidening Decision = 8514 CM.getWideningDecision(I, VF); 8515 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8516 "CM decision should be taken at this point."); 8517 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8518 return true; 8519 if (CM.isScalarAfterVectorization(I, VF) || 8520 CM.isProfitableToScalarize(I, VF)) 8521 return false; 8522 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8523 }; 8524 8525 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8526 return nullptr; 8527 8528 VPValue *Mask = nullptr; 8529 if (Legal->isMaskRequired(I)) 8530 Mask = createBlockInMask(I->getParent(), Plan); 8531 8532 // Determine if the pointer operand of the access is either consecutive or 8533 // reverse consecutive. 8534 LoopVectorizationCostModel::InstWidening Decision = 8535 CM.getWideningDecision(I, Range.Start); 8536 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8537 bool Consecutive = 8538 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8539 8540 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8541 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8542 Consecutive, Reverse); 8543 8544 StoreInst *Store = cast<StoreInst>(I); 8545 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8546 Mask, Consecutive, Reverse); 8547 } 8548 8549 VPWidenIntOrFpInductionRecipe * 8550 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8551 ArrayRef<VPValue *> Operands) const { 8552 // Check if this is an integer or fp induction. If so, build the recipe that 8553 // produces its scalar and vector values. 8554 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { 8555 assert(II->getStartValue() == 8556 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8557 return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); 8558 } 8559 8560 return nullptr; 8561 } 8562 8563 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8564 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8565 VPlan &Plan) const { 8566 // Optimize the special case where the source is a constant integer 8567 // induction variable. Notice that we can only optimize the 'trunc' case 8568 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8569 // (c) other casts depend on pointer size. 8570 8571 // Determine whether \p K is a truncation based on an induction variable that 8572 // can be optimized. 
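  // A schematic example (types are illustrative): for a primary induction
  //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %t  = trunc i64 %iv to i32
  // the truncate can be folded into the widened induction, i.e. a vector
  // induction of i32 values is generated directly instead of first widening
  // %iv to <VF x i64> and then truncating each part.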
8573 auto isOptimizableIVTruncate = 8574 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8575 return [=](ElementCount VF) -> bool { 8576 return CM.isOptimizableIVTruncate(K, VF); 8577 }; 8578 }; 8579 8580 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8581 isOptimizableIVTruncate(I), Range)) { 8582 8583 auto *Phi = cast<PHINode>(I->getOperand(0)); 8584 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8585 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8586 return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); 8587 } 8588 return nullptr; 8589 } 8590 8591 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8592 ArrayRef<VPValue *> Operands, 8593 VPlanPtr &Plan) { 8594 // If all incoming values are equal, the incoming VPValue can be used directly 8595 // instead of creating a new VPBlendRecipe. 8596 VPValue *FirstIncoming = Operands[0]; 8597 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8598 return FirstIncoming == Inc; 8599 })) { 8600 return Operands[0]; 8601 } 8602 8603 // We know that all PHIs in non-header blocks are converted into selects, so 8604 // we don't have to worry about the insertion order and we can just use the 8605 // builder. At this point we generate the predication tree. There may be 8606 // duplications since this is a simple recursive scan, but future 8607 // optimizations will clean it up. 8608 SmallVector<VPValue *, 2> OperandsWithMask; 8609 unsigned NumIncoming = Phi->getNumIncomingValues(); 8610 8611 for (unsigned In = 0; In < NumIncoming; In++) { 8612 VPValue *EdgeMask = 8613 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8614 assert((EdgeMask || NumIncoming == 1) && 8615 "Multiple predecessors with one having a full mask"); 8616 OperandsWithMask.push_back(Operands[In]); 8617 if (EdgeMask) 8618 OperandsWithMask.push_back(EdgeMask); 8619 } 8620 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8621 } 8622 8623 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8624 ArrayRef<VPValue *> Operands, 8625 VFRange &Range) const { 8626 8627 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8628 [this, CI](ElementCount VF) { 8629 return CM.isScalarWithPredication(CI, VF); 8630 }, 8631 Range); 8632 8633 if (IsPredicated) 8634 return nullptr; 8635 8636 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8637 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8638 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8639 ID == Intrinsic::pseudoprobe || 8640 ID == Intrinsic::experimental_noalias_scope_decl)) 8641 return nullptr; 8642 8643 auto willWiden = [&](ElementCount VF) -> bool { 8644 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8645 // The following case may be scalarized depending on the VF. 8646 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8647 // version of the instruction. 8648 // Is it beneficial to perform intrinsic call compared to lib call? 8649 bool NeedToScalarize = false; 8650 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8651 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8652 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8653 return UseVectorIntrinsic || !NeedToScalarize; 8654 }; 8655 8656 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8657 return nullptr; 8658 8659 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8660 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8661 } 8662 8663 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8664 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8665 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8666 // Instruction should be widened, unless it is scalar after vectorization, 8667 // scalarization is profitable or it is predicated. 8668 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8669 return CM.isScalarAfterVectorization(I, VF) || 8670 CM.isProfitableToScalarize(I, VF) || 8671 CM.isScalarWithPredication(I, VF); 8672 }; 8673 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8674 Range); 8675 } 8676 8677 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8678 ArrayRef<VPValue *> Operands) const { 8679 auto IsVectorizableOpcode = [](unsigned Opcode) { 8680 switch (Opcode) { 8681 case Instruction::Add: 8682 case Instruction::And: 8683 case Instruction::AShr: 8684 case Instruction::BitCast: 8685 case Instruction::FAdd: 8686 case Instruction::FCmp: 8687 case Instruction::FDiv: 8688 case Instruction::FMul: 8689 case Instruction::FNeg: 8690 case Instruction::FPExt: 8691 case Instruction::FPToSI: 8692 case Instruction::FPToUI: 8693 case Instruction::FPTrunc: 8694 case Instruction::FRem: 8695 case Instruction::FSub: 8696 case Instruction::ICmp: 8697 case Instruction::IntToPtr: 8698 case Instruction::LShr: 8699 case Instruction::Mul: 8700 case Instruction::Or: 8701 case Instruction::PtrToInt: 8702 case Instruction::SDiv: 8703 case Instruction::Select: 8704 case Instruction::SExt: 8705 case Instruction::Shl: 8706 case Instruction::SIToFP: 8707 case Instruction::SRem: 8708 case Instruction::Sub: 8709 case Instruction::Trunc: 8710 case Instruction::UDiv: 8711 case Instruction::UIToFP: 8712 case Instruction::URem: 8713 case Instruction::Xor: 8714 case Instruction::ZExt: 8715 return true; 8716 } 8717 return false; 8718 }; 8719 8720 if (!IsVectorizableOpcode(I->getOpcode())) 8721 return nullptr; 8722 8723 // Success: widen this instruction. 
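  // (Widening here simply emits one vector instruction per unroll part from
  // the widened operands; see VPWidenRecipe::execute further down.)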
8724 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8725 } 8726 8727 void VPRecipeBuilder::fixHeaderPhis() { 8728 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8729 for (VPHeaderPHIRecipe *R : PhisToFix) { 8730 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8731 VPRecipeBase *IncR = 8732 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8733 R->addOperand(IncR->getVPSingleValue()); 8734 } 8735 } 8736 8737 VPBasicBlock *VPRecipeBuilder::handleReplication( 8738 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8739 VPlanPtr &Plan) { 8740 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8741 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8742 Range); 8743 8744 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8745 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8746 Range); 8747 8748 // Even if the instruction is not marked as uniform, there are certain 8749 // intrinsic calls that can be effectively treated as such, so we check for 8750 // them here. Conservatively, we only do this for scalable vectors, since 8751 // for fixed-width VFs we can always fall back on full scalarization. 8752 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8753 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8754 case Intrinsic::assume: 8755 case Intrinsic::lifetime_start: 8756 case Intrinsic::lifetime_end: 8757 // For scalable vectors if one of the operands is variant then we still 8758 // want to mark as uniform, which will generate one instruction for just 8759 // the first lane of the vector. We can't scalarize the call in the same 8760 // way as for fixed-width vectors because we don't know how many lanes 8761 // there are. 8762 // 8763 // The reasons for doing it this way for scalable vectors are: 8764 // 1. For the assume intrinsic generating the instruction for the first 8765 // lane is still be better than not generating any at all. For 8766 // example, the input may be a splat across all lanes. 8767 // 2. For the lifetime start/end intrinsics the pointer operand only 8768 // does anything useful when the input comes from a stack object, 8769 // which suggests it should always be uniform. For non-stack objects 8770 // the effect is to poison the object, which still allows us to 8771 // remove the call. 8772 IsUniform = true; 8773 break; 8774 default: 8775 break; 8776 } 8777 } 8778 8779 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8780 IsUniform, IsPredicated); 8781 setRecipe(I, Recipe); 8782 Plan->addVPValue(I, Recipe); 8783 8784 // Find if I uses a predicated instruction. If so, it will use its scalar 8785 // value. Avoid hoisting the insert-element which packs the scalar value into 8786 // a vector value, as that happens iff all users use the vector value. 8787 for (VPValue *Op : Recipe->operands()) { 8788 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8789 if (!PredR) 8790 continue; 8791 auto *RepR = 8792 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8793 assert(RepR->isPredicated() && 8794 "expected Replicate recipe to be predicated"); 8795 RepR->setAlsoPack(false); 8796 } 8797 8798 // Finalize the recipe for Instr, first if it is not predicated. 
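  // When it is predicated, the recipe is instead wrapped in a triangular
  // replicate region built by createReplicateRegion below, roughly:
  //
  //     pred.<opcode>.entry        BranchOnMask(block-in mask)
  //       |         \
  //       |     pred.<opcode>.if   the predicated VPReplicateRecipe
  //       |         /
  //     pred.<opcode>.continue     VPPredInstPHIRecipe (for non-void results)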
8799 if (!IsPredicated) { 8800 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8801 VPBB->appendRecipe(Recipe); 8802 return VPBB; 8803 } 8804 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8805 8806 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8807 assert(SingleSucc && "VPBB must have a single successor when handling " 8808 "predicated replication."); 8809 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8810 // Record predicated instructions for above packing optimizations. 8811 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8812 VPBlockUtils::insertBlockAfter(Region, VPBB); 8813 auto *RegSucc = new VPBasicBlock(); 8814 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8815 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8816 return RegSucc; 8817 } 8818 8819 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8820 VPRecipeBase *PredRecipe, 8821 VPlanPtr &Plan) { 8822 // Instructions marked for predication are replicated and placed under an 8823 // if-then construct to prevent side-effects. 8824 8825 // Generate recipes to compute the block mask for this region. 8826 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8827 8828 // Build the triangular if-then region. 8829 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8830 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8831 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8832 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8833 auto *PHIRecipe = Instr->getType()->isVoidTy() 8834 ? nullptr 8835 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8836 if (PHIRecipe) { 8837 Plan->removeVPValueFor(Instr); 8838 Plan->addVPValue(Instr, PHIRecipe); 8839 } 8840 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8841 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8842 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8843 8844 // Note: first set Entry as region entry and then connect successors starting 8845 // from it in order, to propagate the "parent" of each VPBasicBlock. 8846 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8847 VPBlockUtils::connectBlocks(Pred, Exit); 8848 8849 return Region; 8850 } 8851 8852 VPRecipeOrVPValueTy 8853 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8854 ArrayRef<VPValue *> Operands, 8855 VFRange &Range, VPlanPtr &Plan) { 8856 // First, check for specific widening recipes that deal with calls, memory 8857 // operations, inductions and Phi nodes. 
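  // The order of the checks below matters: calls and memory operations are
  // handled first, then phis (blends for non-header phis; induction, reduction
  // and first-order-recurrence recipes for header phis), then truncates of
  // inductions, and only then the remaining instructions are considered for
  // generic widening (GEPs, selects and simple unary/binary ops). Anything
  // that cannot be widened is replicated later via handleReplication.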
8858 if (auto *CI = dyn_cast<CallInst>(Instr)) 8859 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8860 8861 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8862 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8863 8864 VPRecipeBase *Recipe; 8865 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8866 if (Phi->getParent() != OrigLoop->getHeader()) 8867 return tryToBlend(Phi, Operands, Plan); 8868 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8869 return toVPRecipeResult(Recipe); 8870 8871 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8872 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { 8873 VPValue *StartV = Operands[0]; 8874 if (Legal->isReductionVariable(Phi)) { 8875 const RecurrenceDescriptor &RdxDesc = 8876 Legal->getReductionVars().find(Phi)->second; 8877 assert(RdxDesc.getRecurrenceStartValue() == 8878 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8879 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8880 CM.isInLoopReduction(Phi), 8881 CM.useOrderedReductions(RdxDesc)); 8882 } else { 8883 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8884 } 8885 8886 // Record the incoming value from the backedge, so we can add the incoming 8887 // value from the backedge after all recipes have been created. 8888 recordRecipeOf(cast<Instruction>( 8889 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8890 PhisToFix.push_back(PhiRecipe); 8891 } else { 8892 // TODO: record backedge value for remaining pointer induction phis. 8893 assert(Phi->getType()->isPointerTy() && 8894 "only pointer phis should be handled here"); 8895 assert(Legal->getInductionVars().count(Phi) && 8896 "Not an induction variable"); 8897 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8898 VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); 8899 PhiRecipe = new VPWidenPHIRecipe(Phi, Start); 8900 } 8901 8902 return toVPRecipeResult(PhiRecipe); 8903 } 8904 8905 if (isa<TruncInst>(Instr) && 8906 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8907 Range, *Plan))) 8908 return toVPRecipeResult(Recipe); 8909 8910 if (!shouldWiden(Instr, Range)) 8911 return nullptr; 8912 8913 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8914 return toVPRecipeResult(new VPWidenGEPRecipe( 8915 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8916 8917 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8918 bool InvariantCond = 8919 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8920 return toVPRecipeResult(new VPWidenSelectRecipe( 8921 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8922 } 8923 8924 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8925 } 8926 8927 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8928 ElementCount MaxVF) { 8929 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8930 8931 // Collect instructions from the original loop that will become trivially dead 8932 // in the vectorized loop. We don't need to vectorize these instructions. For 8933 // example, original induction update instructions can become dead because we 8934 // separately emit induction "steps" when generating code for the new loop. 8935 // Similarly, we create a new latch condition when setting up the structure 8936 // of the new loop, so the old one can become dead. 
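  // For example (schematic), an induction update such as
  //   %iv.next = add nuw nsw i64 %iv, 1
  // whose only users are the induction phi and the latch compare does not need
  // a recipe of its own: the vector loop gets a fresh canonical IV increment
  // and latch branch (see addCanonicalIVRecipes below).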
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst;
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
// BranchOnCount VPInstruction to the latch.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
                                  bool HasNUW, bool IsVPlanNative) {
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  auto *StartV = Plan.getOrAddVPValue(StartIdx);

  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
  if (IsVPlanNative)
    Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
  Header->insert(CanonicalIVPHI, Header->begin());

  auto *CanonicalIVIncrement =
      new VPInstruction(HasNUW ?
VPInstruction::CanonicalIVIncrementNUW 8996 : VPInstruction::CanonicalIVIncrement, 8997 {CanonicalIVPHI}, DL); 8998 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8999 9000 VPBasicBlock *EB = TopRegion->getExitBasicBlock(); 9001 if (IsVPlanNative) { 9002 EB = cast<VPBasicBlock>(EB->getSinglePredecessor()); 9003 EB->setCondBit(nullptr); 9004 } 9005 EB->appendRecipe(CanonicalIVIncrement); 9006 9007 auto *BranchOnCount = 9008 new VPInstruction(VPInstruction::BranchOnCount, 9009 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 9010 EB->appendRecipe(BranchOnCount); 9011 } 9012 9013 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 9014 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 9015 const MapVector<Instruction *, Instruction *> &SinkAfter) { 9016 9017 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9018 9019 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9020 9021 // --------------------------------------------------------------------------- 9022 // Pre-construction: record ingredients whose recipes we'll need to further 9023 // process after constructing the initial VPlan. 9024 // --------------------------------------------------------------------------- 9025 9026 // Mark instructions we'll need to sink later and their targets as 9027 // ingredients whose recipe we'll need to record. 9028 for (auto &Entry : SinkAfter) { 9029 RecipeBuilder.recordRecipeOf(Entry.first); 9030 RecipeBuilder.recordRecipeOf(Entry.second); 9031 } 9032 for (auto &Reduction : CM.getInLoopReductionChains()) { 9033 PHINode *Phi = Reduction.first; 9034 RecurKind Kind = 9035 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 9036 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9037 9038 RecipeBuilder.recordRecipeOf(Phi); 9039 for (auto &R : ReductionOperations) { 9040 RecipeBuilder.recordRecipeOf(R); 9041 // For min/max reducitons, where we have a pair of icmp/select, we also 9042 // need to record the ICmp recipe, so it can be removed later. 9043 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9044 "Only min/max recurrences allowed for inloop reductions"); 9045 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9046 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9047 } 9048 } 9049 9050 // For each interleave group which is relevant for this (possibly trimmed) 9051 // Range, add it to the set of groups to be later applied to the VPlan and add 9052 // placeholders for its members' Recipes which we'll be replacing with a 9053 // single VPInterleaveRecipe. 9054 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9055 auto applyIG = [IG, this](ElementCount VF) -> bool { 9056 return (VF.isVector() && // Query is illegal for VF == 1 9057 CM.getWideningDecision(IG->getInsertPos(), VF) == 9058 LoopVectorizationCostModel::CM_Interleave); 9059 }; 9060 if (!getDecisionAndClampRange(applyIG, Range)) 9061 continue; 9062 InterleaveGroups.insert(IG); 9063 for (unsigned i = 0; i < IG->getFactor(); i++) 9064 if (Instruction *Member = IG->getMember(i)) 9065 RecipeBuilder.recordRecipeOf(Member); 9066 }; 9067 9068 // --------------------------------------------------------------------------- 9069 // Build initial VPlan: Scan the body of the loop in a topological order to 9070 // visit each basic block after having visited its predecessor basic blocks. 
9071 // --------------------------------------------------------------------------- 9072 9073 // Create initial VPlan skeleton, with separate header and latch blocks. 9074 VPBasicBlock *HeaderVPBB = new VPBasicBlock(); 9075 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 9076 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 9077 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 9078 auto Plan = std::make_unique<VPlan>(TopRegion); 9079 9080 Instruction *DLInst = 9081 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 9082 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 9083 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 9084 !CM.foldTailByMasking(), false); 9085 9086 // Scan the body of the loop in a topological order to visit each basic block 9087 // after having visited its predecessor basic blocks. 9088 LoopBlocksDFS DFS(OrigLoop); 9089 DFS.perform(LI); 9090 9091 VPBasicBlock *VPBB = HeaderVPBB; 9092 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 9093 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9094 // Relevant instructions from basic block BB will be grouped into VPRecipe 9095 // ingredients and fill a new VPBasicBlock. 9096 unsigned VPBBsForBB = 0; 9097 VPBB->setName(BB->getName()); 9098 Builder.setInsertPoint(VPBB); 9099 9100 // Introduce each ingredient into VPlan. 9101 // TODO: Model and preserve debug instrinsics in VPlan. 9102 for (Instruction &I : BB->instructionsWithoutDebug()) { 9103 Instruction *Instr = &I; 9104 9105 // First filter out irrelevant instructions, to ensure no recipes are 9106 // built for them. 9107 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9108 continue; 9109 9110 SmallVector<VPValue *, 4> Operands; 9111 auto *Phi = dyn_cast<PHINode>(Instr); 9112 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9113 Operands.push_back(Plan->getOrAddVPValue( 9114 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9115 } else { 9116 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9117 Operands = {OpRange.begin(), OpRange.end()}; 9118 } 9119 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9120 Instr, Operands, Range, Plan)) { 9121 // If Instr can be simplified to an existing VPValue, use it. 9122 if (RecipeOrValue.is<VPValue *>()) { 9123 auto *VPV = RecipeOrValue.get<VPValue *>(); 9124 Plan->addVPValue(Instr, VPV); 9125 // If the re-used value is a recipe, register the recipe for the 9126 // instruction, in case the recipe for Instr needs to be recorded. 9127 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9128 RecipeBuilder.setRecipe(Instr, R); 9129 continue; 9130 } 9131 // Otherwise, add the new recipe. 9132 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9133 for (auto *Def : Recipe->definedValues()) { 9134 auto *UV = Def->getUnderlyingValue(); 9135 Plan->addVPValue(UV, Def); 9136 } 9137 9138 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9139 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9140 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 9141 // of the header block. That can happen for truncates of induction 9142 // variables. Those recipes are moved to the phi section of the header 9143 // block after applying SinkAfter, which relies on the original 9144 // position of the trunc. 
9145 assert(isa<TruncInst>(Instr)); 9146 InductionsToMove.push_back( 9147 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 9148 } 9149 RecipeBuilder.setRecipe(Instr, Recipe); 9150 VPBB->appendRecipe(Recipe); 9151 continue; 9152 } 9153 9154 // Otherwise, if all widening options failed, Instruction is to be 9155 // replicated. This may create a successor for VPBB. 9156 VPBasicBlock *NextVPBB = 9157 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9158 if (NextVPBB != VPBB) { 9159 VPBB = NextVPBB; 9160 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9161 : ""); 9162 } 9163 } 9164 9165 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9166 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9167 } 9168 9169 // Fold the last, empty block into its predecessor. 9170 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 9171 assert(VPBB && "expected to fold last (empty) block"); 9172 // After here, VPBB should not be used. 9173 VPBB = nullptr; 9174 9175 assert(isa<VPRegionBlock>(Plan->getEntry()) && 9176 !Plan->getEntry()->getEntryBasicBlock()->empty() && 9177 "entry block must be set to a VPRegionBlock having a non-empty entry " 9178 "VPBasicBlock"); 9179 RecipeBuilder.fixHeaderPhis(); 9180 9181 // --------------------------------------------------------------------------- 9182 // Transform initial VPlan: Apply previously taken decisions, in order, to 9183 // bring the VPlan to its final state. 9184 // --------------------------------------------------------------------------- 9185 9186 // Apply Sink-After legal constraints. 9187 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9188 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9189 if (Region && Region->isReplicator()) { 9190 assert(Region->getNumSuccessors() == 1 && 9191 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 9192 assert(R->getParent()->size() == 1 && 9193 "A recipe in an original replicator region must be the only " 9194 "recipe in its block"); 9195 return Region; 9196 } 9197 return nullptr; 9198 }; 9199 for (auto &Entry : SinkAfter) { 9200 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9201 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9202 9203 auto *TargetRegion = GetReplicateRegion(Target); 9204 auto *SinkRegion = GetReplicateRegion(Sink); 9205 if (!SinkRegion) { 9206 // If the sink source is not a replicate region, sink the recipe directly. 9207 if (TargetRegion) { 9208 // The target is in a replication region, make sure to move Sink to 9209 // the block after it, not into the replication region itself. 9210 VPBasicBlock *NextBlock = 9211 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9212 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9213 } else 9214 Sink->moveAfter(Target); 9215 continue; 9216 } 9217 9218 // The sink source is in a replicate region. Unhook the region from the CFG. 9219 auto *SinkPred = SinkRegion->getSinglePredecessor(); 9220 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 9221 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 9222 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 9223 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 9224 9225 if (TargetRegion) { 9226 // The target recipe is also in a replicate region, move the sink region 9227 // after the target region. 
9228 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 9229 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 9230 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 9231 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 9232 } else { 9233 // The sink source is in a replicate region, we need to move the whole 9234 // replicate region, which should only contain a single recipe in the 9235 // main block. 9236 auto *SplitBlock = 9237 Target->getParent()->splitAt(std::next(Target->getIterator())); 9238 9239 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9240 9241 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9242 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9243 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9244 } 9245 } 9246 9247 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9248 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9249 9250 // Now that sink-after is done, move induction recipes for optimized truncates 9251 // to the phi section of the header block. 9252 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 9253 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9254 9255 // Adjust the recipes for any inloop reductions. 9256 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 9257 RecipeBuilder, Range.Start); 9258 9259 // Introduce a recipe to combine the incoming and previous values of a 9260 // first-order recurrence. 9261 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9262 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 9263 if (!RecurPhi) 9264 continue; 9265 9266 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 9267 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 9268 auto *Region = GetReplicateRegion(PrevRecipe); 9269 if (Region) 9270 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 9271 if (Region || PrevRecipe->isPhi()) 9272 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 9273 else 9274 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9275 9276 auto *RecurSplice = cast<VPInstruction>( 9277 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9278 {RecurPhi, RecurPhi->getBackedgeValue()})); 9279 9280 RecurPhi->replaceAllUsesWith(RecurSplice); 9281 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9282 // all users. 9283 RecurSplice->setOperand(0, RecurPhi); 9284 } 9285 9286 // Interleave memory: for each Interleave Group we marked earlier as relevant 9287 // for this VPlan, replace the Recipes widening its memory instructions with a 9288 // single VPInterleaveRecipe at its insertion point. 
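  // For example (schematic), a factor-2 group of loads
  //   %a = load i32, i32* %gep.even   ; group insert position
  //   %b = load i32, i32* %gep.odd
  // is replaced by a single VPInterleaveRecipe at the insert position. The
  // recipe defines one VPValue per non-void member; all uses of the original
  // member recipes are redirected to those values and the member recipes are
  // erased.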
9289 for (auto IG : InterleaveGroups) { 9290 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9291 RecipeBuilder.getRecipe(IG->getInsertPos())); 9292 SmallVector<VPValue *, 4> StoredValues; 9293 for (unsigned i = 0; i < IG->getFactor(); ++i) 9294 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9295 auto *StoreR = 9296 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9297 StoredValues.push_back(StoreR->getStoredValue()); 9298 } 9299 9300 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9301 Recipe->getMask()); 9302 VPIG->insertBefore(Recipe); 9303 unsigned J = 0; 9304 for (unsigned i = 0; i < IG->getFactor(); ++i) 9305 if (Instruction *Member = IG->getMember(i)) { 9306 if (!Member->getType()->isVoidTy()) { 9307 VPValue *OriginalV = Plan->getVPValue(Member); 9308 Plan->removeVPValueFor(Member); 9309 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9310 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9311 J++; 9312 } 9313 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9314 } 9315 } 9316 9317 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9318 // in ways that accessing values using original IR values is incorrect. 9319 Plan->disableValue2VPValue(); 9320 9321 VPlanTransforms::sinkScalarOperands(*Plan); 9322 VPlanTransforms::mergeReplicateRegions(*Plan); 9323 9324 std::string PlanName; 9325 raw_string_ostream RSO(PlanName); 9326 ElementCount VF = Range.Start; 9327 Plan->addVF(VF); 9328 RSO << "Initial VPlan for VF={" << VF; 9329 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9330 Plan->addVF(VF); 9331 RSO << "," << VF; 9332 } 9333 RSO << "},UF>=1"; 9334 RSO.flush(); 9335 Plan->setName(PlanName); 9336 9337 // Fold Exit block into its predecessor if possible. 9338 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9339 // VPBasicBlock as exit. 9340 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); 9341 9342 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9343 return Plan; 9344 } 9345 9346 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9347 // Outer loop handling: They may require CFG and instruction level 9348 // transformations before even evaluating whether vectorization is profitable. 9349 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9350 // the vectorization pipeline. 9351 assert(!OrigLoop->isInnermost()); 9352 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9353 9354 // Create new empty VPlan 9355 auto Plan = std::make_unique<VPlan>(); 9356 9357 // Build hierarchical CFG 9358 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9359 HCFGBuilder.buildHierarchicalCFG(); 9360 9361 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9362 VF *= 2) 9363 Plan->addVF(VF); 9364 9365 if (EnableVPlanPredication) { 9366 VPlanPredicator VPP(*Plan); 9367 VPP.predicate(); 9368 9369 // Avoid running transformation to recipes until masked code generation in 9370 // VPlan-native path is in place. 9371 return Plan; 9372 } 9373 9374 SmallPtrSet<Instruction *, 1> DeadInstructions; 9375 VPlanTransforms::VPInstructionsToVPRecipes( 9376 OrigLoop, Plan, 9377 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9378 DeadInstructions, *PSE.getSE()); 9379 9380 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), 9381 true, true); 9382 return Plan; 9383 } 9384 9385 // Adjust the recipes for reductions. 
// For in-loop reductions, the chain of instructions leading from the loop
// exit instr to the phi needs to be converted to reductions, with one operand
// being vector and the other being the scalar reduction chain. For other
// reductions, a select is introduced between the phi and live-out recipes when
// folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For min/max reductions the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
                (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;

      if (IsFMulAdd) {
        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe to use as the vector operand for the
        // fadd reduction.
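        // Schematically, a reduction step such as
        //   %sum.next = call float @llvm.fmuladd.f32(float %a, float %b,
        //                                            float %sum)
        // becomes an FMul VPInstruction computing %a * %b that feeds an
        // fadd-based VPReductionRecipe whose chain operand is %sum.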
9441 VPInstruction *FMulRecipe = new VPInstruction( 9442 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9443 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9444 WidenRecipe->getParent()->insert(FMulRecipe, 9445 WidenRecipe->getIterator()); 9446 VecOp = FMulRecipe; 9447 } 9448 VPReductionRecipe *RedRecipe = 9449 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9450 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9451 Plan->removeVPValueFor(R); 9452 Plan->addVPValue(R, RedRecipe); 9453 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9454 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9455 WidenRecipe->eraseFromParent(); 9456 9457 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9458 VPRecipeBase *CompareRecipe = 9459 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9460 assert(isa<VPWidenRecipe>(CompareRecipe) && 9461 "Expected to replace a VPWidenSC"); 9462 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9463 "Expected no remaining users"); 9464 CompareRecipe->eraseFromParent(); 9465 } 9466 Chain = R; 9467 } 9468 } 9469 9470 // If tail is folded by masking, introduce selects between the phi 9471 // and the live-out instruction of each reduction, at the beginning of the 9472 // dedicated latch block. 9473 if (CM.foldTailByMasking()) { 9474 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9475 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) { 9476 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9477 if (!PhiR || PhiR->isInLoop()) 9478 continue; 9479 VPValue *Cond = 9480 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9481 VPValue *Red = PhiR->getBackedgeValue(); 9482 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9483 "reduction recipe must be defined before latch"); 9484 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9485 } 9486 } 9487 } 9488 9489 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9490 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9491 VPSlotTracker &SlotTracker) const { 9492 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9493 IG->getInsertPos()->printAsOperand(O, false); 9494 O << ", "; 9495 getAddr()->printAsOperand(O, SlotTracker); 9496 VPValue *Mask = getMask(); 9497 if (Mask) { 9498 O << ", "; 9499 Mask->printAsOperand(O, SlotTracker); 9500 } 9501 9502 unsigned OpIdx = 0; 9503 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9504 if (!IG->getMember(i)) 9505 continue; 9506 if (getNumStoreOperands() > 0) { 9507 O << "\n" << Indent << " store "; 9508 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9509 O << " to index " << i; 9510 } else { 9511 O << "\n" << Indent << " "; 9512 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9513 O << " = load from index " << i; 9514 } 9515 ++OpIdx; 9516 } 9517 } 9518 #endif 9519 9520 void VPWidenCallRecipe::execute(VPTransformState &State) { 9521 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9522 *this, State); 9523 } 9524 9525 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9526 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9527 State.ILV->setDebugLocFromInst(&I); 9528 9529 // The condition can be loop invariant but still defined inside the 9530 // loop. This means that we can't just use the original 'cond' value. 9531 // We have to take the 'vectorized' value and pick the first lane. 
9532 // Instcombine will make this a no-op. 9533 auto *InvarCond = 9534 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9535 9536 for (unsigned Part = 0; Part < State.UF; ++Part) { 9537 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9538 Value *Op0 = State.get(getOperand(1), Part); 9539 Value *Op1 = State.get(getOperand(2), Part); 9540 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9541 State.set(this, Sel, Part); 9542 State.ILV->addMetadata(Sel, &I); 9543 } 9544 } 9545 9546 void VPWidenRecipe::execute(VPTransformState &State) { 9547 auto &I = *cast<Instruction>(getUnderlyingValue()); 9548 auto &Builder = State.Builder; 9549 switch (I.getOpcode()) { 9550 case Instruction::Call: 9551 case Instruction::Br: 9552 case Instruction::PHI: 9553 case Instruction::GetElementPtr: 9554 case Instruction::Select: 9555 llvm_unreachable("This instruction is handled by a different recipe."); 9556 case Instruction::UDiv: 9557 case Instruction::SDiv: 9558 case Instruction::SRem: 9559 case Instruction::URem: 9560 case Instruction::Add: 9561 case Instruction::FAdd: 9562 case Instruction::Sub: 9563 case Instruction::FSub: 9564 case Instruction::FNeg: 9565 case Instruction::Mul: 9566 case Instruction::FMul: 9567 case Instruction::FDiv: 9568 case Instruction::FRem: 9569 case Instruction::Shl: 9570 case Instruction::LShr: 9571 case Instruction::AShr: 9572 case Instruction::And: 9573 case Instruction::Or: 9574 case Instruction::Xor: { 9575 // Just widen unops and binops. 9576 State.ILV->setDebugLocFromInst(&I); 9577 9578 for (unsigned Part = 0; Part < State.UF; ++Part) { 9579 SmallVector<Value *, 2> Ops; 9580 for (VPValue *VPOp : operands()) 9581 Ops.push_back(State.get(VPOp, Part)); 9582 9583 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9584 9585 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9586 VecOp->copyIRFlags(&I); 9587 9588 // If the instruction is vectorized and was in a basic block that needed 9589 // predication, we can't propagate poison-generating flags (nuw/nsw, 9590 // exact, etc.). The control flow has been linearized and the 9591 // instruction is no longer guarded by the predicate, which could make 9592 // the flag properties to no longer hold. 9593 if (State.MayGeneratePoisonRecipes.contains(this)) 9594 VecOp->dropPoisonGeneratingFlags(); 9595 } 9596 9597 // Use this vector value for all users of the original instruction. 9598 State.set(this, V, Part); 9599 State.ILV->addMetadata(V, &I); 9600 } 9601 9602 break; 9603 } 9604 case Instruction::ICmp: 9605 case Instruction::FCmp: { 9606 // Widen compares. Generate vector compares. 9607 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9608 auto *Cmp = cast<CmpInst>(&I); 9609 State.ILV->setDebugLocFromInst(Cmp); 9610 for (unsigned Part = 0; Part < State.UF; ++Part) { 9611 Value *A = State.get(getOperand(0), Part); 9612 Value *B = State.get(getOperand(1), Part); 9613 Value *C = nullptr; 9614 if (FCmp) { 9615 // Propagate fast math flags. 
9616 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9617 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9618 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9619 } else { 9620 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9621 } 9622 State.set(this, C, Part); 9623 State.ILV->addMetadata(C, &I); 9624 } 9625 9626 break; 9627 } 9628 9629 case Instruction::ZExt: 9630 case Instruction::SExt: 9631 case Instruction::FPToUI: 9632 case Instruction::FPToSI: 9633 case Instruction::FPExt: 9634 case Instruction::PtrToInt: 9635 case Instruction::IntToPtr: 9636 case Instruction::SIToFP: 9637 case Instruction::UIToFP: 9638 case Instruction::Trunc: 9639 case Instruction::FPTrunc: 9640 case Instruction::BitCast: { 9641 auto *CI = cast<CastInst>(&I); 9642 State.ILV->setDebugLocFromInst(CI); 9643 9644 /// Vectorize casts. 9645 Type *DestTy = (State.VF.isScalar()) 9646 ? CI->getType() 9647 : VectorType::get(CI->getType(), State.VF); 9648 9649 for (unsigned Part = 0; Part < State.UF; ++Part) { 9650 Value *A = State.get(getOperand(0), Part); 9651 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9652 State.set(this, Cast, Part); 9653 State.ILV->addMetadata(Cast, &I); 9654 } 9655 break; 9656 } 9657 default: 9658 // This instruction is not vectorized by simple widening. 9659 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9660 llvm_unreachable("Unhandled instruction!"); 9661 } // end of switch. 9662 } 9663 9664 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9665 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9666 // Construct a vector GEP by widening the operands of the scalar GEP as 9667 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9668 // results in a vector of pointers when at least one operand of the GEP 9669 // is vector-typed. Thus, to keep the representation compact, we only use 9670 // vector-typed operands for loop-varying values. 9671 9672 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9673 // If we are vectorizing, but the GEP has only loop-invariant operands, 9674 // the GEP we build (by only using vector-typed operands for 9675 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9676 // produce a vector of pointers, we need to either arbitrarily pick an 9677 // operand to broadcast, or broadcast a clone of the original GEP. 9678 // Here, we broadcast a clone of the original. 9679 // 9680 // TODO: If at some point we decide to scalarize instructions having 9681 // loop-invariant operands, this special case will no longer be 9682 // required. We would add the scalarization decision to 9683 // collectLoopScalars() and teach getVectorValue() to broadcast 9684 // the lane-zero scalar value. 9685 auto *Clone = State.Builder.Insert(GEP->clone()); 9686 for (unsigned Part = 0; Part < State.UF; ++Part) { 9687 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9688 State.set(this, EntryPart, Part); 9689 State.ILV->addMetadata(EntryPart, GEP); 9690 } 9691 } else { 9692 // If the GEP has at least one loop-varying operand, we are sure to 9693 // produce a vector of pointers. But if we are only unrolling, we want 9694 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9695 // produce with the code below will be scalar (if VF == 1) or vector 9696 // (otherwise). Note that for the unroll-only case, we still maintain 9697 // values in the vector mapping with initVector, as we do for other 9698 // instructions. 
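    // Create one GEP per unroll part, using scalar (lane 0) values for
    // loop-invariant operands and per-part vector values for loop-varying ones.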
9699 for (unsigned Part = 0; Part < State.UF; ++Part) { 9700 // The pointer operand of the new GEP. If it's loop-invariant, we 9701 // won't broadcast it. 9702 auto *Ptr = IsPtrLoopInvariant 9703 ? State.get(getOperand(0), VPIteration(0, 0)) 9704 : State.get(getOperand(0), Part); 9705 9706 // Collect all the indices for the new GEP. If any index is 9707 // loop-invariant, we won't broadcast it. 9708 SmallVector<Value *, 4> Indices; 9709 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9710 VPValue *Operand = getOperand(I); 9711 if (IsIndexLoopInvariant[I - 1]) 9712 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9713 else 9714 Indices.push_back(State.get(Operand, Part)); 9715 } 9716 9717 // If the GEP instruction is vectorized and was in a basic block that 9718 // needed predication, we can't propagate the poison-generating 'inbounds' 9719 // flag. The control flow has been linearized and the GEP is no longer 9720 // guarded by the predicate, which could make the 'inbounds' properties to 9721 // no longer hold. 9722 bool IsInBounds = 9723 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9724 9725 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9726 // but it should be a vector, otherwise. 9727 auto *NewGEP = IsInBounds 9728 ? State.Builder.CreateInBoundsGEP( 9729 GEP->getSourceElementType(), Ptr, Indices) 9730 : State.Builder.CreateGEP(GEP->getSourceElementType(), 9731 Ptr, Indices); 9732 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9733 "NewGEP is not a pointer vector"); 9734 State.set(this, NewGEP, Part); 9735 State.ILV->addMetadata(NewGEP, GEP); 9736 } 9737 } 9738 } 9739 9740 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9741 assert(!State.Instance && "Int or FP induction being replicated."); 9742 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9743 State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV); 9744 } 9745 9746 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9747 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9748 State); 9749 } 9750 9751 void VPBlendRecipe::execute(VPTransformState &State) { 9752 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9753 // We know that all PHIs in non-header blocks are converted into 9754 // selects, so we don't have to worry about the insertion order and we 9755 // can just use the builder. 9756 // At this point we generate the predication tree. There may be 9757 // duplications since this is a simple recursive scan, but future 9758 // optimizations will clean it up. 9759 9760 unsigned NumIncoming = getNumIncomingValues(); 9761 9762 // Generate a sequence of selects of the form: 9763 // SELECT(Mask3, In3, 9764 // SELECT(Mask2, In2, 9765 // SELECT(Mask1, In1, 9766 // In0))) 9767 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9768 // are essentially undef are taken from In0. 9769 InnerLoopVectorizer::VectorParts Entry(State.UF); 9770 for (unsigned In = 0; In < NumIncoming; ++In) { 9771 for (unsigned Part = 0; Part < State.UF; ++Part) { 9772 // We might have single edge PHIs (blocks) - use an identity 9773 // 'select' for the first PHI operand. 9774 Value *In0 = State.get(getIncomingValue(In), Part); 9775 if (In == 0) 9776 Entry[Part] = In0; // Initialize with the first incoming value. 9777 else { 9778 // Select between the current value and the previous incoming edge 9779 // based on the incoming mask. 
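        // getMask(In) is the edge mask associated with the In'th incoming value.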
9780 Value *Cond = State.get(getMask(In), Part); 9781 Entry[Part] = 9782 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9783 } 9784 } 9785 } 9786 for (unsigned Part = 0; Part < State.UF; ++Part) 9787 State.set(this, Entry[Part], Part); 9788 } 9789 9790 void VPInterleaveRecipe::execute(VPTransformState &State) { 9791 assert(!State.Instance && "Interleave group being replicated."); 9792 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9793 getStoredValues(), getMask()); 9794 } 9795 9796 void VPReductionRecipe::execute(VPTransformState &State) { 9797 assert(!State.Instance && "Reduction being replicated."); 9798 Value *PrevInChain = State.get(getChainOp(), 0); 9799 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9800 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9801 // Propagate the fast-math flags carried by the underlying instruction. 9802 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9803 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9804 for (unsigned Part = 0; Part < State.UF; ++Part) { 9805 Value *NewVecOp = State.get(getVecOp(), Part); 9806 if (VPValue *Cond = getCondOp()) { 9807 Value *NewCond = State.get(Cond, Part); 9808 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9809 Value *Iden = RdxDesc->getRecurrenceIdentity( 9810 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9811 Value *IdenVec = 9812 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9813 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9814 NewVecOp = Select; 9815 } 9816 Value *NewRed; 9817 Value *NextInChain; 9818 if (IsOrdered) { 9819 if (State.VF.isVector()) 9820 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9821 PrevInChain); 9822 else 9823 NewRed = State.Builder.CreateBinOp( 9824 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9825 NewVecOp); 9826 PrevInChain = NewRed; 9827 } else { 9828 PrevInChain = State.get(getChainOp(), Part); 9829 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9830 } 9831 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9832 NextInChain = 9833 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9834 NewRed, PrevInChain); 9835 } else if (IsOrdered) 9836 NextInChain = NewRed; 9837 else 9838 NextInChain = State.Builder.CreateBinOp( 9839 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9840 PrevInChain); 9841 State.set(this, NextInChain, Part); 9842 } 9843 } 9844 9845 void VPReplicateRecipe::execute(VPTransformState &State) { 9846 if (State.Instance) { // Generate a single instance. 9847 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9848 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance, 9849 IsPredicated, State); 9850 // Insert scalar instance packing it into a vector. 9851 if (AlsoPack && State.VF.isVector()) { 9852 // If we're constructing lane 0, initialize to start from poison. 
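      // Each lane's instance then packs its scalar result into that vector.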
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
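  // State.hasVectorValue() below distinguishes the two cases.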
9919 unsigned Part = State.Instance->Part; 9920 if (State.hasVectorValue(getOperand(0), Part)) { 9921 Value *VectorValue = State.get(getOperand(0), Part); 9922 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9923 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9924 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9925 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9926 if (State.hasVectorValue(this, Part)) 9927 State.reset(this, VPhi, Part); 9928 else 9929 State.set(this, VPhi, Part); 9930 // NOTE: Currently we need to update the value of the operand, so the next 9931 // predicated iteration inserts its generated value in the correct vector. 9932 State.reset(getOperand(0), VPhi, Part); 9933 } else { 9934 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9935 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9936 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9937 PredicatingBB); 9938 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9939 if (State.hasScalarValue(this, *State.Instance)) 9940 State.reset(this, Phi, *State.Instance); 9941 else 9942 State.set(this, Phi, *State.Instance); 9943 // NOTE: Currently we need to update the value of the operand, so the next 9944 // predicated iteration inserts its generated value in the correct vector. 9945 State.reset(getOperand(0), Phi, *State.Instance); 9946 } 9947 } 9948 9949 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9950 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9951 9952 // Attempt to issue a wide load. 9953 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9954 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9955 9956 assert((LI || SI) && "Invalid Load/Store instruction"); 9957 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9958 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9959 9960 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9961 9962 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9963 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9964 bool CreateGatherScatter = !Consecutive; 9965 9966 auto &Builder = State.Builder; 9967 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9968 bool isMaskRequired = getMask(); 9969 if (isMaskRequired) 9970 for (unsigned Part = 0; Part < State.UF; ++Part) 9971 BlockInMaskParts[Part] = State.get(getMask(), Part); 9972 9973 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9974 // Calculate the pointer for the specific unroll-part. 9975 GetElementPtrInst *PartPtr = nullptr; 9976 9977 bool InBounds = false; 9978 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9979 InBounds = gep->isInBounds(); 9980 if (Reverse) { 9981 // If the address is consecutive but reversed, then the 9982 // wide store needs to start at the last vector element. 
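      // Overall the part's pointer is Ptr - (Part + 1) * RunTimeVF + 1, formed
      // by the two GEPs below.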
9983 // RunTimeVF = VScale * VF.getKnownMinValue() 9984 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9985 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9986 // NumElt = -Part * RunTimeVF 9987 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9988 // LastLane = 1 - RunTimeVF 9989 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9990 PartPtr = 9991 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9992 PartPtr->setIsInBounds(InBounds); 9993 PartPtr = cast<GetElementPtrInst>( 9994 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9995 PartPtr->setIsInBounds(InBounds); 9996 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 9997 BlockInMaskParts[Part] = 9998 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9999 } else { 10000 Value *Increment = 10001 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 10002 PartPtr = cast<GetElementPtrInst>( 10003 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 10004 PartPtr->setIsInBounds(InBounds); 10005 } 10006 10007 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 10008 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 10009 }; 10010 10011 // Handle Stores: 10012 if (SI) { 10013 State.ILV->setDebugLocFromInst(SI); 10014 10015 for (unsigned Part = 0; Part < State.UF; ++Part) { 10016 Instruction *NewSI = nullptr; 10017 Value *StoredVal = State.get(StoredValue, Part); 10018 if (CreateGatherScatter) { 10019 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10020 Value *VectorGep = State.get(getAddr(), Part); 10021 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 10022 MaskPart); 10023 } else { 10024 if (Reverse) { 10025 // If we store to reverse consecutive memory locations, then we need 10026 // to reverse the order of elements in the stored value. 10027 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 10028 // We don't want to update the value in the map as it might be used in 10029 // another expression. So don't call resetVectorValue(StoredVal). 10030 } 10031 auto *VecPtr = 10032 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10033 if (isMaskRequired) 10034 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 10035 BlockInMaskParts[Part]); 10036 else 10037 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 10038 } 10039 State.ILV->addMetadata(NewSI, SI); 10040 } 10041 return; 10042 } 10043 10044 // Handle loads. 10045 assert(LI && "Must have a load instruction"); 10046 State.ILV->setDebugLocFromInst(LI); 10047 for (unsigned Part = 0; Part < State.UF; ++Part) { 10048 Value *NewLI; 10049 if (CreateGatherScatter) { 10050 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10051 Value *VectorGep = State.get(getAddr(), Part); 10052 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10053 nullptr, "wide.masked.gather"); 10054 State.ILV->addMetadata(NewLI, LI); 10055 } else { 10056 auto *VecPtr = 10057 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10058 if (isMaskRequired) 10059 NewLI = Builder.CreateMaskedLoad( 10060 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10061 PoisonValue::get(DataTy), "wide.masked.load"); 10062 else 10063 NewLI = 10064 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10065 10066 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
10067 State.ILV->addMetadata(NewLI, LI); 10068 if (Reverse) 10069 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10070 } 10071 10072 State.set(this, NewLI, Part); 10073 } 10074 } 10075 10076 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10077 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10078 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10079 // for predication. 10080 static ScalarEpilogueLowering getScalarEpilogueLowering( 10081 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10082 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10083 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10084 LoopVectorizationLegality &LVL) { 10085 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10086 // don't look at hints or options, and don't request a scalar epilogue. 10087 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10088 // LoopAccessInfo (due to code dependency and not being able to reliably get 10089 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10090 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10091 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10092 // back to the old way and vectorize with versioning when forced. See D81345.) 10093 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10094 PGSOQueryType::IRPass) && 10095 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10096 return CM_ScalarEpilogueNotAllowedOptSize; 10097 10098 // 2) If set, obey the directives 10099 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10100 switch (PreferPredicateOverEpilogue) { 10101 case PreferPredicateTy::ScalarEpilogue: 10102 return CM_ScalarEpilogueAllowed; 10103 case PreferPredicateTy::PredicateElseScalarEpilogue: 10104 return CM_ScalarEpilogueNotNeededUsePredicate; 10105 case PreferPredicateTy::PredicateOrDontVectorize: 10106 return CM_ScalarEpilogueNotAllowedUsePredicate; 10107 }; 10108 } 10109 10110 // 3) If set, obey the hints 10111 switch (Hints.getPredicate()) { 10112 case LoopVectorizeHints::FK_Enabled: 10113 return CM_ScalarEpilogueNotNeededUsePredicate; 10114 case LoopVectorizeHints::FK_Disabled: 10115 return CM_ScalarEpilogueAllowed; 10116 }; 10117 10118 // 4) if the TTI hook indicates this is profitable, request predication. 10119 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10120 LVL.getLAI())) 10121 return CM_ScalarEpilogueNotNeededUsePredicate; 10122 10123 return CM_ScalarEpilogueAllowed; 10124 } 10125 10126 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10127 // If Values have been set for this Def return the one relevant for \p Part. 10128 if (hasVectorValue(Def, Part)) 10129 return Data.PerPartOutput[Def][Part]; 10130 10131 if (!hasScalarValue(Def, {Part, 0})) { 10132 Value *IRV = Def->getLiveInIRValue(); 10133 Value *B = ILV->getBroadcastInstrs(IRV); 10134 set(Def, B, Part); 10135 return B; 10136 } 10137 10138 Value *ScalarValue = get(Def, {Part, 0}); 10139 // If we aren't vectorizing, we can just copy the scalar map values over 10140 // to the vector map. 10141 if (VF.isScalar()) { 10142 set(Def, ScalarValue, Part); 10143 return ScalarValue; 10144 } 10145 10146 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10147 bool IsUniform = RepR && RepR->isUniform(); 10148 10149 unsigned LastLane = IsUniform ? 
0 : VF.getKnownMinValue() - 1; 10150 // Check if there is a scalar value for the selected lane. 10151 if (!hasScalarValue(Def, {Part, LastLane})) { 10152 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 10153 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 10154 "unexpected recipe found to be invariant"); 10155 IsUniform = true; 10156 LastLane = 0; 10157 } 10158 10159 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10160 // Set the insert point after the last scalarized instruction or after the 10161 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10162 // will directly follow the scalar definitions. 10163 auto OldIP = Builder.saveIP(); 10164 auto NewIP = 10165 isa<PHINode>(LastInst) 10166 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10167 : std::next(BasicBlock::iterator(LastInst)); 10168 Builder.SetInsertPoint(&*NewIP); 10169 10170 // However, if we are vectorizing, we need to construct the vector values. 10171 // If the value is known to be uniform after vectorization, we can just 10172 // broadcast the scalar value corresponding to lane zero for each unroll 10173 // iteration. Otherwise, we construct the vector values using 10174 // insertelement instructions. Since the resulting vectors are stored in 10175 // State, we will only generate the insertelements once. 10176 Value *VectorValue = nullptr; 10177 if (IsUniform) { 10178 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10179 set(Def, VectorValue, Part); 10180 } else { 10181 // Initialize packing with insertelements to start from undef. 10182 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10183 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10184 set(Def, Undef, Part); 10185 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10186 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10187 VectorValue = get(Def, Part); 10188 } 10189 Builder.restoreIP(OldIP); 10190 return VectorValue; 10191 } 10192 10193 // Process the loop in the VPlan-native vectorization path. This path builds 10194 // VPlan upfront in the vectorization pipeline, which allows to apply 10195 // VPlan-to-VPlan transformations from the very beginning without modifying the 10196 // input LLVM IR. 10197 static bool processLoopInVPlanNativePath( 10198 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10199 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10200 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10201 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10202 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10203 LoopVectorizationRequirements &Requirements) { 10204 10205 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10206 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10207 return false; 10208 } 10209 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10210 Function *F = L->getHeader()->getParent(); 10211 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10212 10213 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10214 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10215 10216 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10217 &Hints, IAI); 10218 // Use the planner for outer loop vectorization. 10219 // TODO: CM is not used at this point inside the planner. 
Turn CM into an 10220 // optional argument if we don't need it in the future. 10221 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10222 Requirements, ORE); 10223 10224 // Get user vectorization factor. 10225 ElementCount UserVF = Hints.getWidth(); 10226 10227 CM.collectElementTypesForWidening(); 10228 10229 // Plan how to best vectorize, return the best VF and its cost. 10230 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10231 10232 // If we are stress testing VPlan builds, do not attempt to generate vector 10233 // code. Masked vector code generation support will follow soon. 10234 // Also, do not attempt to vectorize if no vector code will be produced. 10235 if (VPlanBuildStressTest || EnableVPlanPredication || 10236 VectorizationFactor::Disabled() == VF) 10237 return false; 10238 10239 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10240 10241 { 10242 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10243 F->getParent()->getDataLayout()); 10244 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10245 &CM, BFI, PSI, Checks); 10246 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10247 << L->getHeader()->getParent()->getName() << "\"\n"); 10248 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10249 } 10250 10251 // Mark the loop as already vectorized to avoid vectorizing again. 10252 Hints.setAlreadyVectorized(); 10253 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10254 return true; 10255 } 10256 10257 // Emit a remark if there are stores to floats that required a floating point 10258 // extension. If the vectorized loop was generated with floating point there 10259 // will be a performance penalty from the conversion overhead and the change in 10260 // the vector width. 10261 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10262 SmallVector<Instruction *, 4> Worklist; 10263 for (BasicBlock *BB : L->getBlocks()) { 10264 for (Instruction &Inst : *BB) { 10265 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10266 if (S->getValueOperand()->getType()->isFloatTy()) 10267 Worklist.push_back(S); 10268 } 10269 } 10270 } 10271 10272 // Traverse the floating point stores upwards searching, for floating point 10273 // conversions. 10274 SmallPtrSet<const Instruction *, 4> Visited; 10275 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10276 while (!Worklist.empty()) { 10277 auto *I = Worklist.pop_back_val(); 10278 if (!L->contains(I)) 10279 continue; 10280 if (!Visited.insert(I).second) 10281 continue; 10282 10283 // Emit a remark if the floating point store required a floating 10284 // point conversion. 10285 // TODO: More work could be done to identify the root cause such as a 10286 // constant or a function return type and point the user to it. 10287 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10288 ORE->emit([&]() { 10289 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10290 I->getDebugLoc(), L->getHeader()) 10291 << "floating point conversion changes vector width. 
" 10292 << "Mixed floating point precision requires an up/down " 10293 << "cast that will negatively impact performance."; 10294 }); 10295 10296 for (Use &Op : I->operands()) 10297 if (auto *OpI = dyn_cast<Instruction>(Op)) 10298 Worklist.push_back(OpI); 10299 } 10300 } 10301 10302 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10303 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10304 !EnableLoopInterleaving), 10305 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10306 !EnableLoopVectorization) {} 10307 10308 bool LoopVectorizePass::processLoop(Loop *L) { 10309 assert((EnableVPlanNativePath || L->isInnermost()) && 10310 "VPlan-native path is not enabled. Only process inner loops."); 10311 10312 #ifndef NDEBUG 10313 const std::string DebugLocStr = getDebugLocString(L); 10314 #endif /* NDEBUG */ 10315 10316 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 10317 << L->getHeader()->getParent()->getName() << "\" from " 10318 << DebugLocStr << "\n"); 10319 10320 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10321 10322 LLVM_DEBUG( 10323 dbgs() << "LV: Loop hints:" 10324 << " force=" 10325 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10326 ? "disabled" 10327 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10328 ? "enabled" 10329 : "?")) 10330 << " width=" << Hints.getWidth() 10331 << " interleave=" << Hints.getInterleave() << "\n"); 10332 10333 // Function containing loop 10334 Function *F = L->getHeader()->getParent(); 10335 10336 // Looking at the diagnostic output is the only way to determine if a loop 10337 // was vectorized (other than looking at the IR or machine code), so it 10338 // is important to generate an optimization remark for each loop. Most of 10339 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10340 // generated as OptimizationRemark and OptimizationRemarkMissed are 10341 // less verbose reporting vectorized loops and unvectorized loops that may 10342 // benefit from vectorization, respectively. 10343 10344 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10345 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10346 return false; 10347 } 10348 10349 PredicatedScalarEvolution PSE(*SE, *L); 10350 10351 // Check if it is legal to vectorize the loop. 10352 LoopVectorizationRequirements Requirements; 10353 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10354 &Requirements, &Hints, DB, AC, BFI, PSI); 10355 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10356 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10357 Hints.emitRemarkWithHints(); 10358 return false; 10359 } 10360 10361 // Check the function attributes and profiles to find out if this function 10362 // should be optimized for size. 10363 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10364 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10365 10366 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10367 // here. They may require CFG and instruction level transformations before 10368 // even evaluating whether vectorization is profitable. Since we cannot modify 10369 // the incoming IR, we need to build VPlan upfront in the vectorization 10370 // pipeline. 
10371 if (!L->isInnermost()) 10372 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10373 ORE, BFI, PSI, Hints, Requirements); 10374 10375 assert(L->isInnermost() && "Inner loop expected."); 10376 10377 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10378 // count by optimizing for size, to minimize overheads. 10379 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10380 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10381 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 10382 << "This loop is worth vectorizing only if no scalar " 10383 << "iteration overheads are incurred."); 10384 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10385 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10386 else { 10387 LLVM_DEBUG(dbgs() << "\n"); 10388 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10389 } 10390 } 10391 10392 // Check the function attributes to see if implicit floats are allowed. 10393 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10394 // an integer loop and the vector instructions selected are purely integer 10395 // vector instructions? 10396 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10397 reportVectorizationFailure( 10398 "Can't vectorize when the NoImplicitFloat attribute is used", 10399 "loop not vectorized due to NoImplicitFloat attribute", 10400 "NoImplicitFloat", ORE, L); 10401 Hints.emitRemarkWithHints(); 10402 return false; 10403 } 10404 10405 // Check if the target supports potentially unsafe FP vectorization. 10406 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10407 // for the target we're vectorizing for, to make sure none of the 10408 // additional fp-math flags can help. 10409 if (Hints.isPotentiallyUnsafe() && 10410 TTI->isFPVectorizationPotentiallyUnsafe()) { 10411 reportVectorizationFailure( 10412 "Potentially unsafe FP op prevents vectorization", 10413 "loop not vectorized due to unsafe FP support.", 10414 "UnsafeFP", ORE, L); 10415 Hints.emitRemarkWithHints(); 10416 return false; 10417 } 10418 10419 bool AllowOrderedReductions; 10420 // If the flag is set, use that instead and override the TTI behaviour. 10421 if (ForceOrderedReductions.getNumOccurrences() > 0) 10422 AllowOrderedReductions = ForceOrderedReductions; 10423 else 10424 AllowOrderedReductions = TTI->enableOrderedReductions(); 10425 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10426 ORE->emit([&]() { 10427 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10428 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10429 ExactFPMathInst->getDebugLoc(), 10430 ExactFPMathInst->getParent()) 10431 << "loop not vectorized: cannot prove it is safe to reorder " 10432 "floating-point operations"; 10433 }); 10434 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10435 "reorder floating-point operations\n"); 10436 Hints.emitRemarkWithHints(); 10437 return false; 10438 } 10439 10440 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10441 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10442 10443 // If an override option has been passed in for interleaved accesses, use it. 10444 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10445 UseInterleaved = EnableInterleavedMemAccesses; 10446 10447 // Analyze interleaved memory accesses. 
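  // The argument below controls whether masked (predicated) interleave groups
  // may be formed for accesses that require a mask.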
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it was explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
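    // Emit a missed-optimization remark for each of the two decisions.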
10528 ORE->emit([&]() { 10529 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10530 L->getStartLoc(), L->getHeader()) 10531 << VecDiagMsg.second; 10532 }); 10533 ORE->emit([&]() { 10534 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10535 L->getStartLoc(), L->getHeader()) 10536 << IntDiagMsg.second; 10537 }); 10538 return false; 10539 } else if (!VectorizeLoop && InterleaveLoop) { 10540 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10541 ORE->emit([&]() { 10542 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10543 L->getStartLoc(), L->getHeader()) 10544 << VecDiagMsg.second; 10545 }); 10546 } else if (VectorizeLoop && !InterleaveLoop) { 10547 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10548 << ") in " << DebugLocStr << '\n'); 10549 ORE->emit([&]() { 10550 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10551 L->getStartLoc(), L->getHeader()) 10552 << IntDiagMsg.second; 10553 }); 10554 } else if (VectorizeLoop && InterleaveLoop) { 10555 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10556 << ") in " << DebugLocStr << '\n'); 10557 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10558 } 10559 10560 bool DisableRuntimeUnroll = false; 10561 MDNode *OrigLoopID = L->getLoopID(); 10562 { 10563 // Optimistically generate runtime checks. Drop them if they turn out to not 10564 // be profitable. Limit the scope of Checks, so the cleanup happens 10565 // immediately after vector codegeneration is done. 10566 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10567 F->getParent()->getDataLayout()); 10568 if (!VF.Width.isScalar() || IC > 1) 10569 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10570 10571 using namespace ore; 10572 if (!VectorizeLoop) { 10573 assert(IC > 1 && "interleave count should not be 1 or 0"); 10574 // If we decided that it is not legal to vectorize the loop, then 10575 // interleave it. 10576 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10577 &CM, BFI, PSI, Checks); 10578 10579 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10580 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10581 10582 ORE->emit([&]() { 10583 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10584 L->getHeader()) 10585 << "interleaved loop (interleaved count: " 10586 << NV("InterleaveCount", IC) << ")"; 10587 }); 10588 } else { 10589 // If we decided that it is *legal* to vectorize the loop, then do it. 10590 10591 // Consider vectorizing the epilogue too if it's profitable. 10592 VectorizationFactor EpilogueVF = 10593 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10594 if (EpilogueVF.Width.isVector()) { 10595 10596 // The first pass vectorizes the main loop and creates a scalar epilogue 10597 // to be vectorized by executing the plan (potentially with a different 10598 // factor) again shortly afterwards. 10599 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10600 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10601 EPI, &LVL, &CM, BFI, PSI, Checks); 10602 10603 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10604 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10605 DT); 10606 ++LoopsVectorized; 10607 10608 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10609 formLCSSARecursively(*L, *DT, LI, SE); 10610 10611 // Second pass vectorizes the epilogue and adjusts the control flow 10612 // edges from the first pass. 
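        // Reuse EPI for the second pass, with the epilogue VF/UF installed as
        // the main-loop factors.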
10613 EPI.MainLoopVF = EPI.EpilogueVF; 10614 EPI.MainLoopUF = EPI.EpilogueUF; 10615 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10616 ORE, EPI, &LVL, &CM, BFI, PSI, 10617 Checks); 10618 10619 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10620 10621 // Ensure that the start values for any VPReductionPHIRecipes are 10622 // updated before vectorising the epilogue loop. 10623 VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock(); 10624 for (VPRecipeBase &R : Header->phis()) { 10625 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10626 if (auto *Resume = MainILV.getReductionResumeValue( 10627 ReductionPhi->getRecurrenceDescriptor())) { 10628 VPValue *StartVal = new VPValue(Resume); 10629 BestEpiPlan.addExternalDef(StartVal); 10630 ReductionPhi->setOperand(0, StartVal); 10631 } 10632 } 10633 } 10634 10635 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10636 DT); 10637 ++LoopsEpilogueVectorized; 10638 10639 if (!MainILV.areSafetyChecksAdded()) 10640 DisableRuntimeUnroll = true; 10641 } else { 10642 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10643 &LVL, &CM, BFI, PSI, Checks); 10644 10645 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10646 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10647 ++LoopsVectorized; 10648 10649 // Add metadata to disable runtime unrolling a scalar loop when there 10650 // are no runtime checks about strides and memory. A scalar loop that is 10651 // rarely used is not worth unrolling. 10652 if (!LB.areSafetyChecksAdded()) 10653 DisableRuntimeUnroll = true; 10654 } 10655 // Report the vectorization decision. 10656 ORE->emit([&]() { 10657 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10658 L->getHeader()) 10659 << "vectorized loop (vectorization width: " 10660 << NV("VectorizationFactor", VF.Width) 10661 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10662 }); 10663 } 10664 10665 if (ORE->allowExtraAnalysis(LV_NAME)) 10666 checkMixedPrecision(L, ORE); 10667 } 10668 10669 Optional<MDNode *> RemainderLoopID = 10670 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10671 LLVMLoopVectorizeFollowupEpilogue}); 10672 if (RemainderLoopID.hasValue()) { 10673 L->setLoopID(RemainderLoopID.getValue()); 10674 } else { 10675 if (DisableRuntimeUnroll) 10676 AddRuntimeUnrollDisableMetaData(L); 10677 10678 // Mark the loop as already vectorized to avoid vectorizing again. 10679 Hints.setAlreadyVectorized(); 10680 } 10681 10682 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10683 return true; 10684 } 10685 10686 LoopVectorizeResult LoopVectorizePass::runImpl( 10687 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10688 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10689 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10690 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10691 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10692 SE = &SE_; 10693 LI = &LI_; 10694 TTI = &TTI_; 10695 DT = &DT_; 10696 BFI = &BFI_; 10697 TLI = TLI_; 10698 AA = &AA_; 10699 AC = &AC_; 10700 GetLAA = &GetLAA_; 10701 DB = &DB_; 10702 ORE = &ORE_; 10703 PSI = PSI_; 10704 10705 // Don't attempt if 10706 // 1. the target claims to have no vector registers, and 10707 // 2. interleaving won't help ILP. 
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << "<";
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << ">";
}